My Project
Loading...
Searching...
No Matches
Ewald_Energy_Functions.h
1#include <omp.h>
2#include "ewald_preparation.h"
3inline void checkCUDAErrorEwald(const char *msg)
4{
5 cudaError_t err = cudaGetLastError();
6 if( cudaSuccess != err)
7 {
8 printf("CUDA Error: %s: %s.\n", msg, cudaGetErrorString(err) );
9 exit(EXIT_FAILURE);
10 }
11}
12/*
13static inline void Setup_threadblock_EW(size_t arraysize, size_t *Nblock, size_t *Nthread)
14{
15 size_t value = arraysize;
16 if(value >= 128) value = 128;
17 double ratio = (double)arraysize/value;
18 size_t blockValue = ceil(ratio);
19 if(blockValue == 0) blockValue++;
20 *Nthread = value;
21 *Nblock = blockValue;
22}
23*/
24
26// General Functions for User-defined Complex Variables //
28__device__ double ComplexNorm(Complex a)
29{
30 return a.real * a.real + a.imag * a.imag;
31}
32
33__device__ Complex multiply(Complex a, Complex b) //a*b = c for complex numbers//
34{
35 Complex c;
36 c.real = a.real*b.real - a.imag*b.imag;
37 c.imag = a.real*b.imag + a.imag*b.real;
38 return c;
39}
40
41__device__ void Initialize_Vectors(Boxsize Box, size_t Oldsize, size_t Newsize, Atoms Old, size_t numberOfAtoms, int3 kmax)
42{
43 int kx_max = kmax.x;
44 int ky_max = kmax.y;
45 int kz_max = kmax.z;
46 //Old//
47 Complex tempcomplex; tempcomplex.real = 1.0; tempcomplex.imag = 0.0;
48 for(size_t posi=0; posi < Oldsize; ++posi)
49 {
50 tempcomplex.real = 1.0; tempcomplex.imag = 0.0;
51 double3 pos = Old.pos[posi];
52 Box.eik_x[posi + 0 * numberOfAtoms] = tempcomplex;
53 Box.eik_y[posi + 0 * numberOfAtoms] = tempcomplex;
54 Box.eik_z[posi + 0 * numberOfAtoms] = tempcomplex;
55 double3 s; matrix_multiply_by_vector(Box.InverseCell, pos, s); s*=2*M_PI;
56 tempcomplex.real = std::cos(s.x); tempcomplex.imag = std::sin(s.x); Box.eik_x[posi + 1 * numberOfAtoms] = tempcomplex;
57 tempcomplex.real = std::cos(s.y); tempcomplex.imag = std::sin(s.y); Box.eik_y[posi + 1 * numberOfAtoms] = tempcomplex;
58 tempcomplex.real = std::cos(s.z); tempcomplex.imag = std::sin(s.z); Box.eik_z[posi + 1 * numberOfAtoms] = tempcomplex;
59 }
60 //New//
61 for(size_t posi=Oldsize; posi < Oldsize + Newsize; ++posi)
62 {
63 tempcomplex.real = 1.0; tempcomplex.imag = 0.0;
64 double3 pos = Old.pos[posi];
65 Box.eik_x[posi + 0 * numberOfAtoms] = tempcomplex;
66 Box.eik_y[posi + 0 * numberOfAtoms] = tempcomplex;
67 Box.eik_z[posi + 0 * numberOfAtoms] = tempcomplex;
68 double3 s ; matrix_multiply_by_vector(Box.InverseCell, pos, s); s*=2*M_PI;
69 tempcomplex.real = std::cos(s.x); tempcomplex.imag = std::sin(s.x); Box.eik_x[posi + 1 * numberOfAtoms] = tempcomplex;
70 tempcomplex.real = std::cos(s.y); tempcomplex.imag = std::sin(s.y); Box.eik_y[posi + 1 * numberOfAtoms] = tempcomplex;
71 tempcomplex.real = std::cos(s.z); tempcomplex.imag = std::sin(s.z); Box.eik_z[posi + 1 * numberOfAtoms] = tempcomplex;
72 }
73 // Calculate remaining positive kx, ky and kz by recurrence
74 for(size_t kx = 2; kx <= kx_max; ++kx)
75 {
76 for(size_t i = 0; i != numberOfAtoms; ++i)
77 {
78 Box.eik_x[i + kx * numberOfAtoms] = multiply(Box.eik_x[i + (kx - 1) * numberOfAtoms], Box.eik_x[i + 1 * numberOfAtoms]);
79 }
80 }
81 for(size_t ky = 2; ky <= ky_max; ++ky)
82 {
83 for(size_t i = 0; i != numberOfAtoms; ++i)
84 {
85 Box.eik_y[i + ky * numberOfAtoms] = multiply(Box.eik_y[i + (ky - 1) * numberOfAtoms], Box.eik_y[i + 1 * numberOfAtoms]);
86 }
87 }
88 for(size_t kz = 2; kz <= kz_max; ++kz)
89 {
90 for(size_t i = 0; i != numberOfAtoms; ++i)
91 {
92 Box.eik_z[i + kz * numberOfAtoms] = multiply(Box.eik_z[i + (kz - 1) * numberOfAtoms], Box.eik_z[i + 1 * numberOfAtoms]);
93 }
94 }
95}
96
97__global__ void Initialize_EwaldVector_General(Boxsize Box, int3 kmax, Atoms* d_a, Atoms New, Atoms Old, size_t Oldsize, size_t Newsize, size_t SelectedComponent, size_t Location, size_t chainsize, size_t numberOfAtoms, int MoveType)
98{
99 //Zhao's note: need to think about changing this boolean to switch//
100 if(MoveType == TRANSLATION || MoveType == ROTATION || MoveType == SPECIAL_ROTATION || MoveType == SINGLE_INSERTION || MoveType == SINGLE_DELETION) // Translation/Rotation/single_insertion/single_deletion //
101 {
102 //For Translation/Rotation, the Old positions are already in the Old struct, just need to put the New positions into Old, after the Old positions//
103 for(size_t i = Oldsize; i < Oldsize + Newsize; i++) //chainsize here is the total size of the molecule for translation/rotation
104 {
105 Old.pos[i] = New.pos[i - Oldsize];
106 Old.scale[i] = New.scale[i - Oldsize];
107 Old.charge[i] = New.charge[i - Oldsize];
108 Old.scaleCoul[i] = New.scaleCoul[i - Oldsize];
109 }
110 }
111 else if(MoveType == INSERTION || MoveType == CBCF_INSERTION) // Insertion & Fractional Insertion //
112 {
113 //Put the trial orientations in New to Old, right after the first bead position//
114 for(size_t i = 0; i < chainsize; i++)
115 {
116 Old.pos[i + 1] = New.pos[Location * chainsize + i];
117 Old.scale[i + 1] = New.scale[Location * chainsize + i];
118 Old.charge[i + 1] = New.charge[Location * chainsize + i];
119 Old.scaleCoul[i + 1] = New.scaleCoul[Location * chainsize + i];
120 }
121 }
122 else if(MoveType == DELETION || MoveType == CBCF_DELETION) // Deletion //
123 {
124 for(size_t i = 0; i < Oldsize; i++)
125 {
126 // For deletion, Location = UpdateLocation, see Deletion Move //
127 Old.pos[i] = d_a[SelectedComponent].pos[Location + i];
128 Old.scale[i] = d_a[SelectedComponent].scale[Location + i];
129 Old.charge[i] = d_a[SelectedComponent].charge[Location + i];
130 Old.scaleCoul[i] = d_a[SelectedComponent].scaleCoul[Location + i];
131 }
132 }
133 Initialize_Vectors(Box, Oldsize, Newsize, Old, numberOfAtoms, kmax);
134}
135
136__global__ void Initialize_EwaldVector_LambdaChange(Boxsize Box, int3 kmax, Atoms* d_a, Atoms Old, size_t Oldsize, double2 newScale)
137{
138 Initialize_Vectors(Box, Oldsize, 0, Old, Oldsize, kmax);
139}
140
141__global__ void Initialize_EwaldVector_Reinsertion(Boxsize Box, int3 kmax, double3* temp, Atoms* d_a, Atoms Old, size_t Oldsize, size_t Newsize, size_t realpos, size_t numberOfAtoms, size_t SelectedComponent)
142{
143 for(size_t i = 0; i < Oldsize; i++)
144 {
145 Old.pos[i] = d_a[SelectedComponent].pos[realpos + i];
146 Old.scale[i] = d_a[SelectedComponent].scale[realpos + i];
147 Old.charge[i] = d_a[SelectedComponent].charge[realpos + i];
148 Old.scaleCoul[i] = d_a[SelectedComponent].scaleCoul[realpos + i];
149 }
150 //Reinsertion New Positions stored in three arrays, other data are the same as the Old molecule information in d_a//
151 for(size_t i = Oldsize; i < Oldsize + Newsize; i++) //chainsize here is the total size of the molecule for translation/rotation
152 {
153 Old.pos[i] = temp[i - Oldsize];
154 Old.scale[i] = d_a[SelectedComponent].scale[realpos + i - Oldsize];
155 Old.charge[i] = d_a[SelectedComponent].charge[realpos + i - Oldsize];
156 Old.scaleCoul[i] = d_a[SelectedComponent].scaleCoul[realpos + i - Oldsize];
157 }
158 Initialize_Vectors(Box, Oldsize, Newsize, Old, numberOfAtoms, kmax);
159}
160
161__global__ void Initialize_EwaldVector_IdentitySwap(Boxsize Box, int3 kmax, double3* temp, Atoms* d_a, Atoms Old, size_t Oldsize, size_t Newsize, size_t realpos, size_t numberOfAtoms, size_t OLDComponent, size_t NEWComponent)
162{
163 for(size_t i = 0; i < Oldsize; i++)
164 {
165 Old.pos[i] = d_a[OLDComponent].pos[realpos + i];
166 Old.scale[i] = d_a[OLDComponent].scale[realpos + i];
167 Old.charge[i] = d_a[OLDComponent].charge[realpos + i];
168 Old.scaleCoul[i] = d_a[OLDComponent].scaleCoul[realpos + i];
169 }
170 //IdentitySwap New Positions stored in three arrays, other data are the same as the Old molecule information in d_a//
171 //Zhao's note: assuming not performing identity swap on fractional molecules//
172 for(size_t i = Oldsize; i < Oldsize + Newsize; i++) //chainsize here is the total size of the molecule for translation/rotation
173 {
174 Old.pos[i] = temp[i - Oldsize];
175 Old.scale[i] = 1.0;
176 Old.charge[i] = d_a[NEWComponent].charge[i - Oldsize];
177 Old.scaleCoul[i] = 1.0;
178 }
179 Initialize_Vectors(Box, Oldsize, Newsize, Old, numberOfAtoms, kmax);
180}
181
182__global__ void JustStore_Ewald(Boxsize Box, size_t nvec)
183{
184 size_t i = blockIdx.x * blockDim.x + threadIdx.x;
185 if(i < nvec) Box.tempEik[i] = Box.AdsorbateEik[i];
186}
187
189// CALCULATE FOURIER PART OF THE COULOMBIC ENERGY FOR A MOVE //
191
192__global__ void Fourier_Ewald_Diff(Boxsize Box, Complex* SameTypeEik, Complex* CrossTypeEik, Atoms Old, double alpha_squared, double prefactor, int3 kmax, size_t Oldsize, size_t Newsize, double* Blocksum, bool UseTempVector, size_t Nblock)
193{
194 //Zhao's note: provide an additional Nblock to distinguish Host-Guest and Guest-Guest Ewald//
195 //Guest-Guest is the first half, Host-Guest is the second//
196 extern __shared__ double sdata[]; //shared memory for partial sum//
197 size_t kxyz = blockIdx.x * blockDim.x + threadIdx.x;
198 int cache_id = threadIdx.x;
199 size_t i_within_block = kxyz - blockIdx.x * blockDim.x; //for recording the position of the thread within a block
200 double tempE = 0.0;
201 size_t kx_max = kmax.x;
202 size_t ky_max = kmax.y;
203 size_t kz_max = kmax.z;
204 size_t nvec = (kx_max + 1) * (2 * ky_max + 1) * (2 * kz_max + 1);
205
206 if(blockIdx.x >= Nblock) kxyz -= blockDim.x * Nblock; //If Host-Guest Interaction, adjust kxyz//
207 if(kxyz < nvec)
208 {
209 //Box.tempEik[kxyz] = Box.AdsorbateEik[kxyz];
210 sdata[i_within_block] = 0.0;
211 int kz = kxyz%(2 * kz_max + 1) - kz_max;
212 int kxy = kxyz/(2 * kz_max + 1);
213 int kx = kxy /(2 * ky_max + 1);
214 int ky = kxy %(2 * ky_max + 1) - ky_max;
215 double ksqr = static_cast<double>(kx * kx + ky * ky + kz * kz);
216 if(Box.UseLAMMPSEwald) //Overwrite ksqr if we use the LAMMPS Setup for Ewald//
217 {
218 const double lx = Box.Cell[0];
219 const double ly = Box.Cell[4];
220 const double lz = Box.Cell[8];
221 const double xy = Box.Cell[3];
222 const double xz = Box.Cell[6];
223 const double yz = Box.Cell[7];
224 const double ux = 2*M_PI/lx;
225 const double uy = 2*M_PI*(-xy)/lx/ly;
226 const double uz = 2*M_PI*(xy*yz - ly*xz)/lx/ly/lz;
227 const double vy = 2*M_PI/ly;
228 const double vz = 2*M_PI*(-yz)/ly/lz;
229 const double wz = 2*M_PI/lz;
230 const double kvecx = kx*ux;
231 const double kvecy = kx*uy + ky*vy;
232 const double kvecz = kx*uz + ky*vz + kz*wz;
233 ksqr = kvecx*kvecx + kvecy*kvecy + kvecz*kvecz;
234 }
235 if((ksqr > 1e-10) && (ksqr < Box.ReciprocalCutOff))
236 {
237 double3 ax; ax.x = Box.InverseCell[0]; ax.y = Box.InverseCell[3]; ax.z = Box.InverseCell[6];
238 double3 ay; ay.x = Box.InverseCell[1]; ay.y = Box.InverseCell[4]; ay.z = Box.InverseCell[7];
239 double3 az; az.x = Box.InverseCell[2]; az.y = Box.InverseCell[5]; az.z = Box.InverseCell[8];
240 size_t numberOfAtoms = Oldsize + Newsize;
241 Complex cksum_old; cksum_old.real = 0.0; cksum_old.imag = 0.0;
242 Complex cksum_new; cksum_new.real = 0.0; cksum_new.imag = 0.0;
243 double3 kvec_x = ax * 2.0 * M_PI * (double) kx;
244 double3 kvec_y = ay * 2.0 * M_PI * (double) ky;
245 double3 kvec_z = az * 2.0 * M_PI * (double) kz;
246 double factor = (kx == 0) ? (1.0 * prefactor) : (2.0 * prefactor);
247 //OLD//
248 for(size_t i=0; i<Oldsize + Newsize; ++i)
249 {
250 Complex eik_temp = Box.eik_y[i + numberOfAtoms * static_cast<size_t>(std::abs(ky))];
251 eik_temp.imag = ky>=0 ? eik_temp.imag : -eik_temp.imag;
252 Complex eik_xy = multiply(Box.eik_x[i + numberOfAtoms * static_cast<size_t>(kx)], eik_temp);
253
254 eik_temp = Box.eik_z[i + numberOfAtoms * static_cast<size_t>(std::abs(kz))];
255 eik_temp.imag = kz>=0 ? eik_temp.imag : -eik_temp.imag;
256 double charge = Old.charge[i];
257 double scaling = Old.scaleCoul[i];
258 Complex tempi = multiply(eik_xy, eik_temp);
259 if(i < Oldsize)
260 {
261 cksum_old.real += scaling * charge * tempi.real;
262 cksum_old.imag += scaling * charge * tempi.imag;
263 }
264 else
265 {
266 cksum_new.real += scaling * charge * tempi.real;
267 cksum_new.imag += scaling * charge * tempi.imag;
268 }
269 }
270 double3 tempkvec = kvec_x + kvec_y + kvec_z;
271 double rksq = dot(tempkvec, tempkvec);
272 double temp = factor * std::exp((-0.25 / alpha_squared) * rksq) / rksq;
273 Complex newV; Complex OldV;
274 //Zhao's note: this is for CBCF insertion, where insertion is the second step. The intermediate Eik is tempEik, is the one we should use//
275 if(blockIdx.x < Nblock) //Same Type, do the normal Ewald//
276 {
277 if(UseTempVector)
278 {
279 OldV.real = Box.tempEik[kxyz].real; OldV.imag = Box.tempEik[kxyz].imag;
280 }
281 else
282 {
283 OldV.real = SameTypeEik[kxyz].real; OldV.imag = SameTypeEik[kxyz].imag;
284 }
285 newV.real = OldV.real + cksum_new.real - cksum_old.real;
286 newV.imag = OldV.imag + cksum_new.imag - cksum_old.imag;
287 tempE += temp * ComplexNorm(newV);
288 tempE -= temp * ComplexNorm(OldV);
289 Box.tempEik[kxyz] = newV; //Guest-Guest, do the normal Ewald, update the wave vector//
290 }
291 else //Host-Guest//
292 {
293 OldV.real = CrossTypeEik[kxyz].real; OldV.imag = CrossTypeEik[kxyz].imag;
294 tempE += temp * (OldV.real * (cksum_new.real - cksum_old.real) + OldV.imag * (cksum_new.imag - cksum_old.imag));
295 }
296 }
297 }
298 sdata[i_within_block] = tempE;
299 __syncthreads();
300 //Partial block sum//
301 int i=blockDim.x / 2;
302 while(i != 0)
303 {
304 if(cache_id < i) {sdata[cache_id] += sdata[cache_id + i];}
305 __syncthreads();
306 i /= 2;
307 }
308 if(cache_id == 0) {Blocksum[blockIdx.x] = sdata[0];}
309}
310
311void Skip_Ewald(Boxsize& Box)
312{
313 size_t numberOfWaveVectors = (Box.kmax.x + 1) * (2 * Box.kmax.y + 1) * (2 * Box.kmax.z + 1);
314 size_t Nblock = 0; size_t Nthread = 0; Setup_threadblock(numberOfWaveVectors, &Nblock, &Nthread);
315 JustStore_Ewald<<<Nblock, Nthread>>>(Box, numberOfWaveVectors);
316}
317
318__global__ void Update_Ewald_Stored(Complex* Eik, Complex* Temp_Eik, size_t nvec)
319{
320 size_t i = blockIdx.x * blockDim.x + threadIdx.x;
321 if(i < nvec) Eik[i] = Temp_Eik[i];
322}
323void Update_Ewald_Vector(Boxsize& Box, bool CPU, Components& SystemComponents, size_t SelectedComponent)
324{
325 //else //Update on the GPU//
326 //{
327 size_t numberOfWaveVectors = (Box.kmax.x + 1) * (2 * Box.kmax.y + 1) * (2 * Box.kmax.z + 1);
328 size_t Nblock = 0; size_t Nthread = 0; Setup_threadblock(numberOfWaveVectors, &Nblock, &Nthread);
329 Complex* Stored_Eik;
330 if(SelectedComponent < SystemComponents.NComponents.y)
331 {
332 Stored_Eik = Box.FrameworkEik;
333 }
334 else
335 {
336 Stored_Eik = Box.AdsorbateEik;
337 }
338 Update_Ewald_Stored<<<Nblock, Nthread>>>(Stored_Eik, Box.tempEik, numberOfWaveVectors);
339 //}
340}
341
343// Main Ewald Functions (Fourier + Exclusion) //
345double2 GPU_EwaldDifference_General(Boxsize& Box, Atoms*& d_a, Atoms& New, Atoms& Old, ForceField& FF, double* Blocksum, Components& SystemComponents, size_t SelectedComponent, int MoveType, size_t Location, double2 Scale)
346{
347 if(FF.noCharges && !SystemComponents.hasPartialCharge[SelectedComponent]) return {0.0, 0.0};
348 //cudaDeviceSynchronize();
349 double start = omp_get_wtime();
350 double alpha = Box.Alpha; double alpha_squared = alpha * alpha;
351 double prefactor = Box.Prefactor * (2.0 * M_PI / Box.Volume);
352
353 size_t Oldsize = 0; size_t Newsize = 0; size_t chainsize = 0;
354 bool UseTempVector = false; //Zhao's note: Whether or not to use the temporary Vectors (Only used for CBCF Insertion in this function)//
355
356 //If framework molecules are moved, sameType = Framework-Framework, CrossType = Framework-Adsorbate//
357 //If adsorbate Molecules are moved, sameType = Adsorbate-Adsorbate, CrossType = Framework-Adsorbate//
358 Complex* SameType; Complex* CrossType;
359
360 if(SelectedComponent < SystemComponents.NComponents.y)
361 {
362 SameType = Box.FrameworkEik; CrossType = Box.AdsorbateEik;
363 }
364 else
365 {
366 SameType = Box.AdsorbateEik; CrossType = Box.FrameworkEik;
367 }
368 switch(MoveType)
369 {
370 case TRANSLATION: case ROTATION: case SPECIAL_ROTATION: // Translation/Rotation Move //
371 {
372 Oldsize = SystemComponents.Moleculesize[SelectedComponent];
373 Newsize = SystemComponents.Moleculesize[SelectedComponent];
374 chainsize = SystemComponents.Moleculesize[SelectedComponent];
375 break;
376 }
377 case INSERTION: case SINGLE_INSERTION: // Insertion //
378 {
379 Oldsize = 0;
380 Newsize = SystemComponents.Moleculesize[SelectedComponent];
381 chainsize = SystemComponents.Moleculesize[SelectedComponent] - 1;
382 break;
383 }
384 case DELETION: case SINGLE_DELETION: // Deletion //
385 {
386 Oldsize = SystemComponents.Moleculesize[SelectedComponent];
387 Newsize = 0;
388 chainsize = SystemComponents.Moleculesize[SelectedComponent] - 1;
389 break;
390 }
391 case REINSERTION: // Reinsertion //
392 {
393 throw std::runtime_error("Use the Special Function for Reinsertion");
394 //break;
395 }
396 case IDENTITY_SWAP:
397 {
398 throw std::runtime_error("Use the Special Function for IDENTITY SWAP!");
399 }
400 case CBCF_LAMBDACHANGE: // CBCF Lambda Change //
401 {
402 throw std::runtime_error("Use the Special Function for CBCF Lambda Change");
403 //Oldsize = SystemComponents.Moleculesize[SelectedComponent];
404 //Newsize = SystemComponents.Moleculesize[SelectedComponent];
405 //chainsize = SystemComponents.Moleculesize[SelectedComponent] - 1;
406 //break;
407 }
408 case CBCF_INSERTION: // CBCF Lambda Insertion //
409 {
410 Oldsize = 0;
411 Newsize = SystemComponents.Moleculesize[SelectedComponent];
412 chainsize = SystemComponents.Moleculesize[SelectedComponent] - 1;
413 UseTempVector = true;
414 break;
415 }
416 case CBCF_DELETION: // CBCF Lambda Deletion //
417 {
418 Oldsize = SystemComponents.Moleculesize[SelectedComponent];
419 Newsize = 0;
420 chainsize = SystemComponents.Moleculesize[SelectedComponent] - 1;
421 break;
422 }
423 }
424 size_t numberOfAtoms = Oldsize + Newsize;
425
426 Initialize_EwaldVector_General<<<1,1>>>(Box, Box.kmax, d_a, New, Old, Oldsize, Newsize, SelectedComponent, Location, chainsize, numberOfAtoms, MoveType); checkCUDAErrorEwald("error Initializing Ewald Vectors");
427
428 //Fourier Loop//
429 size_t numberOfWaveVectors = (Box.kmax.x + 1) * (2 * Box.kmax.y + 1) * (2 * Box.kmax.z + 1);
430 size_t Nblock = 0; size_t Nthread = 0; Setup_threadblock(numberOfWaveVectors, &Nblock, &Nthread);
431
432 //If we separate Host-Guest from Guest-Guest, we can double the Nblock, so the first half does Guest-Guest, and the second half does Host-Guest//
433 Fourier_Ewald_Diff<<<Nblock * 2, Nthread, Nthread * sizeof(double)>>>(Box, SameType, CrossType, Old, alpha_squared, prefactor, Box.kmax, Oldsize, Newsize, Blocksum, UseTempVector, Nblock);
434
435 double sum[Nblock * 2]; double SameSum = 0.0; double CrossSum = 0.0;
436 cudaMemcpy(sum, Blocksum, 2 * Nblock * sizeof(double), cudaMemcpyDeviceToHost); //HG + GG Energies//
437 for(size_t i = 0; i < Nblock; i++){SameSum += sum[i];}
438 for(size_t i = Nblock; i < 2 * Nblock; i++){CrossSum += sum[i];}
439 //Zhao's note: when adding fractional molecules, this might not be correct//
440 double deltaExclusion = 0.0;
441
442 if(SystemComponents.CURRENTCYCLE == 4)
443 {
444 printf("Number of blocks: %zu, Nthread: %zu\n", Nblock, Nthread);
445 printf("GPU Fourier Energy (SameType): %.5f, (CrossType) %.5f\n", SameSum, 2.0 * CrossSum);
446 }
447
448 if(SystemComponents.rigid[SelectedComponent])
449 {
450 if(MoveType == INSERTION || MoveType == SINGLE_INSERTION) // Insertion //
451 {
452 //Zhao's note: This is a bit messy, because when creating the molecules at the beginning of the simulation, we need to create a fractional molecule//
453 //MoveType is 2, not 4. 4 is for the insertion after making the old fractional molecule full.//
454 double delta_scale = std::pow(Scale.y,2) - 0.0;
455 deltaExclusion = (SystemComponents.ExclusionIntra[SelectedComponent] + SystemComponents.ExclusionAtom[SelectedComponent]) * delta_scale;
456 //if(SystemComponents.CURRENTCYCLE == 16) printf("Exclusion energy: %.5f, delta_scale: %.5f, true_val: %.5f\n", deltaExclusion, delta_scale, (SystemComponents.ExclusionIntra[SelectedComponent] + SystemComponents.ExclusionAtom[SelectedComponent]));
457 SameSum -= deltaExclusion;
458 }
459 else if(MoveType == DELETION || MoveType == SINGLE_DELETION) // Deletion //
460 {
461 double delta_scale = 0.0 - 1.0;
462 deltaExclusion = (SystemComponents.ExclusionIntra[SelectedComponent] + SystemComponents.ExclusionAtom[SelectedComponent]) * delta_scale;
463 SameSum -= deltaExclusion;
464 }
465 else if(MoveType == CBCF_INSERTION) // CBCF Lambda Insertion //
466 {
467 double delta_scale = std::pow(Scale.y,2) - 0.0;
468 deltaExclusion = (SystemComponents.ExclusionIntra[SelectedComponent] + SystemComponents.ExclusionAtom[SelectedComponent]) * delta_scale;
469 SameSum -= deltaExclusion;
470 }
471 else if(MoveType == CBCF_DELETION) // CBCF Lambda Deletion //
472 {
473 double delta_scale = 0.0 - std::pow(Scale.y,2);
474 deltaExclusion = (SystemComponents.ExclusionIntra[SelectedComponent] + SystemComponents.ExclusionAtom[SelectedComponent]) * delta_scale;
475 SameSum -= deltaExclusion;
476 }
477 }
478 //cudaDeviceSynchronize();
479 double end = omp_get_wtime();
480 return {SameSum, 2.0 * CrossSum};
481}
482
483//Zhao's note: THIS IS A SPECIAL FUNCTION JUST FOR REINSERTION//
484double2 GPU_EwaldDifference_Reinsertion(Boxsize& Box, Atoms*& d_a, Atoms& Old, double3* temp, ForceField& FF, double* Blocksum, Components& SystemComponents, size_t SelectedComponent, size_t UpdateLocation)
485{
486 if(FF.noCharges && !SystemComponents.hasPartialCharge[SelectedComponent]) return {0.0, 0.0};
487 double alpha = Box.Alpha; double alpha_squared = alpha * alpha;
488 double prefactor = Box.Prefactor * (2.0 * M_PI / Box.Volume);
489
490 size_t numberOfAtoms = SystemComponents.Moleculesize[SelectedComponent];
491 size_t Oldsize = 0; size_t Newsize = numberOfAtoms;
492 //Zhao's note: translation/rotation/reinsertion involves new + old states. Insertion/Deletion only has the new state.
493 Oldsize = SystemComponents.Moleculesize[SelectedComponent];
494 numberOfAtoms += Oldsize;
495
496 Complex* SameType = Box.AdsorbateEik; Complex* CrossType = Box.FrameworkEik;
497
498 // Construct exp(ik.r) for atoms and k-vectors kx, ky, kz = 0, 1 explicitly
499 Initialize_EwaldVector_Reinsertion<<<1,1>>>(Box, Box.kmax, temp, d_a, Old, Oldsize, Newsize, UpdateLocation, numberOfAtoms, SelectedComponent);
500
501 //Fourier Loop//
502 size_t numberOfWaveVectors = (Box.kmax.x + 1) * (2 * Box.kmax.y + 1) * (2 * Box.kmax.z + 1);
503 size_t Nblock = 0; size_t Nthread = 0; Setup_threadblock(numberOfWaveVectors, &Nblock, &Nthread);
504 Fourier_Ewald_Diff<<<Nblock * 2, Nthread, Nthread * sizeof(double)>>>(Box, SameType, CrossType, Old, alpha_squared, prefactor, Box.kmax, Oldsize, Newsize, Blocksum, false, Nblock);
505 double sum[Nblock * 2]; double SameSum = 0.0; double CrossSum = 0.0;
506 cudaMemcpy(sum, Blocksum, 2 * Nblock * sizeof(double), cudaMemcpyDeviceToHost);
507
508 for(size_t i = 0; i < Nblock; i++){SameSum += sum[i];}
509 for(size_t i = Nblock; i < 2 * Nblock; i++){CrossSum += sum[i];}
510
511 return {SameSum, 2.0 * CrossSum};
512}
513
514double2 GPU_EwaldDifference_IdentitySwap(Boxsize& Box, Atoms*& d_a, Atoms& Old, double3* temp, ForceField& FF, double* Blocksum, Components& SystemComponents, size_t OLDComponent, size_t NEWComponent, size_t UpdateLocation)
515{
516 if(FF.noCharges && !SystemComponents.hasPartialCharge[NEWComponent] && !SystemComponents.hasPartialCharge[OLDComponent]) return {0.0, 0.0};
517 double alpha = Box.Alpha; double alpha_squared = alpha * alpha;
518 double prefactor = Box.Prefactor * (2.0 * M_PI / Box.Volume);
519
520 size_t numberOfAtoms = 0;
521 size_t Oldsize = 0; size_t Newsize = 0;
522 if(SystemComponents.hasPartialCharge[OLDComponent])
523 {
524 Oldsize = SystemComponents.Moleculesize[OLDComponent];
525 }
526 if(SystemComponents.hasPartialCharge[NEWComponent])
527 {
528 Newsize = SystemComponents.Moleculesize[NEWComponent];
529 }
530 numberOfAtoms = Oldsize + Newsize;
531
532 // Construct exp(ik.r) for atoms and k-vectors kx, ky, kz = 0, 1 explicitly
533 Initialize_EwaldVector_IdentitySwap<<<1,1>>>(Box, Box.kmax, temp, d_a, Old, Oldsize, Newsize, UpdateLocation, numberOfAtoms, OLDComponent, NEWComponent);
534
535 Complex* SameType = Box.AdsorbateEik; Complex* CrossType = Box.FrameworkEik;
536 //Fourier Loop//
537 size_t numberOfWaveVectors = (Box.kmax.x + 1) * (2 * Box.kmax.y + 1) * (2 * Box.kmax.z + 1);
538 size_t Nblock = 0; size_t Nthread = 0; Setup_threadblock(numberOfWaveVectors, &Nblock, &Nthread);
539 Fourier_Ewald_Diff<<<Nblock * 2, Nthread, Nthread * sizeof(double)>>>(Box, SameType, CrossType, Old, alpha_squared, prefactor, Box.kmax, Oldsize, Newsize, Blocksum, false, Nblock);
540 double sum[Nblock * 2]; double SameSum = 0.0; double CrossSum = 0.0;
541 cudaMemcpy(sum, Blocksum, 2 * Nblock * sizeof(double), cudaMemcpyDeviceToHost);
542
543 for(size_t i = 0; i < Nblock; i++){SameSum += sum[i];}
544 for(size_t i = Nblock; i < 2 * Nblock; i++){CrossSum += sum[i];}
545
546 //Exclusion parts//
547 if(SystemComponents.rigid[NEWComponent] && SystemComponents.hasPartialCharge[NEWComponent])
548 {
549 double delta_scale = 1.0;
550 double deltaExclusion = (SystemComponents.ExclusionIntra[NEWComponent] + SystemComponents.ExclusionAtom[NEWComponent]) * delta_scale;
551 SameSum -= deltaExclusion;
552 }
553 if(SystemComponents.rigid[OLDComponent] && SystemComponents.hasPartialCharge[OLDComponent])
554 {
555 double delta_scale = -1.0;
556 double deltaExclusion = (SystemComponents.ExclusionIntra[OLDComponent] + SystemComponents.ExclusionAtom[OLDComponent]) * delta_scale;
557 SameSum -= deltaExclusion;
558 }
559
560 return {SameSum, 2.0 * CrossSum};
561}
562
564// Zhao's note: Special function for the Ewald for Lambda change of CBCF //
566__global__ void Fourier_Ewald_Diff_LambdaChange(Boxsize Box, Complex* SameTypeEik, Complex* CrossTypeEik, Atoms Old, double alpha_squared, double prefactor, int3 kmax, size_t Oldsize, size_t Newsize, double* Blocksum, bool UseTempVector, size_t Nblock, double newScale)
567{
568 extern __shared__ double sdata[]; //shared memory for partial sum//
569 size_t kxyz = blockIdx.x * blockDim.x + threadIdx.x;
570 int cache_id = threadIdx.x;
571 size_t i_within_block = kxyz - blockIdx.x * blockDim.x; //for recording the position of the thread within a block
572 double tempE = 0.0;
573 size_t kx_max = kmax.x;
574 size_t ky_max = kmax.y;
575 size_t kz_max = kmax.z;
576 size_t nvec = (kx_max + 1) * (2 * ky_max + 1) * (2 * kz_max + 1);
577
578 if(blockIdx.x >= Nblock) kxyz -= blockDim.x * Nblock; //If Host-Guest Interaction, adjust kxyz//
579 if(kxyz < nvec)
580 {
581 sdata[i_within_block] = 0.0;
582 int kz = kxyz%(2 * kz_max + 1) - kz_max;
583 int kxy = kxyz/(2 * kz_max + 1);
584 int kx = kxy /(2 * ky_max + 1);
585 int ky = kxy %(2 * ky_max + 1) - ky_max;
586 double ksqr = static_cast<double>(kx * kx + ky * ky + kz * kz);
587
588 if(Box.UseLAMMPSEwald) //Overwrite ksqr if we use the LAMMPS Setup for Ewald//
589 {
590 const double lx = Box.Cell[0];
591 const double ly = Box.Cell[4];
592 const double lz = Box.Cell[8];
593 const double xy = Box.Cell[3];
594 const double xz = Box.Cell[6];
595 const double yz = Box.Cell[7];
596 const double ux = 2*M_PI/lx;
597 const double uy = 2*M_PI*(-xy)/lx/ly;
598 const double uz = 2*M_PI*(xy*yz - ly*xz)/lx/ly/lz;
599 const double vy = 2*M_PI/ly;
600 const double vz = 2*M_PI*(-yz)/ly/lz;
601 const double wz = 2*M_PI/lz;
602 const double kvecx = kx*ux;
603 const double kvecy = kx*uy + ky*vy;
604 const double kvecz = kx*uz + ky*vz + kz*wz;
605 ksqr = kvecx*kvecx + kvecy*kvecy + kvecz*kvecz;
606 }
607 if((ksqr > 1e-10) && (ksqr < Box.ReciprocalCutOff))
608 {
609 double3 ax = {Box.InverseCell[0], Box.InverseCell[3], Box.InverseCell[6]};
610 double3 ay = {Box.InverseCell[1], Box.InverseCell[4], Box.InverseCell[7]};
611 double3 az = {Box.InverseCell[2], Box.InverseCell[5], Box.InverseCell[8]};
612 size_t numberOfAtoms = Oldsize;
613 Complex cksum_old; cksum_old.real = 0.0; cksum_old.imag = 0.0;
614 Complex cksum_new; cksum_new.real = 0.0; cksum_new.imag = 0.0;
615 double3 kvec_x = ax * 2.0 * M_PI * (double) kx;
616 double3 kvec_y = ay * 2.0 * M_PI * (double) ky;
617 double3 kvec_z = az * 2.0 * M_PI * (double) kz;
618 double factor = (kx == 0) ? (1.0 * prefactor) : (2.0 * prefactor);
619 //OLD, For Lambda Change, there is no change in positions, so just loop over OLD//
620 for(size_t i=0; i<Oldsize; ++i)
621 {
622 Complex eik_temp = Box.eik_y[i + numberOfAtoms * static_cast<size_t>(std::abs(ky))];
623 eik_temp.imag = ky>=0 ? eik_temp.imag : -eik_temp.imag;
624 Complex eik_xy = multiply(Box.eik_x[i + numberOfAtoms * static_cast<size_t>(kx)], eik_temp);
625
626 eik_temp = Box.eik_z[i + numberOfAtoms * static_cast<size_t>(std::abs(kz))];
627 eik_temp.imag = kz>=0 ? eik_temp.imag : -eik_temp.imag;
628 double charge = Old.charge[i];
629 double scaling = Old.scaleCoul[i];
630 Complex tempi = multiply(eik_xy, eik_temp);
631
632 cksum_old.real += scaling * charge * tempi.real;
633 cksum_old.imag += scaling * charge * tempi.imag;
634 cksum_new.real += newScale* charge * tempi.real;
635 cksum_new.imag += newScale* charge * tempi.imag;
636 }
637 double3 tempkvec = kvec_x + kvec_y + kvec_z;
638 double rksq = dot(tempkvec, tempkvec);
639 double temp = factor * std::exp((-0.25 / alpha_squared) * rksq) / rksq;
640 Complex newV; Complex OldV;
641 if(blockIdx.x < Nblock)
642 {
643 if(UseTempVector)
644 {
645 OldV.real = Box.tempEik[kxyz].real; OldV.imag = Box.tempEik[kxyz].imag;
646 }
647 else
648 {
649 OldV.real = SameTypeEik[kxyz].real; OldV.imag = SameTypeEik[kxyz].imag;
650 }
651 newV.real = OldV.real + cksum_new.real - cksum_old.real;
652 newV.imag = OldV.imag + cksum_new.imag - cksum_old.imag;
653 tempE += temp * ComplexNorm(newV);
654 tempE -= temp * ComplexNorm(OldV);
655 Box.tempEik[kxyz] = newV;
656 }
657 else //Host-Guest//
658 {
659 OldV.real = CrossTypeEik[kxyz].real; OldV.imag = CrossTypeEik[kxyz].imag;
660 tempE += temp * (OldV.real * (cksum_new.real - cksum_old.real) + OldV.imag * (cksum_new.imag - cksum_old.imag));
661 }
662 }
663 }
664 sdata[i_within_block] = tempE;
665 __syncthreads();
666 //Partial block sum//
667 int i=blockDim.x / 2;
668 while(i != 0)
669 {
670 if(cache_id < i) {sdata[cache_id] += sdata[cache_id + i];}
671 __syncthreads();
672 i /= 2;
673 }
674 if(cache_id == 0) {Blocksum[blockIdx.x] = sdata[0];}
675}
676
677//Zhao's note: THIS IS A SPECIAL FUNCTION JUST FOR LAMBDA MOVE FOR FRACTIONAL MOLECULES//
678double2 GPU_EwaldDifference_LambdaChange(Boxsize& Box, Atoms*& d_a, Atoms& Old, ForceField& FF, double* Blocksum, Components& SystemComponents, size_t SelectedComponent, double2 oldScale, double2 newScale, int MoveType)
679{
680 if(FF.noCharges && !SystemComponents.hasPartialCharge[SelectedComponent]) return {0.0, 0.0};
681 double alpha = Box.Alpha; double alpha_squared = alpha * alpha;
682 double prefactor = Box.Prefactor * (2.0 * M_PI / Box.Volume);
683
684 size_t numberOfAtoms = SystemComponents.Moleculesize[SelectedComponent];
685 size_t Oldsize = numberOfAtoms;
686 size_t Newsize = numberOfAtoms;
687 size_t chainsize = SystemComponents.Moleculesize[SelectedComponent];
688
689 bool UseTempVector = false;
690 if(MoveType == CBCF_DELETION) // CBCF Deletion, Lambda Change is its second step //
691 {
692 UseTempVector = true;
693 }
694 // Construct exp(ik.r) for atoms and k-vectors kx, ky, kz = 0, 1 explicitly
695 Initialize_EwaldVector_LambdaChange<<<1,1>>>(Box, Box.kmax, d_a, Old, Oldsize, newScale);
696
697 //Fourier Loop//
698 size_t numberOfWaveVectors = (Box.kmax.x + 1) * (2 * Box.kmax.y + 1) * (2 * Box.kmax.z + 1);
699 size_t Nblock = 0; size_t Nthread = 0; Setup_threadblock(numberOfWaveVectors, &Nblock, &Nthread);
700
701 Complex* SameType = Box.AdsorbateEik;
702 Complex* CrossType= Box.FrameworkEik;
703
704 Fourier_Ewald_Diff_LambdaChange<<<Nblock * 2, Nthread, Nthread * sizeof(double)>>>(Box, SameType, CrossType, Old, alpha_squared, prefactor, Box.kmax, Oldsize, Newsize, Blocksum, UseTempVector, Nblock, newScale.y);
705
706 double Host_sum[Nblock * 2]; double SameSum = 0.0; double CrossSum = 0.0;
707 cudaMemcpy(Host_sum, Blocksum, 2 * Nblock * sizeof(double), cudaMemcpyDeviceToHost);
708 for(size_t i = 0; i < Nblock; i++){SameSum += Host_sum[i];}
709 for(size_t i = Nblock; i < 2 * Nblock; i++){CrossSum += Host_sum[i];}
710 //printf("Fourier GPU lambda Change: %.5f\n", tot);
711 double delta_scale = std::pow(newScale.y,2) - std::pow(oldScale.y,2);
712 double deltaExclusion = (SystemComponents.ExclusionIntra[SelectedComponent] + SystemComponents.ExclusionAtom[SelectedComponent]) * delta_scale;
713 SameSum -= deltaExclusion;
714 //printf("Lambda Ewald, Same: %.5f, Cross: %.5f\n", SameSum, CrossSum);
715 return {SameSum, 2.0 * CrossSum};
716}
717
718__global__ void Setup_Ewald_Vector(Boxsize Box, Complex* eik_x, Complex* eik_y, Complex* eik_z, Atoms* System, size_t numberOfAtoms, size_t NumberOfComponents, bool UseOffSet)
719{
720 // Construct exp(ik.r) for atoms and k-vectors kx, ky, kz = 0, 1 explicitly
721 //determine the component for i
722 size_t i = blockIdx.x * blockDim.x + threadIdx.x; //number of threads = number of atoms in the system
723 if(i < numberOfAtoms)
724 {
725 //size_t atom_i = i;
726 size_t atom_i = i;
727 Complex tempcomplex; tempcomplex.real = 1.0; tempcomplex.imag = 0.0;
728 size_t comp = 0; size_t totsize = 0;
729 for(size_t ijk = 0; ijk < NumberOfComponents; ijk++)
730 {
731 size_t Atom_ijk = System[ijk].size;
732 totsize += Atom_ijk;
733 if(atom_i >= totsize)
734 {
735 comp ++;
736 atom_i -= Atom_ijk;
737 }
738 }
739 if(UseOffSet)
740 {
741 size_t HalfAllocateSize = System[comp].Allocate_size / 2;
742 atom_i += HalfAllocateSize;
743 }
744 double3 pos;
745 pos = System[comp].pos[atom_i];
746 eik_x[i + 0 * numberOfAtoms] = tempcomplex;
747 eik_y[i + 0 * numberOfAtoms] = tempcomplex;
748 eik_z[i + 0 * numberOfAtoms] = tempcomplex;
749 double3 s; matrix_multiply_by_vector(Box.InverseCell, pos, s); s*=2*M_PI;
750 tempcomplex.real = std::cos(s.x); tempcomplex.imag = std::sin(s.x); eik_x[i + 1 * numberOfAtoms] = tempcomplex;
751 tempcomplex.real = std::cos(s.y); tempcomplex.imag = std::sin(s.y); eik_y[i + 1 * numberOfAtoms] = tempcomplex;
752 tempcomplex.real = std::cos(s.z); tempcomplex.imag = std::sin(s.z); eik_z[i + 1 * numberOfAtoms] = tempcomplex;
753 // Calculate remaining positive kx, ky and kz by recurrence
754 for(size_t kx = 2; kx <= Box.kmax.x; ++kx)
755 {
756 eik_x[i + kx * numberOfAtoms] = multiply(eik_x[i + (kx - 1) * numberOfAtoms], eik_x[i + 1 * numberOfAtoms]);
757 }
758 //printf("BEFORE THE LOOP! eik_y[1] = %.5f %.5f\n", eik_y[1].real, eik_y[1].imag);
759
760 for(size_t ky = 2; ky <= Box.kmax.y; ++ky)
761 {
762 eik_y[i + ky * numberOfAtoms] = multiply(eik_y[i + (ky - 1) * numberOfAtoms], eik_y[i + 1 * numberOfAtoms]);
763 }
764
765 for(size_t kz = 2; kz <= Box.kmax.z; ++kz)
766 {
767 eik_z[i + kz * numberOfAtoms] = multiply(eik_z[i + (kz - 1) * numberOfAtoms], eik_z[i + 1 * numberOfAtoms]);
768 }
769 }
770}
771
772//Zhao's note: Idea was that every block considers a grid point for Ewald (a set of kxyz)//
773//So the number of atoms each thread needs to consider = TotalAtoms / Nthreads per block (usually 128)//
774__global__ void TotalEwald(Atoms* d_a, Boxsize Box, double* BlockSum, Complex* eik_x, Complex* eik_y, Complex* eik_z, Complex* Eik, size_t totAtom, size_t Ncomp, size_t NAtomPerThread, size_t residueAtoms)
775{
776 __shared__ double3 sdata[128]; //maybe just need Complex//
777 int cache_id = threadIdx.x;
778 size_t total_ij = blockIdx.x * blockDim.x + threadIdx.x;
779 size_t ij_within_block = total_ij - blockIdx.x * blockDim.x;
780
781 size_t kxyz = blockIdx.x; //Each block takes over one grid point//
782 size_t kx_max = Box.kmax.x;
783 size_t ky_max = Box.kmax.y;
784 size_t kz_max = Box.kmax.z;
785 //size_t nvec = (kx_max + 1) * (2 * ky_max + 1) * (2 * kz_max + 1);
786 int kz = kxyz%(2 * kz_max + 1) - kz_max;
787 int kxy = kxyz/(2 * kz_max + 1);
788 int kx = kxy /(2 * ky_max + 1);
789 int ky = kxy %(2 * ky_max + 1) - ky_max;
790 double ksqr = static_cast<double>(kx * kx + ky * ky + kz * kz);
791
792 double alpha = Box.Alpha; double alpha_squared = alpha * alpha;
793 double prefactor = Box.Prefactor * (2.0 * M_PI / Box.Volume);
794
795 double3 ax = {Box.InverseCell[0], Box.InverseCell[3], Box.InverseCell[6]};
796 double3 ay = {Box.InverseCell[1], Box.InverseCell[4], Box.InverseCell[7]};
797 double3 az = {Box.InverseCell[2], Box.InverseCell[5], Box.InverseCell[8]};
798 double3 kvec_x = ax * 2.0 * M_PI * (double) kx;
799 double3 kvec_y = ay * 2.0 * M_PI * (double) ky;
800 double3 kvec_z = az * 2.0 * M_PI * (double) kz;
801 double factor = (kx == 0) ? (1.0 * prefactor) : (2.0 * prefactor);
802
803 double3 tempkvec = kvec_x + kvec_y + kvec_z;
804 double rksq = dot(tempkvec, tempkvec);
805 double temp = 0.0;
806
807 if(Box.UseLAMMPSEwald) //Overwrite ksqr if we use the LAMMPS Setup for Ewald//
808 {
809 const double lx = Box.Cell[0];
810 const double ly = Box.Cell[4];
811 const double lz = Box.Cell[8];
812 const double xy = Box.Cell[3];
813 const double xz = Box.Cell[6];
814 const double yz = Box.Cell[7];
815 const double ux = 2*M_PI/lx;
816 const double uy = 2*M_PI*(-xy)/lx/ly;
817 const double uz = 2*M_PI*(xy*yz - ly*xz)/lx/ly/lz;
818 const double vy = 2*M_PI/ly;
819 const double vz = 2*M_PI*(-yz)/ly/lz;
820 const double wz = 2*M_PI/lz;
821 const double kvecx = kx*ux;
822 const double kvecy = kx*uy + ky*vy;
823 const double kvecz = kx*uz + ky*vz + kz*wz;
824 ksqr = kvecx*kvecx + kvecy*kvecy + kvecz*kvecz;
825 }
826 Complex cksum; cksum.real = 0.0; cksum.imag = 0.0;
827 if((ksqr > 1e-10) && (ksqr < Box.ReciprocalCutOff))
828 {
829 temp = factor * std::exp((-0.25 / alpha_squared) * rksq) / rksq;
830 for(size_t a = 0; a < NAtomPerThread; a++)
831 {
832 size_t Atom = a + NAtomPerThread * ij_within_block;
833 size_t AbsoluteAtom = Atom; size_t comp = 0; size_t totsize = 0;
834 for(size_t it_comp = 0; it_comp < Ncomp; it_comp++)
835 {
836 size_t Atom_ijk = d_a[it_comp].size;
837 totsize += Atom_ijk;
838 if(Atom >= totsize)
839 {
840 comp++;
841 Atom -= d_a[it_comp].size;
842 }
843 }
844 Complex eik_temp = eik_y[AbsoluteAtom + totAtom * static_cast<size_t>(std::abs(ky))];
845 eik_temp.imag = ky>=0 ? eik_temp.imag : -eik_temp.imag;
846 Complex eik_xy = multiply(eik_x[AbsoluteAtom + totAtom * static_cast<size_t>(kx)], eik_temp);
847 eik_temp = eik_z[AbsoluteAtom + totAtom * static_cast<size_t>(std::abs(kz))];
848 eik_temp.imag = kz>=0 ? eik_temp.imag : -eik_temp.imag;
849
850 double charge = d_a[comp].charge[Atom];
851 double scaling = d_a[comp].scaleCoul[Atom];
852 Complex tempi = multiply(eik_xy, eik_temp);
853 cksum.real += scaling * charge * tempi.real;
854 cksum.imag += scaling * charge * tempi.imag;
855 }
856 if(residueAtoms > 0)
857 {
858 if(ij_within_block < residueAtoms) //the thread will read one more atom in the residueAtoms//
859 //The thread from zero to residueAtoms will each take another atom//
860 {
861 //Zhao's note: blockDim = number of threads in a block, blockDim.x * NAtomPerThread = the last atom position if there is no residue//
862 size_t Atom = blockDim.x * NAtomPerThread + ij_within_block;
863 size_t AbsoluteAtom = Atom; size_t comp = 0; size_t totsize = 0;
864 for(size_t it_comp = 0; it_comp < Ncomp; it_comp++)
865 {
866 size_t Atom_ijk = d_a[it_comp].size;
867 totsize += Atom_ijk;
868 if(Atom >= totsize)
869 {
870 comp++;
871 Atom -= d_a[it_comp].size;
872 }
873 }
874 Complex eik_temp = eik_y[AbsoluteAtom + totAtom * static_cast<size_t>(std::abs(ky))];
875 eik_temp.imag = ky>=0 ? eik_temp.imag : -eik_temp.imag;
876 Complex eik_xy = multiply(eik_x[AbsoluteAtom + totAtom * static_cast<size_t>(kx)], eik_temp);
877 eik_temp = eik_z[AbsoluteAtom + totAtom * static_cast<size_t>(std::abs(kz))];
878 eik_temp.imag = kz>=0 ? eik_temp.imag : -eik_temp.imag;
879
880 double charge = d_a[comp].charge[Atom];
881 double scaling = d_a[comp].scaleCoul[Atom];
882 Complex tempi = multiply(eik_xy, eik_temp);
883 cksum.real += scaling * charge * tempi.real;
884 cksum.imag += scaling * charge * tempi.imag;
885 }
886 }
887 //double2 TEMP; TEMP.x = cksum.real; TEMP.y = cksum.imag;
888 }
889 sdata[ij_within_block].x = cksum.real;
890 sdata[ij_within_block].y = cksum.imag;
891 __syncthreads();
892 //Partial block sum//
893 int i=blockDim.x / 2;
894 while(i != 0)
895 {
896 if(cache_id < i)
897 {
898 sdata[cache_id].x += sdata[cache_id + i].x;
899 sdata[cache_id].y += sdata[cache_id + i].y;
900 }
901 __syncthreads();
902 i /= 2;
903 }
904 if(cache_id == 0)
905 {
906 Complex cksum; cksum.real = sdata[0].x; cksum.imag = sdata[0].y;
907 double NORM = ComplexNorm(cksum);
908 BlockSum[blockIdx.x] = temp * NORM;
909 Eik[blockIdx.x].real = sdata[0].x;
910 Eik[blockIdx.x].imag = sdata[0].y;
911 }
912}
913
914__global__ double Calculate_Intra_Self_Exclusion(Boxsize Box, Atoms* System, double* BlockSum, size_t SelectedComponent)
915{
916 double E = 0.0; double prefactor_self = Box.Prefactor * Box.Alpha / std::sqrt(M_PI);
917 extern __shared__ double sdata[];
918 sdata[threadIdx.x] = 0.0; int cache_id = threadIdx.x;
919 //Intra
920 //Option 1: Unwrap upper triangle for pairwise interaction
921 //https://stackoverflow.com/questions/27086195/linear-index-upper-triangular-matrix
922 //MolA = NAds - 2 - std::floor(std::sqrt(-8*Ads_i + 4*NAds*(NAds-1)-7)/2.0 - 0.5);
923 //MolB = Ads_i + MolA + 1 - NAds*(NAds-1)/2 + (NAds-MolA)*((NAds- MolA)-1)/2;
924 //Option 2: each thread takes care of a molecule, easier
925 size_t THREADIdx = blockIdx.x * blockDim.x + threadIdx.x;
926 size_t Molsize = System[SelectedComponent].Molsize;
927 size_t AtomIdx = THREADIdx * Molsize;
928
929 size_t totalmol = System[SelectedComponent].size / Molsize;
930 if(AtomIdx < System[SelectedComponent].size)
931 {
932 //if(THREADIdx == 0) printf("Molsize: %lu, AtomIdx: %lu\n", Molsize, AtomIdx);
933 for(size_t i = AtomIdx; i != Molsize + AtomIdx; i++)
934 {
935 double factorA = System[SelectedComponent].scaleCoul[i] * System[SelectedComponent].charge[i];
936 double3 posA = System[SelectedComponent].pos[i];
937 //for(size_t j = i; j != Molsize; j++)
938 for(size_t j = i; j != Molsize + AtomIdx; j++)
939 {
940 if(i == j)
941 //Self: atom and itself
942 {
943 E += prefactor_self * factorA * factorA;
944 }
945
946 else//if(j < i)
947 //Intra: within a molecule
948 {
949 double factorB = System[SelectedComponent].scaleCoul[j] * System[SelectedComponent].charge[j];
950 double3 posB = System[SelectedComponent].pos[j];
951 double3 posvec = posA - posB;
952 PBC(posvec, Box.Cell, Box.InverseCell, Box.Cubic);
953 double rr_dot = dot(posvec, posvec);
954 double r = std::sqrt(rr_dot);
955 E += Box.Prefactor * factorA * factorB * std::erf(Box.Alpha * r) / r;
956 }
957
958 }
959 }
960 }
961 //if(THREADIdx == 0) printf("ThreadID 0, ThreadE: %.5f", E);
962 sdata[threadIdx.x] = -E;
963 __syncthreads();
964 //Partial block sum//
965 int i=blockDim.x / 2;
966 while(i != 0)
967 {
968 if(cache_id < i) {sdata[cache_id] += sdata[cache_id + i];}
969 __syncthreads();
970 i /= 2;
971 }
972 if(threadIdx.x == 0)
973 {
974 BlockSum[blockIdx.x] += sdata[0];
975
976 }
977}
978
979MoveEnergy Ewald_TotalEnergy(Simulations& Sim, Components& SystemComponents, bool UseOffSet)
980{
981 size_t NTotalAtom = 0;
982 size_t Nblock = 0;
983 size_t Nthread = 128;
984
985 MoveEnergy E;
986
987 Boxsize Box= Sim.Box;
988 Atoms* d_a = Sim.d_a;
989
990 for(size_t i = 0; i < SystemComponents.Total_Components; i++)
991 NTotalAtom += SystemComponents.NumberOfMolecule_for_Component[i] * SystemComponents.Moleculesize[i];
992 if(NTotalAtom > 0)
993 {
994 size_t NAtomPerThread = NTotalAtom / Nthread;
995 size_t residueAtoms = NTotalAtom % Nthread;
996 //Setup eikx, eiky, and eikz//
997 Setup_threadblock(NTotalAtom, &Nblock, &Nthread);
998
999 Complex* eikx; cudaMalloc(&eikx, NTotalAtom * (Box.kmax.x + 1) * sizeof(Complex));
1000 Complex* eiky; cudaMalloc(&eiky, NTotalAtom * (Box.kmax.y + 1) * sizeof(Complex));
1001 Complex* eikz; cudaMalloc(&eikz, NTotalAtom * (Box.kmax.z + 1) * sizeof(Complex));
1002 Setup_Ewald_Vector<<<Nblock, Nthread>>>(Box, eikx, eiky, eikz, d_a, NTotalAtom, SystemComponents.Total_Components, UseOffSet);
1003
1004 Nblock = (Box.kmax.x + 1) * (2 * Box.kmax.y + 1) * (2 * Box.kmax.z + 1);
1005 //printf("Nblocks for Ewald Total: %zu, allocated %zu blocks\n", Nblock, Sim.Nblocks);
1006 if(Nblock > Sim.Nblocks)
1007 {
1008 printf("Need to Allocate more space for blocksum\n");
1009 cudaMalloc(&Sim.Blocksum, Nblock * sizeof(double));
1010 }
1011 Nthread= 128;
1012
1013 //printf("Parallel Total Ewald (Calculation), Nblock: %zu, Nthread: %zu, NAtomPerThread: %zu, residue: %zu\n", Nblock, Nthread, NAtomPerThread, residueAtoms);
1014 //printf("kmax: %d %d %d, RecipCutoff: %.5f\n", Box.kmax.x, Box.kmax.y, Box.kmax.z, Box.ReciprocalCutOff);
1015 TotalEwald<<<Nblock, Nthread, Nthread * sizeof(double)>>>(d_a, Box, Sim.Blocksum, eikx, eiky, eikz, Box.tempEik, NTotalAtom, SystemComponents.Total_Components, NAtomPerThread, residueAtoms);
1016 cudaFree(eikx); cudaFree(eiky); cudaFree(eikz);
1017
1018 double HostTotEwald[Nblock];
1019 /*
1020 cudaMemcpy(HostTotEwald, Sim.Blocksum, Nblock * sizeof(double), cudaMemcpyDeviceToHost);
1021 for(size_t i = 0; i < Nblock; i++) E.GGEwaldE += HostTotEwald[i];
1022 //printf("Total Fourier: %.10f\n", TotEwald);
1023 //Add exclusion part //
1024 double ExclusionE = 0.0;
1025 for(size_t i = 0; i < SystemComponents.Total_Components; i++)
1026 {
1027 double Nfull = (double) SystemComponents.NumberOfMolecule_for_Component[i];
1028 if(SystemComponents.hasfractionalMolecule[i])
1029 {
1030 Nfull--;
1031 double delta = SystemComponents.Lambda[i].delta;
1032 size_t Bin = SystemComponents.Lambda[i].currentBin;
1033 double Lambda = delta * static_cast<double>(Bin);
1034 double2 Scale = SystemComponents.Lambda[i].SET_SCALE(Lambda);
1035 Nfull += std::pow(Scale.y,2);
1036 }
1037 ExclusionE = (SystemComponents.ExclusionIntra[i] + SystemComponents.ExclusionAtom[i]) * Nfull;
1038 E.GGEwaldE-= ExclusionE;
1039 }
1040 */
1041 double GGFourier = 0.0; cudaMemcpy(HostTotEwald, Sim.Blocksum, Nblock * sizeof(double), cudaMemcpyDeviceToHost);
1042 //DEBUG//
1043 //for(size_t i = 0; i < Nblock; i++) GGFourier += HostTotEwald[i];
1044 //printf("After fourier, GGFourier: %.5f,Fourier blocks: %zu\n", GGFourier, Nblock);
1045
1046 //for(size_t i = 0; i < Nblock; i++) HostTotEwald[i] = 0.0;
1047 //cudaMemcpy(Sim.Blocksum, HostTotEwald, Nblock * sizeof(double), cudaMemcpyHostToDevice);
1048 //DEBUG END//
1049
1050 //Recalculate exclusion part of Ewald summation//
1051 size_t maxblock = Nblock;
1052 for(size_t i = SystemComponents.NComponents.y; i < SystemComponents.NComponents.x; i++)
1053 {
1054 if(SystemComponents.NumberOfMolecule_for_Component[i] == 0) continue;
1055 size_t Exclusion_Nblock = 0; size_t Exclusion_Nthread = 0;
1056 Setup_threadblock(SystemComponents.NumberOfMolecule_for_Component[i], &Exclusion_Nblock, &Exclusion_Nthread);
1057 printf("Component %zu, Nblock: %zu, Nthread: %zu\n", i, Exclusion_Nblock, Exclusion_Nthread);
1058 Calculate_Intra_Self_Exclusion<<<Exclusion_Nblock, Exclusion_Nthread, Exclusion_Nthread * sizeof(double)>>>(Box, d_a, Sim.Blocksum, i); checkCUDAErrorEwald("error Calculating Intra Self Exclusion for Ewald Summation for Total Ewald summation!!!");
1059 //if(Exclusion_Nblock > maxblock) maxblock = Exclusion_Nblock; //Zhao's note: you might run into special cases, but still there are special treatments that needs to be done (currently not done)
1060 //Curently we are assuming that the number of blocks for self+exclusion is smaller than total number of kspace points (99.99999% true)
1061 }
1062
1063 //Zhao's note: assume that you use less blocks for intra-self exclusion
1064 cudaMemcpy(HostTotEwald, Sim.Blocksum, maxblock * sizeof(double), cudaMemcpyDeviceToHost);
1065 for(size_t i = 0; i < maxblock; i++) E.GGEwaldE += HostTotEwald[i];
1066 printf("GPU Ewald: %.5f\n", E.GGEwaldE);
1067 }
1068 return E;
1069}
Definition data_struct.h:746
Definition data_struct.h:819
Definition data_struct.h:46
Definition data_struct.h:843
Definition data_struct.h:794
Definition data_struct.h:615
Definition data_struct.h:1027