2#include "ewald_preparation.h"
3inline void checkCUDAErrorEwald(
const char *msg)
5 cudaError_t err = cudaGetLastError();
6 if( cudaSuccess != err)
8 printf(
"CUDA Error: %s: %s.\n", msg, cudaGetErrorString(err) );
28__device__
double ComplexNorm(
Complex a)
30 return a.real * a.real + a.imag * a.imag;
36 c.real = a.real*b.real - a.imag*b.imag;
37 c.imag = a.real*b.imag + a.imag*b.real;
41__device__
void Initialize_Vectors(
Boxsize Box,
size_t Oldsize,
size_t Newsize,
Atoms Old,
size_t numberOfAtoms, int3 kmax)
47 Complex tempcomplex; tempcomplex.real = 1.0; tempcomplex.imag = 0.0;
48 for(
size_t posi=0; posi < Oldsize; ++posi)
50 tempcomplex.real = 1.0; tempcomplex.imag = 0.0;
51 double3 pos = Old.pos[posi];
52 Box.eik_x[posi + 0 * numberOfAtoms] = tempcomplex;
53 Box.eik_y[posi + 0 * numberOfAtoms] = tempcomplex;
54 Box.eik_z[posi + 0 * numberOfAtoms] = tempcomplex;
55 double3 s; matrix_multiply_by_vector(Box.InverseCell, pos, s); s*=2*M_PI;
56 tempcomplex.real = std::cos(s.x); tempcomplex.imag = std::sin(s.x); Box.eik_x[posi + 1 * numberOfAtoms] = tempcomplex;
57 tempcomplex.real = std::cos(s.y); tempcomplex.imag = std::sin(s.y); Box.eik_y[posi + 1 * numberOfAtoms] = tempcomplex;
58 tempcomplex.real = std::cos(s.z); tempcomplex.imag = std::sin(s.z); Box.eik_z[posi + 1 * numberOfAtoms] = tempcomplex;
61 for(
size_t posi=Oldsize; posi < Oldsize + Newsize; ++posi)
63 tempcomplex.real = 1.0; tempcomplex.imag = 0.0;
64 double3 pos = Old.pos[posi];
65 Box.eik_x[posi + 0 * numberOfAtoms] = tempcomplex;
66 Box.eik_y[posi + 0 * numberOfAtoms] = tempcomplex;
67 Box.eik_z[posi + 0 * numberOfAtoms] = tempcomplex;
68 double3 s ; matrix_multiply_by_vector(Box.InverseCell, pos, s); s*=2*M_PI;
69 tempcomplex.real = std::cos(s.x); tempcomplex.imag = std::sin(s.x); Box.eik_x[posi + 1 * numberOfAtoms] = tempcomplex;
70 tempcomplex.real = std::cos(s.y); tempcomplex.imag = std::sin(s.y); Box.eik_y[posi + 1 * numberOfAtoms] = tempcomplex;
71 tempcomplex.real = std::cos(s.z); tempcomplex.imag = std::sin(s.z); Box.eik_z[posi + 1 * numberOfAtoms] = tempcomplex;
74 for(
size_t kx = 2; kx <= kx_max; ++kx)
76 for(
size_t i = 0; i != numberOfAtoms; ++i)
78 Box.eik_x[i + kx * numberOfAtoms] = multiply(Box.eik_x[i + (kx - 1) * numberOfAtoms], Box.eik_x[i + 1 * numberOfAtoms]);
81 for(
size_t ky = 2; ky <= ky_max; ++ky)
83 for(
size_t i = 0; i != numberOfAtoms; ++i)
85 Box.eik_y[i + ky * numberOfAtoms] = multiply(Box.eik_y[i + (ky - 1) * numberOfAtoms], Box.eik_y[i + 1 * numberOfAtoms]);
88 for(
size_t kz = 2; kz <= kz_max; ++kz)
90 for(
size_t i = 0; i != numberOfAtoms; ++i)
92 Box.eik_z[i + kz * numberOfAtoms] = multiply(Box.eik_z[i + (kz - 1) * numberOfAtoms], Box.eik_z[i + 1 * numberOfAtoms]);
97__global__
void Initialize_EwaldVector_General(
Boxsize Box, int3 kmax,
Atoms* d_a,
Atoms New,
Atoms Old,
size_t Oldsize,
size_t Newsize,
size_t SelectedComponent,
size_t Location,
size_t chainsize,
size_t numberOfAtoms,
int MoveType)
100 if(MoveType == TRANSLATION || MoveType == ROTATION || MoveType == SPECIAL_ROTATION || MoveType == SINGLE_INSERTION || MoveType == SINGLE_DELETION)
103 for(
size_t i = Oldsize; i < Oldsize + Newsize; i++)
105 Old.pos[i] = New.pos[i - Oldsize];
106 Old.scale[i] = New.scale[i - Oldsize];
107 Old.charge[i] = New.charge[i - Oldsize];
108 Old.scaleCoul[i] = New.scaleCoul[i - Oldsize];
111 else if(MoveType == INSERTION || MoveType == CBCF_INSERTION)
114 for(
size_t i = 0; i < chainsize; i++)
116 Old.pos[i + 1] = New.pos[Location * chainsize + i];
117 Old.scale[i + 1] = New.scale[Location * chainsize + i];
118 Old.charge[i + 1] = New.charge[Location * chainsize + i];
119 Old.scaleCoul[i + 1] = New.scaleCoul[Location * chainsize + i];
122 else if(MoveType == DELETION || MoveType == CBCF_DELETION)
124 for(
size_t i = 0; i < Oldsize; i++)
127 Old.pos[i] = d_a[SelectedComponent].pos[Location + i];
128 Old.scale[i] = d_a[SelectedComponent].scale[Location + i];
129 Old.charge[i] = d_a[SelectedComponent].charge[Location + i];
130 Old.scaleCoul[i] = d_a[SelectedComponent].scaleCoul[Location + i];
133 Initialize_Vectors(Box, Oldsize, Newsize, Old, numberOfAtoms, kmax);
136__global__
void Initialize_EwaldVector_LambdaChange(
Boxsize Box, int3 kmax,
Atoms* d_a,
Atoms Old,
size_t Oldsize, double2 newScale)
138 Initialize_Vectors(Box, Oldsize, 0, Old, Oldsize, kmax);
141__global__
void Initialize_EwaldVector_Reinsertion(
Boxsize Box, int3 kmax, double3* temp,
Atoms* d_a,
Atoms Old,
size_t Oldsize,
size_t Newsize,
size_t realpos,
size_t numberOfAtoms,
size_t SelectedComponent)
143 for(
size_t i = 0; i < Oldsize; i++)
145 Old.pos[i] = d_a[SelectedComponent].pos[realpos + i];
146 Old.scale[i] = d_a[SelectedComponent].scale[realpos + i];
147 Old.charge[i] = d_a[SelectedComponent].charge[realpos + i];
148 Old.scaleCoul[i] = d_a[SelectedComponent].scaleCoul[realpos + i];
151 for(
size_t i = Oldsize; i < Oldsize + Newsize; i++)
153 Old.pos[i] = temp[i - Oldsize];
154 Old.scale[i] = d_a[SelectedComponent].scale[realpos + i - Oldsize];
155 Old.charge[i] = d_a[SelectedComponent].charge[realpos + i - Oldsize];
156 Old.scaleCoul[i] = d_a[SelectedComponent].scaleCoul[realpos + i - Oldsize];
158 Initialize_Vectors(Box, Oldsize, Newsize, Old, numberOfAtoms, kmax);
161__global__
void Initialize_EwaldVector_IdentitySwap(
Boxsize Box, int3 kmax, double3* temp,
Atoms* d_a,
Atoms Old,
size_t Oldsize,
size_t Newsize,
size_t realpos,
size_t numberOfAtoms,
size_t OLDComponent,
size_t NEWComponent)
163 for(
size_t i = 0; i < Oldsize; i++)
165 Old.pos[i] = d_a[OLDComponent].pos[realpos + i];
166 Old.scale[i] = d_a[OLDComponent].scale[realpos + i];
167 Old.charge[i] = d_a[OLDComponent].charge[realpos + i];
168 Old.scaleCoul[i] = d_a[OLDComponent].scaleCoul[realpos + i];
172 for(
size_t i = Oldsize; i < Oldsize + Newsize; i++)
174 Old.pos[i] = temp[i - Oldsize];
176 Old.charge[i] = d_a[NEWComponent].charge[i - Oldsize];
177 Old.scaleCoul[i] = 1.0;
179 Initialize_Vectors(Box, Oldsize, Newsize, Old, numberOfAtoms, kmax);
182__global__
void JustStore_Ewald(
Boxsize Box,
size_t nvec)
184 size_t i = blockIdx.x * blockDim.x + threadIdx.x;
185 if(i < nvec) Box.tempEik[i] = Box.AdsorbateEik[i];
192__global__
void Fourier_Ewald_Diff(
Boxsize Box,
Complex* SameTypeEik,
Complex* CrossTypeEik,
Atoms Old,
double alpha_squared,
double prefactor, int3 kmax,
size_t Oldsize,
size_t Newsize,
double* Blocksum,
bool UseTempVector,
size_t Nblock)
196 extern __shared__
double sdata[];
197 size_t kxyz = blockIdx.x * blockDim.x + threadIdx.x;
198 int cache_id = threadIdx.x;
199 size_t i_within_block = kxyz - blockIdx.x * blockDim.x;
201 size_t kx_max = kmax.x;
202 size_t ky_max = kmax.y;
203 size_t kz_max = kmax.z;
204 size_t nvec = (kx_max + 1) * (2 * ky_max + 1) * (2 * kz_max + 1);
206 if(blockIdx.x >= Nblock) kxyz -= blockDim.x * Nblock;
210 sdata[i_within_block] = 0.0;
211 int kz = kxyz%(2 * kz_max + 1) - kz_max;
212 int kxy = kxyz/(2 * kz_max + 1);
213 int kx = kxy /(2 * ky_max + 1);
214 int ky = kxy %(2 * ky_max + 1) - ky_max;
215 double ksqr =
static_cast<double>(kx * kx + ky * ky + kz * kz);
216 if(Box.UseLAMMPSEwald)
218 const double lx = Box.Cell[0];
219 const double ly = Box.Cell[4];
220 const double lz = Box.Cell[8];
221 const double xy = Box.Cell[3];
222 const double xz = Box.Cell[6];
223 const double yz = Box.Cell[7];
224 const double ux = 2*M_PI/lx;
225 const double uy = 2*M_PI*(-xy)/lx/ly;
226 const double uz = 2*M_PI*(xy*yz - ly*xz)/lx/ly/lz;
227 const double vy = 2*M_PI/ly;
228 const double vz = 2*M_PI*(-yz)/ly/lz;
229 const double wz = 2*M_PI/lz;
230 const double kvecx = kx*ux;
231 const double kvecy = kx*uy + ky*vy;
232 const double kvecz = kx*uz + ky*vz + kz*wz;
233 ksqr = kvecx*kvecx + kvecy*kvecy + kvecz*kvecz;
235 if((ksqr > 1e-10) && (ksqr < Box.ReciprocalCutOff))
237 double3 ax; ax.x = Box.InverseCell[0]; ax.y = Box.InverseCell[3]; ax.z = Box.InverseCell[6];
238 double3 ay; ay.x = Box.InverseCell[1]; ay.y = Box.InverseCell[4]; ay.z = Box.InverseCell[7];
239 double3 az; az.x = Box.InverseCell[2]; az.y = Box.InverseCell[5]; az.z = Box.InverseCell[8];
240 size_t numberOfAtoms = Oldsize + Newsize;
241 Complex cksum_old; cksum_old.real = 0.0; cksum_old.imag = 0.0;
242 Complex cksum_new; cksum_new.real = 0.0; cksum_new.imag = 0.0;
243 double3 kvec_x = ax * 2.0 * M_PI * (double) kx;
244 double3 kvec_y = ay * 2.0 * M_PI * (double) ky;
245 double3 kvec_z = az * 2.0 * M_PI * (double) kz;
246 double factor = (kx == 0) ? (1.0 * prefactor) : (2.0 * prefactor);
248 for(
size_t i=0; i<Oldsize + Newsize; ++i)
250 Complex eik_temp = Box.eik_y[i + numberOfAtoms *
static_cast<size_t>(std::abs(ky))];
251 eik_temp.imag = ky>=0 ? eik_temp.imag : -eik_temp.imag;
252 Complex eik_xy = multiply(Box.eik_x[i + numberOfAtoms *
static_cast<size_t>(kx)], eik_temp);
254 eik_temp = Box.eik_z[i + numberOfAtoms *
static_cast<size_t>(std::abs(kz))];
255 eik_temp.imag = kz>=0 ? eik_temp.imag : -eik_temp.imag;
256 double charge = Old.charge[i];
257 double scaling = Old.scaleCoul[i];
258 Complex tempi = multiply(eik_xy, eik_temp);
261 cksum_old.real += scaling * charge * tempi.real;
262 cksum_old.imag += scaling * charge * tempi.imag;
266 cksum_new.real += scaling * charge * tempi.real;
267 cksum_new.imag += scaling * charge * tempi.imag;
270 double3 tempkvec = kvec_x + kvec_y + kvec_z;
271 double rksq = dot(tempkvec, tempkvec);
272 double temp = factor * std::exp((-0.25 / alpha_squared) * rksq) / rksq;
275 if(blockIdx.x < Nblock)
279 OldV.real = Box.tempEik[kxyz].real; OldV.imag = Box.tempEik[kxyz].imag;
283 OldV.real = SameTypeEik[kxyz].real; OldV.imag = SameTypeEik[kxyz].imag;
285 newV.real = OldV.real + cksum_new.real - cksum_old.real;
286 newV.imag = OldV.imag + cksum_new.imag - cksum_old.imag;
287 tempE += temp * ComplexNorm(newV);
288 tempE -= temp * ComplexNorm(OldV);
289 Box.tempEik[kxyz] = newV;
293 OldV.real = CrossTypeEik[kxyz].real; OldV.imag = CrossTypeEik[kxyz].imag;
294 tempE += temp * (OldV.real * (cksum_new.real - cksum_old.real) + OldV.imag * (cksum_new.imag - cksum_old.imag));
298 sdata[i_within_block] = tempE;
301 int i=blockDim.x / 2;
304 if(cache_id < i) {sdata[cache_id] += sdata[cache_id + i];}
308 if(cache_id == 0) {Blocksum[blockIdx.x] = sdata[0];}
313 size_t numberOfWaveVectors = (Box.kmax.x + 1) * (2 * Box.kmax.y + 1) * (2 * Box.kmax.z + 1);
314 size_t Nblock = 0;
size_t Nthread = 0; Setup_threadblock(numberOfWaveVectors, &Nblock, &Nthread);
315 JustStore_Ewald<<<Nblock, Nthread>>>(Box, numberOfWaveVectors);
318__global__
void Update_Ewald_Stored(
Complex* Eik,
Complex* Temp_Eik,
size_t nvec)
320 size_t i = blockIdx.x * blockDim.x + threadIdx.x;
321 if(i < nvec) Eik[i] = Temp_Eik[i];
323void Update_Ewald_Vector(
Boxsize& Box,
bool CPU,
Components& SystemComponents,
size_t SelectedComponent)
327 size_t numberOfWaveVectors = (Box.kmax.x + 1) * (2 * Box.kmax.y + 1) * (2 * Box.kmax.z + 1);
328 size_t Nblock = 0;
size_t Nthread = 0; Setup_threadblock(numberOfWaveVectors, &Nblock, &Nthread);
330 if(SelectedComponent < SystemComponents.NComponents.y)
332 Stored_Eik = Box.FrameworkEik;
336 Stored_Eik = Box.AdsorbateEik;
338 Update_Ewald_Stored<<<Nblock, Nthread>>>(Stored_Eik, Box.tempEik, numberOfWaveVectors);
345double2 GPU_EwaldDifference_General(
Boxsize& Box,
Atoms*& d_a,
Atoms& New,
Atoms& Old,
ForceField& FF,
double* Blocksum,
Components& SystemComponents,
size_t SelectedComponent,
int MoveType,
size_t Location, double2 Scale)
347 if(FF.noCharges && !SystemComponents.hasPartialCharge[SelectedComponent])
return {0.0, 0.0};
349 double start = omp_get_wtime();
350 double alpha = Box.Alpha;
double alpha_squared = alpha * alpha;
351 double prefactor = Box.Prefactor * (2.0 * M_PI / Box.Volume);
353 size_t Oldsize = 0;
size_t Newsize = 0;
size_t chainsize = 0;
354 bool UseTempVector =
false;
360 if(SelectedComponent < SystemComponents.NComponents.y)
362 SameType = Box.FrameworkEik; CrossType = Box.AdsorbateEik;
366 SameType = Box.AdsorbateEik; CrossType = Box.FrameworkEik;
370 case TRANSLATION:
case ROTATION:
case SPECIAL_ROTATION:
372 Oldsize = SystemComponents.Moleculesize[SelectedComponent];
373 Newsize = SystemComponents.Moleculesize[SelectedComponent];
374 chainsize = SystemComponents.Moleculesize[SelectedComponent];
377 case INSERTION:
case SINGLE_INSERTION:
380 Newsize = SystemComponents.Moleculesize[SelectedComponent];
381 chainsize = SystemComponents.Moleculesize[SelectedComponent] - 1;
384 case DELETION:
case SINGLE_DELETION:
386 Oldsize = SystemComponents.Moleculesize[SelectedComponent];
388 chainsize = SystemComponents.Moleculesize[SelectedComponent] - 1;
393 throw std::runtime_error(
"Use the Special Function for Reinsertion");
398 throw std::runtime_error(
"Use the Special Function for IDENTITY SWAP!");
400 case CBCF_LAMBDACHANGE:
402 throw std::runtime_error(
"Use the Special Function for CBCF Lambda Change");
411 Newsize = SystemComponents.Moleculesize[SelectedComponent];
412 chainsize = SystemComponents.Moleculesize[SelectedComponent] - 1;
413 UseTempVector =
true;
418 Oldsize = SystemComponents.Moleculesize[SelectedComponent];
420 chainsize = SystemComponents.Moleculesize[SelectedComponent] - 1;
424 size_t numberOfAtoms = Oldsize + Newsize;
426 Initialize_EwaldVector_General<<<1,1>>>(Box, Box.kmax, d_a, New, Old, Oldsize, Newsize, SelectedComponent, Location, chainsize, numberOfAtoms, MoveType); checkCUDAErrorEwald(
"error Initializing Ewald Vectors");
429 size_t numberOfWaveVectors = (Box.kmax.x + 1) * (2 * Box.kmax.y + 1) * (2 * Box.kmax.z + 1);
430 size_t Nblock = 0;
size_t Nthread = 0; Setup_threadblock(numberOfWaveVectors, &Nblock, &Nthread);
433 Fourier_Ewald_Diff<<<Nblock * 2, Nthread, Nthread *
sizeof(double)>>>(Box, SameType, CrossType, Old, alpha_squared, prefactor, Box.kmax, Oldsize, Newsize, Blocksum, UseTempVector, Nblock);
435 double sum[Nblock * 2];
double SameSum = 0.0;
double CrossSum = 0.0;
436 cudaMemcpy(sum, Blocksum, 2 * Nblock *
sizeof(
double), cudaMemcpyDeviceToHost);
437 for(
size_t i = 0; i < Nblock; i++){SameSum += sum[i];}
438 for(
size_t i = Nblock; i < 2 * Nblock; i++){CrossSum += sum[i];}
440 double deltaExclusion = 0.0;
442 if(SystemComponents.CURRENTCYCLE == 4)
444 printf(
"Number of blocks: %zu, Nthread: %zu\n", Nblock, Nthread);
445 printf(
"GPU Fourier Energy (SameType): %.5f, (CrossType) %.5f\n", SameSum, 2.0 * CrossSum);
448 if(SystemComponents.rigid[SelectedComponent])
450 if(MoveType == INSERTION || MoveType == SINGLE_INSERTION)
454 double delta_scale = std::pow(Scale.y,2) - 0.0;
455 deltaExclusion = (SystemComponents.ExclusionIntra[SelectedComponent] + SystemComponents.ExclusionAtom[SelectedComponent]) * delta_scale;
457 SameSum -= deltaExclusion;
459 else if(MoveType == DELETION || MoveType == SINGLE_DELETION)
461 double delta_scale = 0.0 - 1.0;
462 deltaExclusion = (SystemComponents.ExclusionIntra[SelectedComponent] + SystemComponents.ExclusionAtom[SelectedComponent]) * delta_scale;
463 SameSum -= deltaExclusion;
465 else if(MoveType == CBCF_INSERTION)
467 double delta_scale = std::pow(Scale.y,2) - 0.0;
468 deltaExclusion = (SystemComponents.ExclusionIntra[SelectedComponent] + SystemComponents.ExclusionAtom[SelectedComponent]) * delta_scale;
469 SameSum -= deltaExclusion;
471 else if(MoveType == CBCF_DELETION)
473 double delta_scale = 0.0 - std::pow(Scale.y,2);
474 deltaExclusion = (SystemComponents.ExclusionIntra[SelectedComponent] + SystemComponents.ExclusionAtom[SelectedComponent]) * delta_scale;
475 SameSum -= deltaExclusion;
479 double end = omp_get_wtime();
480 return {SameSum, 2.0 * CrossSum};
484double2 GPU_EwaldDifference_Reinsertion(
Boxsize& Box,
Atoms*& d_a,
Atoms& Old, double3* temp,
ForceField& FF,
double* Blocksum,
Components& SystemComponents,
size_t SelectedComponent,
size_t UpdateLocation)
486 if(FF.noCharges && !SystemComponents.hasPartialCharge[SelectedComponent])
return {0.0, 0.0};
487 double alpha = Box.Alpha;
double alpha_squared = alpha * alpha;
488 double prefactor = Box.Prefactor * (2.0 * M_PI / Box.Volume);
490 size_t numberOfAtoms = SystemComponents.Moleculesize[SelectedComponent];
491 size_t Oldsize = 0;
size_t Newsize = numberOfAtoms;
493 Oldsize = SystemComponents.Moleculesize[SelectedComponent];
494 numberOfAtoms += Oldsize;
496 Complex* SameType = Box.AdsorbateEik;
Complex* CrossType = Box.FrameworkEik;
499 Initialize_EwaldVector_Reinsertion<<<1,1>>>(Box, Box.kmax, temp, d_a, Old, Oldsize, Newsize, UpdateLocation, numberOfAtoms, SelectedComponent);
502 size_t numberOfWaveVectors = (Box.kmax.x + 1) * (2 * Box.kmax.y + 1) * (2 * Box.kmax.z + 1);
503 size_t Nblock = 0;
size_t Nthread = 0; Setup_threadblock(numberOfWaveVectors, &Nblock, &Nthread);
504 Fourier_Ewald_Diff<<<Nblock * 2, Nthread, Nthread *
sizeof(double)>>>(Box, SameType, CrossType, Old, alpha_squared, prefactor, Box.kmax, Oldsize, Newsize, Blocksum,
false, Nblock);
505 double sum[Nblock * 2];
double SameSum = 0.0;
double CrossSum = 0.0;
506 cudaMemcpy(sum, Blocksum, 2 * Nblock *
sizeof(
double), cudaMemcpyDeviceToHost);
508 for(
size_t i = 0; i < Nblock; i++){SameSum += sum[i];}
509 for(
size_t i = Nblock; i < 2 * Nblock; i++){CrossSum += sum[i];}
511 return {SameSum, 2.0 * CrossSum};
514double2 GPU_EwaldDifference_IdentitySwap(
Boxsize& Box,
Atoms*& d_a,
Atoms& Old, double3* temp,
ForceField& FF,
double* Blocksum,
Components& SystemComponents,
size_t OLDComponent,
size_t NEWComponent,
size_t UpdateLocation)
516 if(FF.noCharges && !SystemComponents.hasPartialCharge[NEWComponent] && !SystemComponents.hasPartialCharge[OLDComponent])
return {0.0, 0.0};
517 double alpha = Box.Alpha;
double alpha_squared = alpha * alpha;
518 double prefactor = Box.Prefactor * (2.0 * M_PI / Box.Volume);
520 size_t numberOfAtoms = 0;
521 size_t Oldsize = 0;
size_t Newsize = 0;
522 if(SystemComponents.hasPartialCharge[OLDComponent])
524 Oldsize = SystemComponents.Moleculesize[OLDComponent];
526 if(SystemComponents.hasPartialCharge[NEWComponent])
528 Newsize = SystemComponents.Moleculesize[NEWComponent];
530 numberOfAtoms = Oldsize + Newsize;
533 Initialize_EwaldVector_IdentitySwap<<<1,1>>>(Box, Box.kmax, temp, d_a, Old, Oldsize, Newsize, UpdateLocation, numberOfAtoms, OLDComponent, NEWComponent);
535 Complex* SameType = Box.AdsorbateEik;
Complex* CrossType = Box.FrameworkEik;
537 size_t numberOfWaveVectors = (Box.kmax.x + 1) * (2 * Box.kmax.y + 1) * (2 * Box.kmax.z + 1);
538 size_t Nblock = 0;
size_t Nthread = 0; Setup_threadblock(numberOfWaveVectors, &Nblock, &Nthread);
539 Fourier_Ewald_Diff<<<Nblock * 2, Nthread, Nthread *
sizeof(double)>>>(Box, SameType, CrossType, Old, alpha_squared, prefactor, Box.kmax, Oldsize, Newsize, Blocksum,
false, Nblock);
540 double sum[Nblock * 2];
double SameSum = 0.0;
double CrossSum = 0.0;
541 cudaMemcpy(sum, Blocksum, 2 * Nblock *
sizeof(
double), cudaMemcpyDeviceToHost);
543 for(
size_t i = 0; i < Nblock; i++){SameSum += sum[i];}
544 for(
size_t i = Nblock; i < 2 * Nblock; i++){CrossSum += sum[i];}
547 if(SystemComponents.rigid[NEWComponent] && SystemComponents.hasPartialCharge[NEWComponent])
549 double delta_scale = 1.0;
550 double deltaExclusion = (SystemComponents.ExclusionIntra[NEWComponent] + SystemComponents.ExclusionAtom[NEWComponent]) * delta_scale;
551 SameSum -= deltaExclusion;
553 if(SystemComponents.rigid[OLDComponent] && SystemComponents.hasPartialCharge[OLDComponent])
555 double delta_scale = -1.0;
556 double deltaExclusion = (SystemComponents.ExclusionIntra[OLDComponent] + SystemComponents.ExclusionAtom[OLDComponent]) * delta_scale;
557 SameSum -= deltaExclusion;
560 return {SameSum, 2.0 * CrossSum};
566__global__
void Fourier_Ewald_Diff_LambdaChange(
Boxsize Box,
Complex* SameTypeEik,
Complex* CrossTypeEik,
Atoms Old,
double alpha_squared,
double prefactor, int3 kmax,
size_t Oldsize,
size_t Newsize,
double* Blocksum,
bool UseTempVector,
size_t Nblock,
double newScale)
568 extern __shared__
double sdata[];
569 size_t kxyz = blockIdx.x * blockDim.x + threadIdx.x;
570 int cache_id = threadIdx.x;
571 size_t i_within_block = kxyz - blockIdx.x * blockDim.x;
573 size_t kx_max = kmax.x;
574 size_t ky_max = kmax.y;
575 size_t kz_max = kmax.z;
576 size_t nvec = (kx_max + 1) * (2 * ky_max + 1) * (2 * kz_max + 1);
578 if(blockIdx.x >= Nblock) kxyz -= blockDim.x * Nblock;
581 sdata[i_within_block] = 0.0;
582 int kz = kxyz%(2 * kz_max + 1) - kz_max;
583 int kxy = kxyz/(2 * kz_max + 1);
584 int kx = kxy /(2 * ky_max + 1);
585 int ky = kxy %(2 * ky_max + 1) - ky_max;
586 double ksqr =
static_cast<double>(kx * kx + ky * ky + kz * kz);
588 if(Box.UseLAMMPSEwald)
590 const double lx = Box.Cell[0];
591 const double ly = Box.Cell[4];
592 const double lz = Box.Cell[8];
593 const double xy = Box.Cell[3];
594 const double xz = Box.Cell[6];
595 const double yz = Box.Cell[7];
596 const double ux = 2*M_PI/lx;
597 const double uy = 2*M_PI*(-xy)/lx/ly;
598 const double uz = 2*M_PI*(xy*yz - ly*xz)/lx/ly/lz;
599 const double vy = 2*M_PI/ly;
600 const double vz = 2*M_PI*(-yz)/ly/lz;
601 const double wz = 2*M_PI/lz;
602 const double kvecx = kx*ux;
603 const double kvecy = kx*uy + ky*vy;
604 const double kvecz = kx*uz + ky*vz + kz*wz;
605 ksqr = kvecx*kvecx + kvecy*kvecy + kvecz*kvecz;
607 if((ksqr > 1e-10) && (ksqr < Box.ReciprocalCutOff))
609 double3 ax = {Box.InverseCell[0], Box.InverseCell[3], Box.InverseCell[6]};
610 double3 ay = {Box.InverseCell[1], Box.InverseCell[4], Box.InverseCell[7]};
611 double3 az = {Box.InverseCell[2], Box.InverseCell[5], Box.InverseCell[8]};
612 size_t numberOfAtoms = Oldsize;
613 Complex cksum_old; cksum_old.real = 0.0; cksum_old.imag = 0.0;
614 Complex cksum_new; cksum_new.real = 0.0; cksum_new.imag = 0.0;
615 double3 kvec_x = ax * 2.0 * M_PI * (double) kx;
616 double3 kvec_y = ay * 2.0 * M_PI * (double) ky;
617 double3 kvec_z = az * 2.0 * M_PI * (double) kz;
618 double factor = (kx == 0) ? (1.0 * prefactor) : (2.0 * prefactor);
620 for(
size_t i=0; i<Oldsize; ++i)
622 Complex eik_temp = Box.eik_y[i + numberOfAtoms *
static_cast<size_t>(std::abs(ky))];
623 eik_temp.imag = ky>=0 ? eik_temp.imag : -eik_temp.imag;
624 Complex eik_xy = multiply(Box.eik_x[i + numberOfAtoms *
static_cast<size_t>(kx)], eik_temp);
626 eik_temp = Box.eik_z[i + numberOfAtoms *
static_cast<size_t>(std::abs(kz))];
627 eik_temp.imag = kz>=0 ? eik_temp.imag : -eik_temp.imag;
628 double charge = Old.charge[i];
629 double scaling = Old.scaleCoul[i];
630 Complex tempi = multiply(eik_xy, eik_temp);
632 cksum_old.real += scaling * charge * tempi.real;
633 cksum_old.imag += scaling * charge * tempi.imag;
634 cksum_new.real += newScale* charge * tempi.real;
635 cksum_new.imag += newScale* charge * tempi.imag;
637 double3 tempkvec = kvec_x + kvec_y + kvec_z;
638 double rksq = dot(tempkvec, tempkvec);
639 double temp = factor * std::exp((-0.25 / alpha_squared) * rksq) / rksq;
641 if(blockIdx.x < Nblock)
645 OldV.real = Box.tempEik[kxyz].real; OldV.imag = Box.tempEik[kxyz].imag;
649 OldV.real = SameTypeEik[kxyz].real; OldV.imag = SameTypeEik[kxyz].imag;
651 newV.real = OldV.real + cksum_new.real - cksum_old.real;
652 newV.imag = OldV.imag + cksum_new.imag - cksum_old.imag;
653 tempE += temp * ComplexNorm(newV);
654 tempE -= temp * ComplexNorm(OldV);
655 Box.tempEik[kxyz] = newV;
659 OldV.real = CrossTypeEik[kxyz].real; OldV.imag = CrossTypeEik[kxyz].imag;
660 tempE += temp * (OldV.real * (cksum_new.real - cksum_old.real) + OldV.imag * (cksum_new.imag - cksum_old.imag));
664 sdata[i_within_block] = tempE;
667 int i=blockDim.x / 2;
670 if(cache_id < i) {sdata[cache_id] += sdata[cache_id + i];}
674 if(cache_id == 0) {Blocksum[blockIdx.x] = sdata[0];}
678double2 GPU_EwaldDifference_LambdaChange(
Boxsize& Box,
Atoms*& d_a,
Atoms& Old,
ForceField& FF,
double* Blocksum,
Components& SystemComponents,
size_t SelectedComponent, double2 oldScale, double2 newScale,
int MoveType)
680 if(FF.noCharges && !SystemComponents.hasPartialCharge[SelectedComponent])
return {0.0, 0.0};
681 double alpha = Box.Alpha;
double alpha_squared = alpha * alpha;
682 double prefactor = Box.Prefactor * (2.0 * M_PI / Box.Volume);
684 size_t numberOfAtoms = SystemComponents.Moleculesize[SelectedComponent];
685 size_t Oldsize = numberOfAtoms;
686 size_t Newsize = numberOfAtoms;
687 size_t chainsize = SystemComponents.Moleculesize[SelectedComponent];
689 bool UseTempVector =
false;
690 if(MoveType == CBCF_DELETION)
692 UseTempVector =
true;
695 Initialize_EwaldVector_LambdaChange<<<1,1>>>(Box, Box.kmax, d_a, Old, Oldsize, newScale);
698 size_t numberOfWaveVectors = (Box.kmax.x + 1) * (2 * Box.kmax.y + 1) * (2 * Box.kmax.z + 1);
699 size_t Nblock = 0;
size_t Nthread = 0; Setup_threadblock(numberOfWaveVectors, &Nblock, &Nthread);
701 Complex* SameType = Box.AdsorbateEik;
702 Complex* CrossType= Box.FrameworkEik;
704 Fourier_Ewald_Diff_LambdaChange<<<Nblock * 2, Nthread, Nthread *
sizeof(double)>>>(Box, SameType, CrossType, Old, alpha_squared, prefactor, Box.kmax, Oldsize, Newsize, Blocksum, UseTempVector, Nblock, newScale.y);
706 double Host_sum[Nblock * 2];
double SameSum = 0.0;
double CrossSum = 0.0;
707 cudaMemcpy(Host_sum, Blocksum, 2 * Nblock *
sizeof(
double), cudaMemcpyDeviceToHost);
708 for(
size_t i = 0; i < Nblock; i++){SameSum += Host_sum[i];}
709 for(
size_t i = Nblock; i < 2 * Nblock; i++){CrossSum += Host_sum[i];}
711 double delta_scale = std::pow(newScale.y,2) - std::pow(oldScale.y,2);
712 double deltaExclusion = (SystemComponents.ExclusionIntra[SelectedComponent] + SystemComponents.ExclusionAtom[SelectedComponent]) * delta_scale;
713 SameSum -= deltaExclusion;
715 return {SameSum, 2.0 * CrossSum};
718__global__
void Setup_Ewald_Vector(
Boxsize Box,
Complex* eik_x,
Complex* eik_y,
Complex* eik_z,
Atoms* System,
size_t numberOfAtoms,
size_t NumberOfComponents,
bool UseOffSet)
722 size_t i = blockIdx.x * blockDim.x + threadIdx.x;
723 if(i < numberOfAtoms)
727 Complex tempcomplex; tempcomplex.real = 1.0; tempcomplex.imag = 0.0;
728 size_t comp = 0;
size_t totsize = 0;
729 for(
size_t ijk = 0; ijk < NumberOfComponents; ijk++)
731 size_t Atom_ijk = System[ijk].size;
733 if(atom_i >= totsize)
741 size_t HalfAllocateSize = System[comp].Allocate_size / 2;
742 atom_i += HalfAllocateSize;
745 pos = System[comp].pos[atom_i];
746 eik_x[i + 0 * numberOfAtoms] = tempcomplex;
747 eik_y[i + 0 * numberOfAtoms] = tempcomplex;
748 eik_z[i + 0 * numberOfAtoms] = tempcomplex;
749 double3 s; matrix_multiply_by_vector(Box.InverseCell, pos, s); s*=2*M_PI;
750 tempcomplex.real = std::cos(s.x); tempcomplex.imag = std::sin(s.x); eik_x[i + 1 * numberOfAtoms] = tempcomplex;
751 tempcomplex.real = std::cos(s.y); tempcomplex.imag = std::sin(s.y); eik_y[i + 1 * numberOfAtoms] = tempcomplex;
752 tempcomplex.real = std::cos(s.z); tempcomplex.imag = std::sin(s.z); eik_z[i + 1 * numberOfAtoms] = tempcomplex;
754 for(
size_t kx = 2; kx <= Box.kmax.x; ++kx)
756 eik_x[i + kx * numberOfAtoms] = multiply(eik_x[i + (kx - 1) * numberOfAtoms], eik_x[i + 1 * numberOfAtoms]);
760 for(
size_t ky = 2; ky <= Box.kmax.y; ++ky)
762 eik_y[i + ky * numberOfAtoms] = multiply(eik_y[i + (ky - 1) * numberOfAtoms], eik_y[i + 1 * numberOfAtoms]);
765 for(
size_t kz = 2; kz <= Box.kmax.z; ++kz)
767 eik_z[i + kz * numberOfAtoms] = multiply(eik_z[i + (kz - 1) * numberOfAtoms], eik_z[i + 1 * numberOfAtoms]);
774__global__
void TotalEwald(
Atoms* d_a,
Boxsize Box,
double* BlockSum,
Complex* eik_x,
Complex* eik_y,
Complex* eik_z,
Complex* Eik,
size_t totAtom,
size_t Ncomp,
size_t NAtomPerThread,
size_t residueAtoms)
776 __shared__ double3 sdata[128];
777 int cache_id = threadIdx.x;
778 size_t total_ij = blockIdx.x * blockDim.x + threadIdx.x;
779 size_t ij_within_block = total_ij - blockIdx.x * blockDim.x;
781 size_t kxyz = blockIdx.x;
782 size_t kx_max = Box.kmax.x;
783 size_t ky_max = Box.kmax.y;
784 size_t kz_max = Box.kmax.z;
786 int kz = kxyz%(2 * kz_max + 1) - kz_max;
787 int kxy = kxyz/(2 * kz_max + 1);
788 int kx = kxy /(2 * ky_max + 1);
789 int ky = kxy %(2 * ky_max + 1) - ky_max;
790 double ksqr =
static_cast<double>(kx * kx + ky * ky + kz * kz);
792 double alpha = Box.Alpha;
double alpha_squared = alpha * alpha;
793 double prefactor = Box.Prefactor * (2.0 * M_PI / Box.Volume);
795 double3 ax = {Box.InverseCell[0], Box.InverseCell[3], Box.InverseCell[6]};
796 double3 ay = {Box.InverseCell[1], Box.InverseCell[4], Box.InverseCell[7]};
797 double3 az = {Box.InverseCell[2], Box.InverseCell[5], Box.InverseCell[8]};
798 double3 kvec_x = ax * 2.0 * M_PI * (double) kx;
799 double3 kvec_y = ay * 2.0 * M_PI * (double) ky;
800 double3 kvec_z = az * 2.0 * M_PI * (double) kz;
801 double factor = (kx == 0) ? (1.0 * prefactor) : (2.0 * prefactor);
803 double3 tempkvec = kvec_x + kvec_y + kvec_z;
804 double rksq = dot(tempkvec, tempkvec);
807 if(Box.UseLAMMPSEwald)
809 const double lx = Box.Cell[0];
810 const double ly = Box.Cell[4];
811 const double lz = Box.Cell[8];
812 const double xy = Box.Cell[3];
813 const double xz = Box.Cell[6];
814 const double yz = Box.Cell[7];
815 const double ux = 2*M_PI/lx;
816 const double uy = 2*M_PI*(-xy)/lx/ly;
817 const double uz = 2*M_PI*(xy*yz - ly*xz)/lx/ly/lz;
818 const double vy = 2*M_PI/ly;
819 const double vz = 2*M_PI*(-yz)/ly/lz;
820 const double wz = 2*M_PI/lz;
821 const double kvecx = kx*ux;
822 const double kvecy = kx*uy + ky*vy;
823 const double kvecz = kx*uz + ky*vz + kz*wz;
824 ksqr = kvecx*kvecx + kvecy*kvecy + kvecz*kvecz;
826 Complex cksum; cksum.real = 0.0; cksum.imag = 0.0;
827 if((ksqr > 1e-10) && (ksqr < Box.ReciprocalCutOff))
829 temp = factor * std::exp((-0.25 / alpha_squared) * rksq) / rksq;
830 for(
size_t a = 0; a < NAtomPerThread; a++)
832 size_t Atom = a + NAtomPerThread * ij_within_block;
833 size_t AbsoluteAtom = Atom;
size_t comp = 0;
size_t totsize = 0;
834 for(
size_t it_comp = 0; it_comp < Ncomp; it_comp++)
836 size_t Atom_ijk = d_a[it_comp].size;
841 Atom -= d_a[it_comp].size;
844 Complex eik_temp = eik_y[AbsoluteAtom + totAtom *
static_cast<size_t>(std::abs(ky))];
845 eik_temp.imag = ky>=0 ? eik_temp.imag : -eik_temp.imag;
846 Complex eik_xy = multiply(eik_x[AbsoluteAtom + totAtom *
static_cast<size_t>(kx)], eik_temp);
847 eik_temp = eik_z[AbsoluteAtom + totAtom *
static_cast<size_t>(std::abs(kz))];
848 eik_temp.imag = kz>=0 ? eik_temp.imag : -eik_temp.imag;
850 double charge = d_a[comp].charge[Atom];
851 double scaling = d_a[comp].scaleCoul[Atom];
852 Complex tempi = multiply(eik_xy, eik_temp);
853 cksum.real += scaling * charge * tempi.real;
854 cksum.imag += scaling * charge * tempi.imag;
858 if(ij_within_block < residueAtoms)
862 size_t Atom = blockDim.x * NAtomPerThread + ij_within_block;
863 size_t AbsoluteAtom = Atom;
size_t comp = 0;
size_t totsize = 0;
864 for(
size_t it_comp = 0; it_comp < Ncomp; it_comp++)
866 size_t Atom_ijk = d_a[it_comp].size;
871 Atom -= d_a[it_comp].size;
874 Complex eik_temp = eik_y[AbsoluteAtom + totAtom *
static_cast<size_t>(std::abs(ky))];
875 eik_temp.imag = ky>=0 ? eik_temp.imag : -eik_temp.imag;
876 Complex eik_xy = multiply(eik_x[AbsoluteAtom + totAtom *
static_cast<size_t>(kx)], eik_temp);
877 eik_temp = eik_z[AbsoluteAtom + totAtom *
static_cast<size_t>(std::abs(kz))];
878 eik_temp.imag = kz>=0 ? eik_temp.imag : -eik_temp.imag;
880 double charge = d_a[comp].charge[Atom];
881 double scaling = d_a[comp].scaleCoul[Atom];
882 Complex tempi = multiply(eik_xy, eik_temp);
883 cksum.real += scaling * charge * tempi.real;
884 cksum.imag += scaling * charge * tempi.imag;
889 sdata[ij_within_block].x = cksum.real;
890 sdata[ij_within_block].y = cksum.imag;
893 int i=blockDim.x / 2;
898 sdata[cache_id].x += sdata[cache_id + i].x;
899 sdata[cache_id].y += sdata[cache_id + i].y;
906 Complex cksum; cksum.real = sdata[0].x; cksum.imag = sdata[0].y;
907 double NORM = ComplexNorm(cksum);
908 BlockSum[blockIdx.x] = temp * NORM;
909 Eik[blockIdx.x].real = sdata[0].x;
910 Eik[blockIdx.x].imag = sdata[0].y;
914__global__
double Calculate_Intra_Self_Exclusion(
Boxsize Box,
Atoms* System,
double* BlockSum,
size_t SelectedComponent)
916 double E = 0.0;
double prefactor_self = Box.Prefactor * Box.Alpha / std::sqrt(M_PI);
917 extern __shared__
double sdata[];
918 sdata[threadIdx.x] = 0.0;
int cache_id = threadIdx.x;
925 size_t THREADIdx = blockIdx.x * blockDim.x + threadIdx.x;
926 size_t Molsize = System[SelectedComponent].Molsize;
927 size_t AtomIdx = THREADIdx * Molsize;
929 size_t totalmol = System[SelectedComponent].size / Molsize;
930 if(AtomIdx < System[SelectedComponent].size)
933 for(
size_t i = AtomIdx; i != Molsize + AtomIdx; i++)
935 double factorA = System[SelectedComponent].scaleCoul[i] * System[SelectedComponent].charge[i];
936 double3 posA = System[SelectedComponent].pos[i];
938 for(
size_t j = i; j != Molsize + AtomIdx; j++)
943 E += prefactor_self * factorA * factorA;
949 double factorB = System[SelectedComponent].scaleCoul[j] * System[SelectedComponent].charge[j];
950 double3 posB = System[SelectedComponent].pos[j];
951 double3 posvec = posA - posB;
952 PBC(posvec, Box.Cell, Box.InverseCell, Box.Cubic);
953 double rr_dot = dot(posvec, posvec);
954 double r = std::sqrt(rr_dot);
955 E += Box.Prefactor * factorA * factorB * std::erf(Box.Alpha * r) / r;
962 sdata[threadIdx.x] = -E;
965 int i=blockDim.x / 2;
968 if(cache_id < i) {sdata[cache_id] += sdata[cache_id + i];}
974 BlockSum[blockIdx.x] += sdata[0];
981 size_t NTotalAtom = 0;
983 size_t Nthread = 128;
988 Atoms* d_a = Sim.d_a;
990 for(
size_t i = 0; i < SystemComponents.Total_Components; i++)
991 NTotalAtom += SystemComponents.NumberOfMolecule_for_Component[i] * SystemComponents.Moleculesize[i];
994 size_t NAtomPerThread = NTotalAtom / Nthread;
995 size_t residueAtoms = NTotalAtom % Nthread;
997 Setup_threadblock(NTotalAtom, &Nblock, &Nthread);
999 Complex* eikx; cudaMalloc(&eikx, NTotalAtom * (Box.kmax.x + 1) *
sizeof(
Complex));
1000 Complex* eiky; cudaMalloc(&eiky, NTotalAtom * (Box.kmax.y + 1) *
sizeof(
Complex));
1001 Complex* eikz; cudaMalloc(&eikz, NTotalAtom * (Box.kmax.z + 1) *
sizeof(
Complex));
1002 Setup_Ewald_Vector<<<Nblock, Nthread>>>(Box, eikx, eiky, eikz, d_a, NTotalAtom, SystemComponents.Total_Components, UseOffSet);
1004 Nblock = (Box.kmax.x + 1) * (2 * Box.kmax.y + 1) * (2 * Box.kmax.z + 1);
1006 if(Nblock > Sim.Nblocks)
1008 printf(
"Need to Allocate more space for blocksum\n");
1009 cudaMalloc(&Sim.Blocksum, Nblock *
sizeof(
double));
1015 TotalEwald<<<Nblock, Nthread, Nthread *
sizeof(double)>>>(d_a, Box, Sim.Blocksum, eikx, eiky, eikz, Box.tempEik, NTotalAtom, SystemComponents.Total_Components, NAtomPerThread, residueAtoms);
1016 cudaFree(eikx); cudaFree(eiky); cudaFree(eikz);
1018 double HostTotEwald[Nblock];
1041 double GGFourier = 0.0; cudaMemcpy(HostTotEwald, Sim.Blocksum, Nblock *
sizeof(
double), cudaMemcpyDeviceToHost);
1051 size_t maxblock = Nblock;
1052 for(
size_t i = SystemComponents.NComponents.y; i < SystemComponents.NComponents.x; i++)
1054 if(SystemComponents.NumberOfMolecule_for_Component[i] == 0)
continue;
1055 size_t Exclusion_Nblock = 0;
size_t Exclusion_Nthread = 0;
1056 Setup_threadblock(SystemComponents.NumberOfMolecule_for_Component[i], &Exclusion_Nblock, &Exclusion_Nthread);
1057 printf(
"Component %zu, Nblock: %zu, Nthread: %zu\n", i, Exclusion_Nblock, Exclusion_Nthread);
1058 Calculate_Intra_Self_Exclusion<<<Exclusion_Nblock, Exclusion_Nthread, Exclusion_Nthread *
sizeof(double)>>>(Box, d_a, Sim.Blocksum, i); checkCUDAErrorEwald(
"error Calculating Intra Self Exclusion for Ewald Summation for Total Ewald summation!!!");
1064 cudaMemcpy(HostTotEwald, Sim.Blocksum, maxblock *
sizeof(
double), cudaMemcpyDeviceToHost);
1065 for(
size_t i = 0; i < maxblock; i++) E.GGEwaldE += HostTotEwald[i];
1066 printf(
"GPU Ewald: %.5f\n", E.GGEwaldE);
Definition data_struct.h:746
Definition data_struct.h:819
Definition data_struct.h:46
Definition data_struct.h:843
Definition data_struct.h:794
Definition data_struct.h:615
Definition data_struct.h:1027