diff --git a/src/cudafuncs.cu b/src/cudafuncs.cu index 5b9e809..a040c35 100644 --- a/src/cudafuncs.cu +++ b/src/cudafuncs.cu @@ -6,22 +6,23 @@ /* this is an explicit definition for atomicAdd, to be safe */ __device__ double atomicAdd2(double* address, double val) { - unsigned long long int* address_as_ull = (unsigned long long int*)address; - unsigned long long int old = *address_as_ull, assumed; - do { assumed = old; - old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) - } - while (assumed != old); - return __longlong_as_double(old); + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } + while (assumed != old); + return __longlong_as_double(old); } // minimal data to send to GPU. this is all that's needed to calc forces. typedef struct atom_t { - double pos[3]={0,0,0}; + double pos[3]= {0,0,0}; double eps=0; // lj double sig=0; // lj double charge=0; - double f[3]={0,0,0}; // force + double f[3]= {0,0,0}; // force int molid=0; int frozen=0; double u[3] = {0,0,0}; // dipole @@ -35,7 +36,7 @@ void calculateForceKernel(cuda_atom * atom_list, int N, double cutoffD, double * int i = threadIdx.x + blockDim.x * blockIdx.x; // only run for real atoms (no ghost threads) - if(i kmax*kmax) continue; - /* get reciprocal lattice vectors */ - for (p=0; p<3; p++) { - for (q=0, k[p] = 0; q < 3; q++) { - k[p] += 2.0*M_PI*reciprocal_basis[3*q+p] * l[q]; - } - } - k_sq = k[0]*k[0] + k[1]*k[1] + k[2]*k[2]; - - holder = chargeprod * invV * fourPI * k[n] * - exp(-k_sq/(4*alpha*alpha))* - sin(k[0]*dimg[0] + k[1]*dimg[1] + k[2]*dimg[2])/k_sq * 2; // times 2 b/c half-Ewald sphere + // real-space + if (rimg <= cutoff && (anchoratom.molid < atom_list[j].molid)) { // non-duplicated pairs, not intramolecular, not beyond cutoff + chargeprod = anchoratom.charge * atom_list[j].charge; + for (n=0; n<3; n++) u[n] = dimg[n]/rimg; + for (n=0; n<3; n++) { + holder = -((-2.0*chargeprod*alpha*exp(-alpha*alpha*rsq))/(sqrtPI*rimg) - (chargeprod*erfc(alpha*rimg)/rsq))*u[n]; af[n] += holder; atomicAdd2(&(atom_list[j].f[n]), -holder); + } + } + // k-space + if (kspace && (anchoratom.molid < atom_list[j].molid)) { + chargeprod = anchoratom.charge * atom_list[j].charge; + + for (n=0; n<3; n++) { + for (l[0] = 0; l[0] <= kmax; l[0]++) { + for (l[1] = (!l[0] ? 0 : -kmax); l[1] <= kmax; l[1]++) { + for (l[2] = ((!l[0] && !l[1]) ? 1 : -kmax); l[2] <= kmax; l[2]++) { + // skip if norm is out of sphere + if (l[0]*l[0] + l[1]*l[1] + l[2]*l[2] > kmax*kmax) continue; + /* get reciprocal lattice vectors */ + for (p=0; p<3; p++) { + for (q=0, k[p] = 0; q < 3; q++) { + k[p] += 2.0*M_PI*reciprocal_basis[3*q+p] * l[q]; + } + } + k_sq = k[0]*k[0] + k[1]*k[1] + k[2]*k[2]; + + holder = chargeprod * invV * fourPI * k[n] * + exp(-k_sq/(4*alpha*alpha))* + sin(k[0]*dimg[0] + k[1]*dimg[1] + k[2]*dimg[2])/k_sq * 2; // times 2 b/c half-Ewald sphere + + af[n] += holder; + atomicAdd2(&(atom_list[j].f[n]), -holder); + + } // end for l[2], n + } // end for l[1], m + } // end for l[0], l + } // end 3d + } - } // end for l[2], n - } // end for l[1], m - } // end for l[0], l - } // end 3d - } - - } // end pair loop j + } // end pair loop j // finally add ES contribution to anchor-atom - for (n=0;n<3;n++) atomicAdd2(&(atom_list[i].f[n]), af[n]); + for (n=0; n<3; n++) atomicAdd2(&(atom_list[i].f[n]), af[n]); } // end ES component // ============================================================ // Polarization @@ -225,45 +229,48 @@ void calculateForceKernel(cuda_atom * atom_list, int N, double cutoffD, double * double u_j[3]; // loop all pair atoms for (int j=i+1; j cutoff) continue; // skip outside cutoff r = rimg; - x = dimg[0]; y = dimg[1]; z = dimg[2]; + x = dimg[0]; + y = dimg[1]; + z = dimg[2]; x2 = x*x; y2 = y*y; z2 = z*z; @@ -272,18 +279,18 @@ void calculateForceKernel(cuda_atom * atom_list, int N, double cutoffD, double * rinv = 1./r; r2inv = rinv*rinv; r3inv = r2inv*rinv; - for (n=0;n<3;n++) u_j[n] = atom_list[j].u[n]; + for (n=0; n<3; n++) u_j[n] = atom_list[j].u[n]; // (1) u_i -- q_j if (atom_list[j].charge != 0 && anchoratom.polar != 0) { common_factor = atom_list[j].charge * r3inv; - af[0] += common_factor*((u_i[0]*(r2inv*(-2*x2 + y2 + z2) - cc2inv*(y2 + z2))) + (u_i[1]*(r2inv*(-3*x*y) + cc2inv*x*y)) + (u_i[2]*(r2inv*(-3*x*z) + cc2inv*x*z))); + af[0] += common_factor*((u_i[0]*(r2inv*(-2*x2 + y2 + z2) - cc2inv*(y2 + z2))) + (u_i[1]*(r2inv*(-3*x*y) + cc2inv*x*y)) + (u_i[2]*(r2inv*(-3*x*z) + cc2inv*x*z))); + + af[1] += common_factor*(u_i[0]*(r2inv*(-3*x*y) + cc2inv*x*y) + u_i[1]*(r2inv*(-2*y2 + x2 + z2) - cc2inv*(x2 + z2)) + u_i[2]*(r2inv*(-3*y*z) + cc2inv*y*z)); - af[1] += common_factor*(u_i[0]*(r2inv*(-3*x*y) + cc2inv*x*y) + u_i[1]*(r2inv*(-2*y2 + x2 + z2) - cc2inv*(x2 + z2)) + u_i[2]*(r2inv*(-3*y*z) + cc2inv*y*z)); + af[2] += common_factor*(u_i[0]*(r2inv*(-3*x*z) + cc2inv*x*z) + u_i[1]*(r2inv*(-3*y*z) + cc2inv*y*z) + u_i[2]*(r2inv*(-2*z2 + x2 + y2) - cc2inv*(x2 + y2))); - af[2] += common_factor*(u_i[0]*(r2inv*(-3*x*z) + cc2inv*x*z) + u_i[1]*(r2inv*(-3*y*z) + cc2inv*y*z) + u_i[2]*(r2inv*(-2*z2 + x2 + y2) - cc2inv*(x2 + y2))); - } // (2) u_j -- q_i @@ -303,7 +310,7 @@ void calculateForceKernel(cuda_atom * atom_list, int N, double cutoffD, double * r7inv = r5inv*r2inv; udotu = u_i[0]*u_j[0] + u_i[1]*u_j[1] + u_i[2]*u_j[2]; uidotr = u_i[0]*dimg[0] + u_i[1]*dimg[1] + u_i[2]*dimg[2]; - ujdotr = u_j[0]*dimg[0] + u_j[1]*dimg[1] + u_j[2]*dimg[2]; + ujdotr = u_j[0]*dimg[0] + u_j[1]*dimg[1] + u_j[2]*dimg[2]; t1 = exp(-damp*r); t2 = 1. + damp*r + 0.5*damp*damp*r2; @@ -320,12 +327,12 @@ void calculateForceKernel(cuda_atom * atom_list, int N, double cutoffD, double * } // apply Newton for pair. - for (n=0;n<3;n++) { + for (n=0; n<3; n++) { atomicAdd2(&(atom_list[i].f[n]), af[n]); - atomicAdd2(&(atom_list[j].f[n]), -af[n]); + atomicAdd2(&(atom_list[j].f[n]), -af[n]); } - } // end pair loop with atoms j + } // end pair loop with atoms j } // end polarization forces } // end if i