// ************************************************************************** // re_squared.cu // ------------------- // W. Michael Brown // // Device code for RE-Squared potential acceleration // // __________________________________________________________________________ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // // begin : Fri May 06 2011 // email : brownw@ornl.gov // ***************************************************************************/ #ifdef NV_KERNEL #include "lal_ellipsoid_extra.h" #endif ucl_inline numtyp det_prime(const numtyp m[9], const numtyp m2[9]) { numtyp ans; ans = m2[0]*m[4]*m[8] - m2[0]*m[5]*m[7] - m[3]*m2[1]*m[8] + m[3]*m2[2]*m[7] + m[6]*m2[1]*m[5] - m[6]*m2[2]*m[4] + m[0]*m2[4]*m[8] - m[0]*m2[5]*m[7] - m2[3]*m[1]*m[8] + m2[3]*m[2]*m[7] + m[6]*m[1]*m2[5] - m[6]*m[2]*m2[4] + m[0]*m[4]*m2[8] - m[0]*m[5]*m2[7] - m[3]*m[1]*m2[8] + m[3]*m[2]*m2[7] + m2[6]*m[1]*m[5] - m2[6]*m[2]*m[4]; return ans; } __kernel void k_resquared(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict q, const __global numtyp4 *restrict shape, const __global numtyp4 *restrict well, const __global numtyp *restrict splj, const __global numtyp2 *restrict sig_eps, const int ntypes, const __global int *dev_nbor, const int stride, __global acctyp4 *restrict ans, const int astride, __global acctyp *restrict engv, __global int *restrict err_flag, const int eflag, const int vflag, const int inum, const int t_per_atom) { int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; sp_lj[0]=splj[0]; sp_lj[1]=splj[1]; sp_lj[2]=splj[2]; sp_lj[3]=splj[3]; __local numtyp b_alpha, cr60; b_alpha=(numtyp)45.0/(numtyp)56.0; cr60=ucl_cbrt((numtyp)60.0); acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp4 tor; tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0; acctyp virial[6]; for (int i=0; i<6; i++) virial[i]=(acctyp)0; if (iibody) numtyp aTe1[9]; // A'*E numtyp gamma1[9]; // A'*S^2*A numtyp sa1[9]; // S^2*A; numtyp lA1_0[9], lA1_1[9], lA1_2[9]; // -A*rotation generator (x,y, or z) numtyp lAtwo1_0[9], lAtwo1_1[9], lAtwo1_2[9]; // A'*S^2*lA numtyp lAsa1_0[9], lAsa1_1[9], lAsa1_2[9]; // lAtwo+lA'*sa numtyp4 ishape; ishape=shape[itype]; numtyp4 ishape2; ishape2.x=ishape.x*ishape.x; ishape2.y=ishape.y*ishape.y; ishape2.z=ishape.z*ishape.z; numtyp ilshape = ishape.x*ishape.y*ishape.z; { numtyp aTs[9]; // A1'*S1^2 gpu_quat_to_mat_trans(q,i,a1); gpu_transpose_times_diag3(a1,well[itype],aTe1); gpu_transpose_times_diag3(a1,ishape2,aTs); gpu_diag_times3(ishape2,a1,sa1); gpu_times3(aTs,a1,gamma1); gpu_rotation_generator_x(a1,lA1_0); gpu_rotation_generator_y(a1,lA1_1); gpu_rotation_generator_z(a1,lA1_2); gpu_times3(aTs,lA1_0,lAtwo1_0); gpu_transpose_times3(lA1_0,sa1,lAsa1_0); gpu_plus3(lAsa1_0,lAtwo1_0,lAsa1_0); gpu_times3(aTs,lA1_1,lAtwo1_1); gpu_transpose_times3(lA1_1,sa1,lAsa1_1); gpu_plus3(lAsa1_1,lAtwo1_1,lAsa1_1); gpu_times3(aTs,lA1_2,lAtwo1_2); gpu_transpose_times3(lA1_2,sa1,lAsa1_2); gpu_plus3(lAsa1_2,lAtwo1_2,lAsa1_2); } ishape2.x=ucl_recip(ishape2.x); ishape2.y=ucl_recip(ishape2.y); ishape2.z=ucl_recip(ishape2.z); numtyp factor_lj; for ( ; nborbody) numtyp gamma2[9]; // A'*S^2*A numtyp4 jshape; jshape=shape[jtype]; numtyp4 jshape2; jshape2.x=jshape.x*jshape.x; jshape2.y=jshape.y*jshape.y; jshape2.z=jshape.z*jshape.z; { numtyp aTs[9]; // A1'*S1^2 gpu_quat_to_mat_trans(q,j,a2); gpu_transpose_times_diag3(a2,jshape2,aTs); gpu_times3(aTs,a2,gamma2); } numtyp temp[9], s[3], z1[3], z2[3], v1[3], v2[3]; numtyp sigma12, sigma1, sigma2; gpu_plus3(gamma1,gamma2,temp); gpu_mldivide3(temp,rhat,s,err_flag); sigma12 = ucl_rsqrt((numtyp)0.5*gpu_dot3(s,rhat)); gpu_times_column3(a1,rhat,z1); gpu_times_column3(a2,rhat,z2); v1[0] = z1[0]*ishape2.x; v1[1] = z1[1]*ishape2.y; v1[2] = z1[2]*ishape2.z; v2[0] = z2[0]/jshape2.x; v2[1] = z2[1]/jshape2.y; v2[2] = z2[2]/jshape2.z; sigma1 = ucl_sqrt(gpu_dot3(z1,v1)); sigma2 = ucl_sqrt(gpu_dot3(z2,v2)); numtyp H12[9]; numtyp dH; H12[0] = gamma1[0]*sigma1+gamma2[0]*sigma2; H12[1] = gamma1[1]*sigma1+gamma2[1]*sigma2; H12[2] = gamma1[2]*sigma1+gamma2[2]*sigma2; H12[3] = gamma1[3]*sigma1+gamma2[3]*sigma2; H12[4] = gamma1[4]*sigma1+gamma2[4]*sigma2; H12[5] = gamma1[5]*sigma1+gamma2[5]*sigma2; H12[6] = gamma1[6]*sigma1+gamma2[6]*sigma2; H12[7] = gamma1[7]*sigma1+gamma2[7]*sigma2; H12[8] = gamma1[8]*sigma1+gamma2[8]*sigma2; dH=gpu_det3(H12); numtyp sigma1p2, sigma2p2, lambda, nu; sigma1p2 = sigma1*sigma1; sigma2p2 = sigma2*sigma2; numtyp jlshape = jshape.x*jshape.y*jshape.z; lambda = ilshape*sigma1p2 + jlshape*sigma2p2; sigma1=ucl_recip(sigma1); sigma2=ucl_recip(sigma2); nu = ucl_rsqrt((sigma1+sigma2)/dH); gpu_times3(aTe1,a1,temp); numtyp sigma, epsilon; int mtype=fast_mul(ntypes,itype)+jtype; sigma = sig_eps[mtype].x; epsilon = sig_eps[mtype].y*factor_lj; numtyp w[3], temp2[9]; numtyp h12,eta,chi,sprod,sigh,tprod; numtyp aTe2[9]; // A'*E gpu_transpose_times_diag3(a2,well[jtype],aTe2); gpu_times3(aTe2,a2,temp2); gpu_plus3(temp,temp2,temp); gpu_mldivide3(temp,rhat,w,err_flag); h12 = ucl_recip(rnorm)-sigma12; eta = lambda/nu; chi = (numtyp)2.0*gpu_dot3(rhat,w); sprod = ilshape * jlshape; sigh = sigma/h12; tprod = eta*chi*sigh; numtyp stemp, Ua; stemp = h12*(numtyp)0.5; Ua = (ishape.x+stemp)*(ishape.y+stemp)* (ishape.z+stemp)*(jshape.x+stemp)* (jshape.y+stemp)*(jshape.z+stemp); Ua = ((numtyp)1.0+(numtyp)3.0*tprod)*sprod/Ua; Ua = epsilon*Ua/(numtyp)-36.0; numtyp Ur; stemp = h12/cr60; Ur = (ishape.x+stemp)*(ishape.y+stemp)* (ishape.z+stemp)*(jshape.x+stemp)* (jshape.y+stemp)*(jshape.z+stemp); Ur = ((numtyp)1.0+b_alpha*tprod)*sprod/Ur; numtyp sigh6=sigh*sigh*sigh; sigh6*=sigh6; Ur = epsilon*Ur*sigh6/(numtyp)2025.0; energy+=Ua+Ur; // force numtyp vsigma1[3], vsigma2[3], gsigma1[9], gsigma2[9]; numtyp sec, sigma12p3, sigma1p3, sigma2p3; sec = sigma*eta*chi; sigma12p3 = sigma12*sigma12*sigma12; sigma1p3 = sigma1/sigma1p2; sigma2p3 = sigma2/sigma2p2; vsigma1[0] = -sigma1p3*v1[0]; vsigma1[1] = -sigma1p3*v1[1]; vsigma1[2] = -sigma1p3*v1[2]; vsigma2[0] = -sigma2p3*v2[0]; vsigma2[1] = -sigma2p3*v2[1]; vsigma2[2] = -sigma2p3*v2[2]; gsigma1[0] = -gamma1[0]*sigma1p2; gsigma1[1] = -gamma1[1]*sigma1p2; gsigma1[2] = -gamma1[2]*sigma1p2; gsigma1[3] = -gamma1[3]*sigma1p2; gsigma1[4] = -gamma1[4]*sigma1p2; gsigma1[5] = -gamma1[5]*sigma1p2; gsigma1[6] = -gamma1[6]*sigma1p2; gsigma1[7] = -gamma1[7]*sigma1p2; gsigma1[8] = -gamma1[8]*sigma1p2; gsigma2[0] = -gamma2[0]*sigma2p2; gsigma2[1] = -gamma2[1]*sigma2p2; gsigma2[2] = -gamma2[2]*sigma2p2; gsigma2[3] = -gamma2[3]*sigma2p2; gsigma2[4] = -gamma2[4]*sigma2p2; gsigma2[5] = -gamma2[5]*sigma2p2; gsigma2[6] = -gamma2[6]*sigma2p2; gsigma2[7] = -gamma2[7]*sigma2p2; gsigma2[8] = -gamma2[8]*sigma2p2; numtyp tsig1sig2, tdH, teta1, teta2; numtyp fourw[3], spr[3]; tsig1sig2 = eta/((numtyp)2.0*(sigma1+sigma2)); tdH = eta/((numtyp)2.0*dH); teta1 = (numtyp)2.0*eta/lambda; teta2 = teta1*jlshape/sigma2p3; teta1 = teta1*ilshape/sigma1p3; fourw[0] = (numtyp)4.0*w[0]; fourw[1] = (numtyp)4.0*w[1]; fourw[2] = (numtyp)4.0*w[2]; spr[0] = (numtyp)0.5*sigma12p3*s[0]; spr[1] = (numtyp)0.5*sigma12p3*s[1]; spr[2] = (numtyp)0.5*sigma12p3*s[2]; numtyp hsec, dspu, pbsu; stemp = ucl_recip(ishape.x*(numtyp)2.0+h12)+ ucl_recip(ishape.y*(numtyp)2.0+h12)+ ucl_recip(ishape.z*(numtyp)2.0+h12)+ ucl_recip(jshape.x*(numtyp)2.0+h12)+ ucl_recip(jshape.y*(numtyp)2.0+h12)+ ucl_recip(jshape.z*(numtyp)2.0+h12); hsec = ucl_recip(h12+(numtyp)3.0*sec); dspu = ucl_recip(h12)-hsec+stemp; pbsu = (numtyp)3.0*sigma*hsec; numtyp dspr, pbsr; stemp = ucl_recip(ishape.x*cr60+h12)+ ucl_recip(ishape.y*cr60+h12)+ ucl_recip(ishape.z*cr60+h12)+ ucl_recip(jshape.x*cr60+h12)+ ucl_recip(jshape.y*cr60+h12)+ ucl_recip(jshape.z*cr60+h12); hsec = ucl_recip(h12+b_alpha*sec); dspr = (numtyp)7.0/h12-hsec+stemp; pbsr = b_alpha*sigma*hsec; numtyp dH12[9]; numtyp dUa, dUr, deta, dchi, ddH, dh12; numtyp dsigma1, dsigma2; #pragma unroll for (int i=0; i<3; i++) { numtyp u[3], u1[3], u2[3]; u[0] = -rhat[i]*rhat[0]; u[1] = -rhat[i]*rhat[1]; u[2] = -rhat[i]*rhat[2]; u[i] += (numtyp)1.0; u[0] *= rnorm; u[1] *= rnorm; u[2] *= rnorm; gpu_times_column3(a1,u,u1); gpu_times_column3(a2,u,u2); dsigma1=gpu_dot3(u1,vsigma1); dsigma2=gpu_dot3(u2,vsigma2); dH12[0] = dsigma1*gsigma1[0]+dsigma2*gsigma2[0]; dH12[1] = dsigma1*gsigma1[1]+dsigma2*gsigma2[1]; dH12[2] = dsigma1*gsigma1[2]+dsigma2*gsigma2[2]; dH12[3] = dsigma1*gsigma1[3]+dsigma2*gsigma2[3]; dH12[4] = dsigma1*gsigma1[4]+dsigma2*gsigma2[4]; dH12[5] = dsigma1*gsigma1[5]+dsigma2*gsigma2[5]; dH12[6] = dsigma1*gsigma1[6]+dsigma2*gsigma2[6]; dH12[7] = dsigma1*gsigma1[7]+dsigma2*gsigma2[7]; dH12[8] = dsigma1*gsigma1[8]+dsigma2*gsigma2[8]; ddH = det_prime(H12,dH12); deta = (dsigma1+dsigma2)*tsig1sig2; deta -= ddH*tdH; deta -= dsigma1*teta1+dsigma2*teta2; dchi = gpu_dot3(u,fourw); dh12 = rhat[i]+gpu_dot3(u,spr); dUa = pbsu*(eta*dchi+deta*chi)-dh12*dspu; dUr = pbsr*(eta*dchi+deta*chi)-dh12*dspr; numtyp force=dUr*Ur+dUa*Ua; if (i==0) { f.x+=force; if (vflag>0) virial[0]+=-r[0]*force; } else if (i==1) { f.y+=force; if (vflag>0) { virial[1]+=-r[1]*force; virial[3]+=-r[0]*force; } } else { f.z+=force; if (vflag>0) { virial[2]+=-r[2]*force; virial[4]+=-r[0]*force; virial[5]+=-r[1]*force; } } } // torque on i sigma1=ucl_recip(sigma1); numtyp fwae[3], p[3]; gpu_row_times3(fourw,aTe1,fwae); { gpu_times_column3(lA1_0,rhat,p); dsigma1 = gpu_dot3(p,vsigma1); dH12[0] = lAsa1_0[0]*sigma1+dsigma1*gsigma1[0]; dH12[1] = lAsa1_0[1]*sigma1+dsigma1*gsigma1[1]; dH12[2] = lAsa1_0[2]*sigma1+dsigma1*gsigma1[2]; dH12[3] = lAsa1_0[3]*sigma1+dsigma1*gsigma1[3]; dH12[4] = lAsa1_0[4]*sigma1+dsigma1*gsigma1[4]; dH12[5] = lAsa1_0[5]*sigma1+dsigma1*gsigma1[5]; dH12[6] = lAsa1_0[6]*sigma1+dsigma1*gsigma1[6]; dH12[7] = lAsa1_0[7]*sigma1+dsigma1*gsigma1[7]; dH12[8] = lAsa1_0[8]*sigma1+dsigma1*gsigma1[8]; ddH = det_prime(H12,dH12); deta = tsig1sig2*dsigma1-tdH*ddH; deta -= teta1*dsigma1; numtyp tempv[3]; gpu_times_column3(lA1_0,w,tempv); dchi = -gpu_dot3(fwae,tempv); gpu_times_column3(lAtwo1_0,spr,tempv); dh12 = -gpu_dot3(s,tempv); dUa = pbsu*(eta*dchi + deta*chi)-dh12*dspu; dUr = pbsr*(eta*dchi + deta*chi)-dh12*dspr; tor.x -= (dUa*Ua+dUr*Ur); } { gpu_times_column3(lA1_1,rhat,p); dsigma1 = gpu_dot3(p,vsigma1); dH12[0] = lAsa1_1[0]*sigma1+dsigma1*gsigma1[0]; dH12[1] = lAsa1_1[1]*sigma1+dsigma1*gsigma1[1]; dH12[2] = lAsa1_1[2]*sigma1+dsigma1*gsigma1[2]; dH12[3] = lAsa1_1[3]*sigma1+dsigma1*gsigma1[3]; dH12[4] = lAsa1_1[4]*sigma1+dsigma1*gsigma1[4]; dH12[5] = lAsa1_1[5]*sigma1+dsigma1*gsigma1[5]; dH12[6] = lAsa1_1[6]*sigma1+dsigma1*gsigma1[6]; dH12[7] = lAsa1_1[7]*sigma1+dsigma1*gsigma1[7]; dH12[8] = lAsa1_1[8]*sigma1+dsigma1*gsigma1[8]; ddH = det_prime(H12,dH12); deta = tsig1sig2*dsigma1-tdH*ddH; deta -= teta1*dsigma1; numtyp tempv[3]; gpu_times_column3(lA1_1,w,tempv); dchi = -gpu_dot3(fwae,tempv); gpu_times_column3(lAtwo1_1,spr,tempv); dh12 = -gpu_dot3(s,tempv); dUa = pbsu*(eta*dchi + deta*chi)-dh12*dspu; dUr = pbsr*(eta*dchi + deta*chi)-dh12*dspr; tor.y -= (dUa*Ua+dUr*Ur); } { gpu_times_column3(lA1_2,rhat,p); dsigma1 = gpu_dot3(p,vsigma1); dH12[0] = lAsa1_2[0]*sigma1+dsigma1*gsigma1[0]; dH12[1] = lAsa1_2[1]*sigma1+dsigma1*gsigma1[1]; dH12[2] = lAsa1_2[2]*sigma1+dsigma1*gsigma1[2]; dH12[3] = lAsa1_2[3]*sigma1+dsigma1*gsigma1[3]; dH12[4] = lAsa1_2[4]*sigma1+dsigma1*gsigma1[4]; dH12[5] = lAsa1_2[5]*sigma1+dsigma1*gsigma1[5]; dH12[6] = lAsa1_2[6]*sigma1+dsigma1*gsigma1[6]; dH12[7] = lAsa1_2[7]*sigma1+dsigma1*gsigma1[7]; dH12[8] = lAsa1_2[8]*sigma1+dsigma1*gsigma1[8]; ddH = det_prime(H12,dH12); deta = tsig1sig2*dsigma1-tdH*ddH; deta -= teta1*dsigma1; numtyp tempv[3]; gpu_times_column3(lA1_2,w,tempv); dchi = -gpu_dot3(fwae,tempv); gpu_times_column3(lAtwo1_2,spr,tempv); dh12 = -gpu_dot3(s,tempv); dUa = pbsu*(eta*dchi + deta*chi)-dh12*dspu; dUr = pbsr*(eta*dchi + deta*chi)-dh12*dspr; tor.z -= (dUa*Ua+dUr*Ur); } } // for nbor store_answers_t(f,tor,energy,virial,ii,astride,tid,t_per_atom,offset,eflag, vflag,ans,engv); } // if ii }