// ==========================================================================
// NOTE(review): This chunk appears to be a corrupted copy of a CUDA/OpenCL
// PPPM (particle-particle particle-mesh) device-kernel file from the LAMMPS
// Accelerator Library (see the banner below: "Device code for PPPM
// acceleration").  Two distinct kinds of damage are visible:
//
//   1. The original multi-line file has been collapsed onto two physical
//      lines.  Because the first physical line begins with a "//" banner
//      comment, every token that follows it on that line -- the NV_KERNEL /
//      OpenCL preprocessor block and the entire particle_map kernel
//      signature and body start -- is now inside a line comment and is dead
//      text to any compiler.
//
//   2. Spans of text that sat between '<' and '>' appear to have been
//      stripped out (consistent with an HTML-tag remover).  Evidence:
//      "texture pos_tex;" has lost the texture's template arguments; the
//      particle_map body jumps from "if" directly into
//      "(ii=nlocal_x || ny>=nlocal_y ...)", i.e. the text between "if (ii"
//      (presumably "< nlocal" plus an unknown amount of body) and a later
//      ">=" comparison is gone; make_rho loop headers such as
//      "for (int n=0; n=n; k-=order)" are likewise missing their bound
//      expressions, as is "if (tid=nlocal_y)".
//
// Because arbitrary amounts of code were DELETED (not merely reformatted),
// no in-place repair can be verified from this chunk alone.  Recommend
// restoring this file from the upstream LAMMPS source tree rather than
// hand-patching the text below.
//
// What the surviving text does show:
//   * particle_map: bins atoms into grid cells.  On a bad cell it sets
//     *error=1; when a cell already holds max_atoms entries it sets
//     *error=2 and undoes its increment via atom_add(counts+i,-1);
//     otherwise it stores the grdtyp4 "delta" (fractional offsets in
//     .x/.y/.z, charge weight in .w) at ans[atom_stride*old+i].  The cell
//     index is i = nz*nlocal_y*nlocal_x + ny*nlocal_x + nx.  The thread
//     index is resequenced with fast_mul/PPPM_BLOCK_1D, per the surviving
//     comment, to reduce atomic-op collisions on counts.
//   * make_rho: spreads binned charge onto the density "brick" using
//     spline coefficients staged in __local rho_coeff, organized as
//     pencils (PENCIL_SIZE = MEM_THREADS threads per pencil, BLOCK_PENCILS
//     = PPPM_BLOCK_1D/PENCIL_SIZE pencils per block), with __syncthreads()
//     barriers separating the accumulation phases.
//   * A third kernel at the end of the chunk -- apparently field/force
//     interpolation, judging from the rho1d_0/rho1d_1/rho1d_2 spline
//     evaluations against dx/dy/dz -- is cut off mid-statement
//     ("for (int m=0; m") and continues beyond this chunk; it is left
//     untouched here.
// ==========================================================================
// ************************************************************************** // pppm.cu // ------------------- // W. Michael Brown (ORNL) // // Device code for PPPM acceleration // // __________________________________________________________________________ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // // begin : // email : brownw@ornl.gov // ***************************************************************************/ #ifdef NV_KERNEL #include "lal_preprocessor.h" #ifndef _DOUBLE_DOUBLE texture pos_tex; texture q_tex; #else texture pos_tex; texture q_tex; #endif // Allow PPPM to compile without atomics for NVIDIA 1.0 cards, error // generated at runtime with use of pppm/gpu #if (__CUDA_ARCH__ < 110) #define atomicAdd(x,y) *(x)+=0 #endif #else #define pos_tex x_ #define q_tex q_ #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics: enable #if defined(cl_amd_fp64) #pragma OPENCL EXTENSION cl_amd_fp64 : enable #else #pragma OPENCL EXTENSION cl_khr_fp64 : enable #endif #endif // Number of threads per pencil for charge spread #define PENCIL_SIZE MEM_THREADS // Number of pencils per block for charge spread #define BLOCK_PENCILS (PPPM_BLOCK_1D/PENCIL_SIZE) __kernel void particle_map(const __global numtyp4 *restrict x_, const __global numtyp *restrict q_, const grdtyp delvolinv, const int nlocal, __global int *restrict counts, __global grdtyp4 *restrict ans, const grdtyp b_lo_x, const grdtyp b_lo_y, const grdtyp b_lo_z, const grdtyp delxinv, const grdtyp delyinv, const grdtyp delzinv, const int nlocal_x, const int nlocal_y, const int nlocal_z, const int atom_stride, const int max_atoms, __global int *restrict error) { // ii indexes the two interacting particles in gi int ii=GLOBAL_ID_X; // Resequence the atom indices to avoid collisions during atomic ops int nthreads=GLOBAL_SIZE_X; ii=fast_mul(ii,PPPM_BLOCK_1D); ii-=(ii/nthreads)*(nthreads-1); int nx,ny,nz; if 
(ii=nlocal_x || ny>=nlocal_y || nz>=nlocal_z) *error=1; else { delta.x=nx+(grdtyp)0.5-delta.x; delta.y=ny+(grdtyp)0.5-delta.y; delta.z=nz+(grdtyp)0.5-delta.z; int i=nz*nlocal_y*nlocal_x+ny*nlocal_x+nx; int old=atom_add(counts+i, 1); if (old>=max_atoms) { *error=2; atom_add(counts+i, -1); } else ans[atom_stride*old+i]=delta; } } } } /* --------------------------- */ __kernel void make_rho(const __global int *restrict counts, const __global grdtyp4 *restrict atoms, __global grdtyp *restrict brick, const __global grdtyp *restrict _rho_coeff, const int atom_stride, const int npts_x, const int npts_y, const int npts_z, const int nlocal_x, const int nlocal_y, const int nlocal_z, const int order_m_1, const int order, const int order2) { __local grdtyp rho_coeff[PPPM_MAX_SPLINE*PPPM_MAX_SPLINE]; __local grdtyp front[BLOCK_PENCILS][PENCIL_SIZE+PPPM_MAX_SPLINE]; __local grdtyp ans[PPPM_MAX_SPLINE][PPPM_BLOCK_1D]; int tid=THREAD_ID_X; if (tid=nlocal_y) y_stop-=ny-nlocal_y+1; if (nz>=nlocal_z) z_stop-=nz-nlocal_z+1; int z_stride=fast_mul(nlocal_x,nlocal_y); int loop_count=npts_x/PENCIL_SIZE+1; int nx=fid; int pt=fast_mul(nz,fast_mul(npts_y,npts_x))+fast_mul(ny,npts_x)+nx; for (int i=0 ; i -1; k-=order) { rho1d_1=rho_coeff[k-l]+rho1d_1*delta.y; rho1d_2=rho_coeff[k-m]+rho1d_2*delta.z; } delta.w*=rho1d_1*rho1d_2; for (int n=0; n=n; k-=order) rho1d_0=rho_coeff[k]+rho1d_0*delta.x; ans[n][tid]+=delta.w*rho1d_0; } } y_pos+=nlocal_x; } z_pos+=z_stride; } } __syncthreads(); if (fid=k; l-=order) { rho1d_0[k][tid]=rho_coeff[l]+rho1d_0[k][tid]*dx; rho1d_1[k][tid]=rho_coeff[l]+rho1d_1[k][tid]*dy; } } int mz=fast_mul(nz,npts_yx)+nx; for (int n=0; n=n; k-=order) rho1d_2=rho_coeff[k]+rho1d_2*dz; grdtyp z0=qs*rho1d_2; int my=mz+fast_mul(ny,npts_x); for (int m=0; m