// **************************************************************************
// neighbor_gpu.cu
// -------------------
// Peng Wang (Nvidia)
// W. Michael Brown (ORNL)
//
// Device code for handling GPU generated neighbor lists
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : penwang@nvidia.com, brownw@ornl.gov
// ***************************************************************************/

// NOTE(review): this file reached review with all newlines collapsed and with
// several spans apparently deleted by markup stripping (text that sat between
// a '<' and a later '>' is missing in places).  The code below is re-wrapped
// for readability with all tokens unchanged; every truncated spot is flagged
// inline.  TODO: restore the missing code from the upstream LAMMPS_AL
// lal_neighbor_gpu.cu before attempting to build this file.

#ifdef NV_KERNEL

#include "lal_preprocessor.h"

// tagint is the LAMMPS atom-tag integer type; its width follows the
// LAMMPS_SMALLBIG / LAMMPS_BIGBIG / LAMMPS_SMALLSMALL build flags.
#ifdef LAMMPS_SMALLBIG
#define tagint int
#endif
#ifdef LAMMPS_BIGBIG
#include "inttypes.h"
#define tagint int64_t
#endif
#ifdef LAMMPS_SMALLSMALL
#define tagint int
#endif

// NOTE(review): the texture template arguments (presumably something like
// texture<float4,1> / texture<int4,1> for the two precision paths) were lost
// to markup stripping -- "texture pos_tex;" is not valid CUDA as written.
// TODO confirm against upstream.
#ifndef _DOUBLE_DOUBLE
texture pos_tex;
#else
texture pos_tex;
#endif

// Compute a cell id for each atom for spatial binning: positions are shifted
// into box-local coordinates (minus boxlo0..2), scaled by the inverse cell
// size, and offset by cells_in_cutoff layers of boundary cells.
// One thread per atom; i runs over all nall atoms.
// NOTE(review): body truncated below -- see inline flag.
__kernel void calc_cell_id(const numtyp4 *restrict pos,
                           unsigned *restrict cell_id,
                           int *restrict particle_id,
                           numtyp boxlo0, numtyp boxlo1, numtyp boxlo2,
                           numtyp i_cell_size, int ncellx, int ncelly,
                           int ncellz, int inum, int nall,
                           int cells_in_cutoff) {
  int i = threadIdx.x + blockIdx.x*blockDim.x;

  if (i < nall) {
    numtyp4 p;
    fetch4(p,i,pos_tex); //pos[i];

    // Shift into box-local coordinates.
    p.x -= boxlo0;
    p.y -= boxlo1;
    p.z -= boxlo2;

    // Integer cell coordinates, offset by the cutoff boundary-cell layers.
    int ix = int(p.x*i_cell_size+cells_in_cutoff);
    int iy = int(p.y*i_cell_size+cells_in_cutoff);
    int iz = int(p.z*i_cell_size+cells_in_cutoff);

    int offset_lo, offset_hi;
    // NOTE(review): TRUNCATED -- the rest of calc_cell_id (clamping of
    // ix/iy/iz and the cell_id/particle_id stores) plus the signature of the
    // next kernel (presumably kernel_calc_cell_counts) are missing; the
    // fragment below appears to belong to that second kernel.  TODO confirm
    // against upstream.
    if (i 0 && idx < nall) {
      int id_l = cell_id[idx-1];
      // Where the sorted cell id changes between idx-1 and idx, record idx as
      // the starting offset for every cell id in the gap.
      if (id != id_l) {
        for (int i = id_l+1; i <= id; i++)
          cell_counts[i] = idx;
      }
    }
  }
}

#else

// OpenCL path: positions come in via the x_ argument rather than a texture.
#define pos_tex x_

#ifdef LAMMPS_SMALLBIG
#define tagint int
#endif
#ifdef LAMMPS_BIGBIG
#define tagint long long int
#endif
#ifdef LAMMPS_SMALLSMALL
#define tagint int
#endif

#endif

// Transpose a columns_in x rows_in matrix of tagints through a local-memory
// tile; the +1 on the inner dimension presumably pads the tile to avoid
// shared-memory bank conflicts -- TODO confirm.
// NOTE(review): the kernel body continues on the next source line (the file
// arrived with newlines collapsed).
__kernel void transpose(__global tagint *restrict out,
                        const __global tagint *restrict in,
                        int columns_in, int rows_in) {
  __local tagint block[BLOCK_CELL_2D][BLOCK_CELL_2D+1];

  unsigned ti=THREAD_ID_X;
  unsigned
// NOTE(review): continuation of __kernel transpose from the previous source
// line; newlines were collapsed in transit.
  tj=THREAD_ID_Y;
  unsigned bi=BLOCK_ID_X;
  unsigned bj=BLOCK_ID_Y;

  // Global matrix coordinates covered by this work-item's tile slot.
  unsigned i=bi*BLOCK_CELL_2D+ti;
  unsigned j=bj*BLOCK_CELL_2D+tj;

  // NOTE(review): TRUNCATED -- the guarded tile load/store of transpose, the
  // full signature and outer loops of the neighbor-list kernel (presumably
  // calc_neigh_list_cell), and the pair-distance computation are missing; the
  // fragment below is that kernel's inner neighbor-accept logic.  TODO
  // restore from upstream.
  if ((i 1e-5 cnt++;
  if (cnt <= neigh_bin_size) {
    // Append pid_j to this atom's neighbor list; t_per_atom threads share one
    // atom, so after every t_per_atom-th accepted entry the write pointer
    // jumps ahead by stride to the team's next slot row.
    *neigh_list = pid_j;
    neigh_list++;
    if ((cnt & (t_per_atom-1))==0)
      neigh_list=neigh_list+stride;
  }
  } } }
  __syncthreads();
  } // for (k)
  } } }
  // Store the final neighbor count for atoms with pid_i < nt.
  if (pid_i < nt)
    *neigh_counts = cnt;
  } // for (i)
}

// Mark neighbor-list entries with special-bond information.  Judging from the
// parameters (tag / nspecial / special) this walks each atom's stored
// neighbors and, for neighbors found in the atom's special lists, packs a
// category flag ("which") into the high bits of the neighbor index via
// SBBITS -- the matching logic itself sits in the missing span, so treat this
// description as unconfirmed.  t_per_atom threads cooperate per atom; offset
// is the lane index within that team.
// NOTE(review): TRUNCATED -- the "if (ii < ..." guard, the neighbor-pointer
// setup, and the special-tag search loop are missing; only the tail of the
// loop body survives.  TODO restore from upstream.
__kernel void kernel_special(__global int *dev_nbor,
                             __global int *host_nbor_list,
                             const __global int *host_numj,
                             const __global tagint *restrict tag,
                             const __global int *restrict nspecial,
                             const __global tagint *restrict special,
                             int inum, int nt, int max_nbors, int t_per_atom) {
  int tid=THREAD_ID_X;
  // Atom index handled by this team of t_per_atom threads.
  int ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
  ii+=tid/t_per_atom;
  // Lane within the atom team.
  int offset=tid & (t_per_atom-1);

  if (ii=n1) which++;
  if (i>=n2) which++;
  // Fold the special-bond category (presumably 1-2/1-3/1-4 -- TODO confirm)
  // into the neighbor entry's high bits.
  nbor=nbor ^ (which << SBBITS);
  *list=nbor;
  }
  offset+=nt;
  } } } // if ii
}