// ************************************************************************** // preprocessor.cu // ------------------- // W. Michael Brown (ORNL) // // Device code for CUDA-specific preprocessor definitions // // __________________________________________________________________________ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // // begin : // email : brownw@ornl.gov // ***************************************************************************/ //************************************************************************* // Preprocessor Definitions // // Note: It is assumed that constants with the same names are defined with // the same values in all files. // // ARCH // Definition: Architecture number for accelerator // MEM_THREADS // Definition: Number of threads with sequential ids accessing memory // simultaneously on multiprocessor // WARP_SIZE: // Definition: Number of threads guaranteed to be on the same instruction // THREADS_PER_ATOM // Definition: Default number of threads assigned per atom for pair styles // Restructions: Must be power of 2; THREADS_PER_ATOM<=WARP_SIZE // THREADS_PER_CHARGE // Definition: Default number of threads assigned per atom for pair styles // with charge // Restructions: Must be power of 2; THREADS_PER_ATOM<=WARP_SIZE // PPPM_MAX_SPLINE // Definition: Maximum order for splines in PPPM // PPPM_BLOCK_1D // Definition: Thread block size for PPPM kernels // Restrictions: PPPM_BLOCK_1D>=PPPM_MAX_SPLINE*PPPM_MAX_SPLINE // PPPM_BLOCK_1D%32==0 // BLOCK_PAIR // Definition: Default thread block size for pair styles // Restrictions: // MAX_SHARED_TYPES 8 // Definition: Max # of atom type params can be stored in shared memory // Restrictions: MAX_SHARED_TYPES*MAX_SHARED_TYPES<=BLOCK_PAIR // BLOCK_CELL_2D // Definition: Default block size in each dimension for cell list builds // and matrix transpose // BLOCK_CELL_ID // Definition: Default block size for binning atoms in cell list builds // BLOCK_NBOR_BUILD // Definition: Default block size for neighbor list builds // BLOCK_BIO_PAIR // Definition: Default thread block size for "bio" pair styles // MAX_BIO_SHARED_TYPES // Definition: Max # of atom type params can be stored in shared memory // Restrictions: MAX_BIO_SHARED_TYPES<=BLOCK_BIO_PAIR*2 // //*************************************************************************/ // ------------------------------------------------------------------------- // CUDA DEFINITIONS // ------------------------------------------------------------------------- #ifdef NV_KERNEL #define GLOBAL_ID_X threadIdx.x+mul24(blockIdx.x,blockDim.x) #define GLOBAL_ID_Y threadIdx.y+mul24(blockIdx.y,blockDim.y) #define GLOBAL_SIZE_X mul24(gridDim.x,blockDim.x); #define GLOBAL_SIZE_Y mul24(gridDim.y,blockDim.y); #define THREAD_ID_X threadIdx.x #define THREAD_ID_Y threadIdx.y #define BLOCK_ID_X blockIdx.x #define BLOCK_ID_Y blockIdx.y #define BLOCK_SIZE_X blockDim.x #define BLOCK_SIZE_Y blockDim.y #define __kernel extern "C" __global__ #define __local __shared__ #define __global #define restrict __restrict__ #define atom_add atomicAdd #define ucl_inline static __inline__ __device__ #ifdef __CUDA_ARCH__ #define ARCH __CUDA_ARCH__ #else #define ARCH 100 #endif #if (ARCH < 200) #define THREADS_PER_ATOM 1 #define THREADS_PER_CHARGE 16 #define BLOCK_NBOR_BUILD 64 #define BLOCK_PAIR 64 #define BLOCK_BIO_PAIR 64 #define MAX_SHARED_TYPES 8 #else #if (ARCH < 300) #define THREADS_PER_ATOM 4 #define THREADS_PER_CHARGE 8 #define BLOCK_NBOR_BUILD 128 #define BLOCK_PAIR 128 #define BLOCK_BIO_PAIR 128 #define MAX_SHARED_TYPES 8 #else #define THREADS_PER_ATOM 4 #define THREADS_PER_CHARGE 8 #define BLOCK_NBOR_BUILD 128 #define BLOCK_PAIR 256 #define BLOCK_BIO_PAIR 256 #define BLOCK_ELLIPSE 128 #define MAX_SHARED_TYPES 11 #if (__CUDACC_VER_MAJOR__ < 9) #ifdef _SINGLE_SINGLE #define shfl_xor __shfl_xor #else ucl_inline double shfl_xor(double var, int laneMask, int width) { int2 tmp; tmp.x = __double2hiint(var); tmp.y = __double2loint(var); tmp.x = __shfl_xor(tmp.x,laneMask,width); tmp.y = __shfl_xor(tmp.y,laneMask,width); return __hiloint2double(tmp.x,tmp.y); } #endif #else #ifdef _SINGLE_SINGLE ucl_inline double shfl_xor(double var, int laneMask, int width) { return __shfl_xor_sync(0xffffffff, var, laneMask, width); } #else ucl_inline double shfl_xor(double var, int laneMask, int width) { int2 tmp; tmp.x = __double2hiint(var); tmp.y = __double2loint(var); tmp.x = __shfl_xor_sync(0xffffffff,tmp.x,laneMask,width); tmp.y = __shfl_xor_sync(0xffffffff,tmp.y,laneMask,width); return __hiloint2double(tmp.x,tmp.y); } #endif #endif #endif #endif #define WARP_SIZE 32 #define PPPM_BLOCK_1D 64 #define BLOCK_CELL_2D 8 #define BLOCK_CELL_ID 128 #define MAX_BIO_SHARED_TYPES 128 #ifdef _DOUBLE_DOUBLE #define fetch4(ans,i,pos_tex) { \ int4 xy = tex1Dfetch(pos_tex,i*2); \ int4 zt = tex1Dfetch(pos_tex,i*2+1); \ ans.x=__hiloint2double(xy.y, xy.x); \ ans.y=__hiloint2double(xy.w, xy.z); \ ans.z=__hiloint2double(zt.y, zt.x); \ ans.w=__hiloint2double(zt.w, zt.z); \ } #define fetch(ans,i,q_tex) { \ int2 qt = tex1Dfetch(q_tex,i); \ ans=__hiloint2double(qt.y, qt.x); \ } #else #define fetch4(ans,i,pos_tex) ans=tex1Dfetch(pos_tex, i); #define fetch(ans,i,q_tex) ans=tex1Dfetch(q_tex,i); #endif #if (__CUDA_ARCH__ < 200) #define fast_mul __mul24 #define MEM_THREADS 16 #else #define fast_mul(X,Y) (X)*(Y) #define MEM_THREADS 32 #endif #ifdef CUDA_PRE_THREE struct __builtin_align__(16) _double4 { double x, y, z, w; }; typedef struct _double4 double4; #endif #ifdef _DOUBLE_DOUBLE #define ucl_exp exp #define ucl_powr pow #define ucl_atan atan #define ucl_cbrt cbrt #define ucl_ceil ceil #define ucl_abs fabs #define ucl_rsqrt rsqrt #define ucl_sqrt sqrt #define ucl_recip(x) ((numtyp)1.0/(x)) #else #define ucl_atan atanf #define ucl_cbrt cbrtf #define ucl_ceil ceilf #define ucl_abs fabsf #define ucl_recip(x) ((numtyp)1.0/(x)) #define ucl_rsqrt rsqrtf #define ucl_sqrt sqrtf #ifdef NO_HARDWARE_TRANSCENDENTALS #define ucl_exp expf #define ucl_powr powf #else #define ucl_exp __expf #define ucl_powr __powf #endif #endif #endif // ------------------------------------------------------------------------- // NVIDIA GENERIC OPENCL DEFINITIONS // ------------------------------------------------------------------------- #ifdef NV_GENERIC_OCL #define USE_OPENCL #define fast_mul mul24 #define MEM_THREADS 16 #define THREADS_PER_ATOM 1 #define THREADS_PER_CHARGE 1 #define BLOCK_PAIR 64 #define MAX_SHARED_TYPES 8 #define BLOCK_NBOR_BUILD 64 #define BLOCK_BIO_PAIR 64 #define WARP_SIZE 32 #define PPPM_BLOCK_1D 64 #define BLOCK_CELL_2D 8 #define BLOCK_CELL_ID 128 #define MAX_BIO_SHARED_TYPES 128 #endif // ------------------------------------------------------------------------- // NVIDIA FERMI OPENCL DEFINITIONS // ------------------------------------------------------------------------- #ifdef FERMI_OCL #define USE_OPENCL #define MEM_THREADS 32 #define THREADS_PER_ATOM 4 #define THREADS_PER_CHARGE 8 #define BLOCK_PAIR 128 #define MAX_SHARED_TYPES 11 #define BLOCK_NBOR_BUILD 128 #define BLOCK_BIO_PAIR 128 #define WARP_SIZE 32 #define PPPM_BLOCK_1D 64 #define BLOCK_CELL_2D 8 #define BLOCK_CELL_ID 128 #define MAX_BIO_SHARED_TYPES 128 #endif // ------------------------------------------------------------------------- // NVIDIA KEPLER OPENCL DEFINITIONS // ------------------------------------------------------------------------- #ifdef KEPLER_OCL #define USE_OPENCL #define MEM_THREADS 32 #define THREADS_PER_ATOM 4 #define THREADS_PER_CHARGE 8 #define BLOCK_PAIR 256 #define MAX_SHARED_TYPES 11 #define BLOCK_NBOR_BUILD 128 #define BLOCK_BIO_PAIR 256 #define BLOCK_ELLIPSE 128 #define WARP_SIZE 32 #define PPPM_BLOCK_1D 64 #define BLOCK_CELL_2D 8 #define BLOCK_CELL_ID 128 #define MAX_BIO_SHARED_TYPES 128 #ifndef NO_OCL_PTX #define ARCH 300 #ifdef _SINGLE_SINGLE inline float shfl_xor(float var, int laneMask, int width) { float ret; int c; c = ((WARP_SIZE-width) << 8) | 0x1f; asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=f"(ret) : "f"(var), "r"(laneMask), "r"(c)); return ret; } #else #pragma OPENCL EXTENSION cl_khr_fp64 : enable inline double shfl_xor(double var, int laneMask, int width) { int c = ((WARP_SIZE-width) << 8) | 0x1f; int x,y,x2,y2; double ans; asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(y), "=r"(x) : "d"(var)); asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=r"(x2) : "r"(x), "r"(laneMask), "r"(c)); asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=r"(y2) : "r"(y), "r"(laneMask), "r"(c)); asm volatile ("mov.b64 %0, {%1, %2};" : "=d"(ans) : "r"(y2), "r"(x2)); return ans; } #endif #endif #endif // ------------------------------------------------------------------------- // AMD CYPRESS OPENCL DEFINITIONS // ------------------------------------------------------------------------- #ifdef CYPRESS_OCL #define USE_OPENCL #define MEM_THREADS 32 #define THREADS_PER_ATOM 4 #define THREADS_PER_CHARGE 8 #define BLOCK_PAIR 128 #define MAX_SHARED_TYPES 8 #define BLOCK_NBOR_BUILD 64 #define BLOCK_BIO_PAIR 64 #define WARP_SIZE 64 #define PPPM_BLOCK_1D 64 #define BLOCK_CELL_2D 8 #define BLOCK_CELL_ID 128 #define MAX_BIO_SHARED_TYPES 128 #endif // ------------------------------------------------------------------------- // INTEL CPU OPENCL DEFINITIONS // ------------------------------------------------------------------------- #ifdef INTEL_OCL #define USE_OPENCL #define MEM_THREADS 16 #define THREADS_PER_ATOM 1 #define THREADS_PER_CHARGE 1 #define BLOCK_PAIR 1 #define MAX_SHARED_TYPES 0 #define BLOCK_NBOR_BUILD 4 #define BLOCK_BIO_PAIR 2 #define BLOCK_ELLIPSE 2 #define WARP_SIZE 1 #define PPPM_BLOCK_1D 32 #define BLOCK_CELL_2D 1 #define BLOCK_CELL_ID 2 #define MAX_BIO_SHARED_TYPES 0 #endif // ------------------------------------------------------------------------- // INTEL PHI OPENCL DEFINITIONS // ------------------------------------------------------------------------- #ifdef PHI_OCL #define USE_OPENCL #define MEM_THREADS 16 #define THREADS_PER_ATOM 1 #define THREADS_PER_CHARGE 1 #define BLOCK_PAIR 16 #define MAX_SHARED_TYPES 0 #define BLOCK_NBOR_BUILD 16 #define BLOCK_BIO_PAIR 16 #define BLOCK_ELLIPSE 16 #define WARP_SIZE 1 #define PPPM_BLOCK_1D 32 #define BLOCK_CELL_2D 4 #define BLOCK_CELL_ID 16 #define MAX_BIO_SHARED_TYPES 0 #endif // ------------------------------------------------------------------------- // GENERIC OPENCL DEFINITIONS // ------------------------------------------------------------------------- #ifdef GENERIC_OCL #define USE_OPENCL #define MEM_THREADS 16 #define THREADS_PER_ATOM 1 #define THREADS_PER_CHARGE 1 #define BLOCK_PAIR 64 #define MAX_SHARED_TYPES 8 #define BLOCK_NBOR_BUILD 64 #define BLOCK_BIO_PAIR 64 #define WARP_SIZE 1 #define PPPM_BLOCK_1D 64 #define BLOCK_CELL_2D 8 #define BLOCK_CELL_ID 128 #define MAX_BIO_SHARED_TYPES 128 #endif // ------------------------------------------------------------------------- // OPENCL Stuff for All Hardware // ------------------------------------------------------------------------- #ifdef USE_OPENCL #ifndef _SINGLE_SINGLE #ifndef cl_khr_fp64 #ifndef cl_amd_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable #endif #endif #if defined(cl_khr_fp64) #pragma OPENCL EXTENSION cl_khr_fp64 : enable #elif defined(cl_amd_fp64) #pragma OPENCL EXTENSION cl_amd_fp64 : enable #endif #endif #ifndef fast_mul #define fast_mul(X,Y) (X)*(Y) #endif #ifndef ARCH #define ARCH 0 #endif #ifndef DRIVER #define DRIVER 0 #endif #define GLOBAL_ID_X get_global_id(0) #define THREAD_ID_X get_local_id(0) #define BLOCK_ID_X get_group_id(0) #define BLOCK_SIZE_X get_local_size(0) #define GLOBAL_SIZE_X get_global_size(0) #define THREAD_ID_Y get_local_id(1) #define BLOCK_ID_Y get_group_id(1) #define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE) #define ucl_inline inline #define fetch4(ans,i,x) ans=x[i] #define fetch(ans,i,q) ans=q[i] #define ucl_atan atan #define ucl_cbrt cbrt #define ucl_ceil ceil #define ucl_abs fabs #ifdef _DOUBLE_DOUBLE #define NO_HARDWARE_TRANSCENDENTALS #endif #ifdef NO_HARDWARE_TRANSCENDENTALS #define ucl_exp exp #define ucl_powr powr #define ucl_rsqrt rsqrt #define ucl_sqrt sqrt #define ucl_recip(x) ((numtyp)1.0/(x)) #else #define ucl_exp native_exp #define ucl_powr native_powr #define ucl_rsqrt native_rsqrt #define ucl_sqrt native_sqrt #define ucl_recip native_recip #endif #endif // ------------------------------------------------------------------------- // ARCHITECTURE INDEPENDENT DEFINITIONS // ------------------------------------------------------------------------- #ifndef PPPM_MAX_SPLINE #define PPPM_MAX_SPLINE 8 #endif #ifdef _DOUBLE_DOUBLE #define numtyp double #define numtyp2 double2 #define numtyp4 double4 #define acctyp double #define acctyp4 double4 #endif #ifdef _SINGLE_DOUBLE #define numtyp float #define numtyp2 float2 #define numtyp4 float4 #define acctyp double #define acctyp4 double4 #endif #ifndef numtyp #define numtyp float #define numtyp2 float2 #define numtyp4 float4 #define acctyp float #define acctyp4 float4 #endif #define EWALD_F (numtyp)1.12837917 #define EWALD_P (numtyp)0.3275911 #define A1 (numtyp)0.254829592 #define A2 (numtyp)-0.284496736 #define A3 (numtyp)1.421413741 #define A4 (numtyp)-1.453152027 #define A5 (numtyp)1.061405429 #define SBBITS 30 #define NEIGHMASK 0x3FFFFFFF ucl_inline int sbmask(int j) { return j >> SBBITS & 3; }; #ifndef BLOCK_ELLIPSE #define BLOCK_ELLIPSE BLOCK_PAIR #endif // default to 32-bit smallint and other ints, 64-bit bigint: same as defined in src/lmptype.h #if !defined(LAMMPS_SMALLSMALL) && !defined(LAMMPS_BIGBIG) && !defined(LAMMPS_SMALLBIG) #define LAMMPS_SMALLBIG #endif