/***************************************************************************
                                   atom.h
                             -------------------
                            W. Michael Brown (ORNL)

  Class for particle data management

 __________________________________________________________________________
    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
 __________________________________________________________________________

    begin                :
    email                : brownw@ornl.gov
 ***************************************************************************/

#ifndef PAIR_GPU_ATOM_H
#define PAIR_GPU_ATOM_H

#include <math.h>
#include "mpi.h"

#if defined(USE_OPENCL)
#include "geryon/ocl_timer.h"
#include "geryon/ocl_mat.h"
#include "geryon/ocl_kernel.h"
using namespace ucl_opencl;
#elif defined(USE_CUDART)
#include "geryon/nvc_timer.h"
#include "geryon/nvc_mat.h"
#include "geryon/nvc_kernel.h"
using namespace ucl_cudart;
#else
#include "geryon/nvd_timer.h"
#include "geryon/nvd_mat.h"
#include "geryon/nvd_kernel.h"
using namespace ucl_cudadr;
#endif

#ifdef USE_CUDPP
#include "cudpp.h"
#endif

#include "lal_precision.h"

namespace LAMMPS_AL {

template <class numtyp, class acctyp>
class Atom {
 public:
  Atom();
  ~Atom() { clear(); }

  /// Maximum number of atoms that can be stored with current allocation
  inline int max_atoms() const { return _max_atoms; }
  /// Current number of local+ghost atoms stored
  inline int nall() const { return _nall; }
  /// Set number of local+ghost atoms for future copy operations
  inline void nall(const int n) { _nall=n; }

  /// Memory usage per atom in this class
  int bytes_per_atom() const;

  /// Clear any previous data and set up for a new LAMMPS run
  /** \param rot True if atom storage needs quaternions
    * \param gpu_nbor 0 if neighboring will be performed on host
    *        gpu_nbor 1 if neighboring will be performed on device
    *        gpu_nbor 2 if binning on host and neighboring on device **/
  bool init(const int nall, const bool charge, const bool rot,
            UCL_Device &dev, const int gpu_nbor=0, const bool bonds=false,
            const bool vel=false);

  /// Check if we have enough device storage and realloc if not
  /** Returns true if resized with any call during this timestep **/
  inline bool resize(const int nall, bool &success) {
    _nall=nall;
    if (nall>_max_atoms) {
      clear_resize();
      success = success && alloc(nall);
      _resized=true;
    }
    return _resized;
  }

  /// If already initialized by another LAMMPS style, add fields as necessary
  /** \param rot True if atom storage needs quaternions
    * \param gpu_nbor 0 if neighboring will be performed on host
    *        gpu_nbor 1 if neighboring will be performed on device
    *        gpu_nbor 2 if binning on host and neighboring on device **/
  bool add_fields(const bool charge, const bool rot, const int gpu_nbor,
                  const bool bonds, const bool vel=false);

  /// Returns true if GPU is using charges
  bool charge() { return _charge; }

  /// Returns true if GPU is using quaternions
  bool quaternion() { return _rot; }

  /// Returns true if GPU is using velocities
  bool velocity() { return _vel; }

  /// Only free matrices of length inum or nall for resizing
  void clear_resize();

  /// Free all memory on host and device
  void clear();

  /// Return the total amount of host memory used by class in bytes
  double host_memory_usage() const;

  /// Sort arrays for neighbor list calculation on device
  void sort_neighbor(const int num_atoms);

  /// Add copy times to timers
  inline void acc_timers() {
    time_pos.add_to_total();
    if (_charge) time_q.add_to_total();
    if (_rot) time_quat.add_to_total();
    if (_vel) time_vel.add_to_total();
  }

  /// Zero the copy timers
  inline void zero_timers() {
    time_pos.zero();
    if (_charge) time_q.zero();
    if (_rot) time_quat.zero();
    if (_vel) time_vel.zero();
  }

  /// Return the total time for host/device data transfer
  /** Zeros the total so that the atom times are only included once **/
  inline double transfer_time() {
    double total=time_pos.total_seconds();
    time_pos.zero_total();
    if (_charge) {
      total+=time_q.total_seconds();
      time_q.zero_total();
    }
    if (_rot) {
      total+=time_quat.total_seconds();
      time_quat.zero_total();
    }
    if (_vel) {
      total+=time_vel.total_seconds();
      time_vel.zero_total();
    }
    return total+_time_transfer/1000.0;
  }

  /// Return the total time for data cast/pack
  /** Zeros the time so that atom times are only included once **/
  inline double cast_time()
    { double t=_time_cast; _time_cast=0.0; return t; }
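
  // ------------------------------------------------------------------------
  // Illustrative sketch only (not part of the class interface): one way a
  // host-side style might harvest these timers for profiling output.  The
  // instance name "atom" is hypothetical.  acc_timers() accumulates the
  // per-timestep copy times; transfer_time() and cast_time() zero their
  // accumulators, so each contribution is reported exactly once.
  //
  //   atom.acc_timers();                     // once per timestep, after copies
  //   ...
  //   double xfer_s = atom.transfer_time();  // host<->device transfers (s)
  //   double cast_s = atom.cast_time();      // host-side pack/cast time (s)
  // ------------------------------------------------------------------------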

  /// Pack LAMMPS atom type constants into matrix and copy to device
  template <class dev_typ, class t1>
  inline void type_pack1(const int n, const int m_size,
                         UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
                         t1 **one) {
    int ii=0;
    for (int i=0; i<n; i++) {
      for (int j=0; j<n; j++) {
        buffer[ii]=static_cast<numtyp>(one[i][j]);
        ii++;
      }
      ii+=m_size-n;
    }
    UCL_H_Vec<dev_typ> view;
    view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
    ucl_copy(dev_v,view,false);
  }

  /// Pack LAMMPS atom type constants into 2 vectors and copy to device
  template <class dev_typ, class t1, class t2>
  inline void type_pack2(const int n, const int m_size,
                         UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
                         t1 **one, t2 **two) {
    int ii=0;
    for (int i=0; i<n; i++) {
      for (int j=0; j<n; j++) {
        buffer[ii*2]=static_cast<numtyp>(one[i][j]);
        buffer[ii*2+1]=static_cast<numtyp>(two[i][j]);
        ii++;
      }
      ii+=m_size-n;
    }
    UCL_H_Vec<dev_typ> view;
    view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
    ucl_copy(dev_v,view,false);
  }

  /// Pack LAMMPS atom type constants (3) into 4 vectors and copy to device
  template <class dev_typ, class t1, class t2, class t3>
  inline void type_pack4(const int n, const int m_size,
                         UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
                         t1 **one, t2 **two, t3 **three) {
    int ii=0;
    for (int i=0; i<n; i++) {
      for (int j=0; j<n; j++) {
        buffer[ii*4]=static_cast<numtyp>(one[i][j]);
        buffer[ii*4+1]=static_cast<numtyp>(two[i][j]);
        buffer[ii*4+2]=static_cast<numtyp>(three[i][j]);
        ii++;
      }
      ii+=m_size-n;
    }
    UCL_H_Vec<dev_typ> view;
    view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
    ucl_copy(dev_v,view,false);
  }

  /// Pack LAMMPS atom type constants (4) into 4 vectors and copy to device
  template <class dev_typ, class t1, class t2, class t3, class t4>
  inline void type_pack4(const int n, const int m_size,
                         UCL_D_Vec<dev_typ> &dev_v, UCL_H_Vec<numtyp> &buffer,
                         t1 **one, t2 **two, t3 **three, t4 **four) {
    int ii=0;
    for (int i=0; i<n; i++) {
      for (int j=0; j<n; j++) {
        buffer[ii*4]=static_cast<numtyp>(one[i][j]);
        buffer[ii*4+1]=static_cast<numtyp>(two[i][j]);
        buffer[ii*4+2]=static_cast<numtyp>(three[i][j]);
        buffer[ii*4+3]=static_cast<numtyp>(four[i][j]);
        ii++;
      }
      ii+=m_size-n;
    }
    UCL_H_Vec<dev_typ> view;
    view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
    ucl_copy(dev_v,view,false);
  }

  /// Pack LAMMPS atom "self" type constants into 2 vectors and copy to device
  template <class dev_typ, class t1, class t2>
  inline void self_pack2(const int n, UCL_D_Vec<dev_typ> &dev_v,
                         UCL_H_Vec<numtyp> &buffer, t1 **one, t2 **two) {
    for (int i=0; i<n; i++) {
      buffer[i*2]=static_cast<numtyp>(one[i][i]);
      buffer[i*2+1]=static_cast<numtyp>(two[i][i]);
    }
    UCL_H_Vec<dev_typ> view;
    view.view((dev_typ*)buffer.begin(),n,*dev);
    ucl_copy(dev_v,view,false);
  }
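
  // ------------------------------------------------------------------------
  // Illustrative sketch only: how a pair style might use the packing helpers
  // above to stage per-type coefficients on the device.  The names ntypes,
  // lj_types, host_lj1, host_lj2, host_cutsq, lj1_dev and cutsq_dev are
  // hypothetical; buffer is any UCL_H_Vec<numtyp> write buffer sized to hold
  // m_size*m_size packed entries (times 2 or 4 for the pack2/pack4 variants).
  //
  //   // interleave two [ntypes][ntypes] host tables into one device matrix
  //   atom.type_pack2(ntypes,lj_types,lj1_dev,buffer,host_lj1,host_lj2);
  //   // single table, e.g. cutoffs squared
  //   atom.type_pack1(ntypes,lj_types,cutsq_dev,buffer,host_cutsq);
  // ------------------------------------------------------------------------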

  // -------------------------COPY TO GPU ----------------------------------

  /// Signal that we need to transfer atom data for next timestep
  inline void data_unavail() {
    _x_avail=false; _q_avail=false; _quat_avail=false; _v_avail=false;
    _resized=false;
  }

  /// Cast positions and types to write buffer
  inline void cast_x_data(double **host_ptr, const int *host_type) {
    if (_x_avail==false) {
      double t=MPI_Wtime();
      #ifdef GPU_CAST
      memcpy(x_cast.host.begin(),host_ptr[0],_nall*3*sizeof(double));
      memcpy(type_cast.host.begin(),host_type,_nall*sizeof(int));
      #else
      int wl=0;
      for (int i=0; i<_nall; i++) {
        x[wl]=host_ptr[i][0];
        x[wl+1]=host_ptr[i][1];
        x[wl+2]=host_ptr[i][2];
        x[wl+3]=host_type[i];
        wl+=4;
      }
      #endif
      _time_cast+=MPI_Wtime()-t;
    }
  }

  /// Copy positions and types to device asynchronously
  /** Copies nall() elements **/
  inline void add_x_data(double **host_ptr, int *host_type) {
    time_pos.start();
    if (_x_avail==false) {
      #ifdef GPU_CAST
      x_cast.update_device(_nall*3,true);
      type_cast.update_device(_nall,true);
      int block_size=64;
      int GX=static_cast<int>(ceil(static_cast<double>(_nall)/block_size));
      k_cast_x.set_size(GX,block_size);
      k_cast_x.run(&x, &x_cast, &type_cast, &_nall);
      #else
      x.update_device(_nall*4,true);
      #endif
      _x_avail=true;
    }
    time_pos.stop();
  }

  /// Calls cast_x_data and add_x_data and times the routines
  inline void cast_copy_x(double **host_ptr, int *host_type) {
    cast_x_data(host_ptr,host_type);
    add_x_data(host_ptr,host_type);
  }

  /// Cast charges to write buffer
  template <class cpytyp>
  inline void cast_q_data(cpytyp *host_ptr) {
    if (_q_avail==false) {
      double t=MPI_Wtime();
      // If double precision, still memcpy for async transfers
      if (_host_view) {
        q.host.view((numtyp*)host_ptr,_nall,*dev);
        q.device.view(q.host);
      } else if (sizeof(numtyp)==sizeof(double))
        memcpy(q.host.begin(),host_ptr,_nall*sizeof(numtyp));
      else
        for (int i=0; i<_nall; i++) q[i]=host_ptr[i];
      _time_cast+=MPI_Wtime()-t;
    }
  }

  /// Copy charges to device asynchronously
  inline void add_q_data() {
    time_q.start();
    if (_q_avail==false) {
      q.update_device(_nall,true);
      _q_avail=true;
    }
    time_q.stop();
  }

  /// Cast quaternions to write buffer
  template <class cpytyp>
  inline void cast_quat_data(cpytyp *host_ptr) {
    if (_quat_avail==false) {
      double t=MPI_Wtime();
      if (_host_view) {
        quat.host.view((numtyp*)host_ptr,_nall*4,*dev);
        quat.device.view(quat.host);
      } else if (sizeof(numtyp)==sizeof(double))
        memcpy(quat.host.begin(),host_ptr,_nall*4*sizeof(numtyp));
      else
        for (int i=0; i<_nall*4; i++) quat[i]=host_ptr[i];
      _time_cast+=MPI_Wtime()-t;
    }
  }

  /// Copy quaternions to device asynchronously
  /** Copies nall()*4 elements **/
  inline void add_quat_data() {
    time_quat.start();
    if (_quat_avail==false) {
      quat.update_device(_nall*4,true);
      _quat_avail=true;
    }
    time_quat.stop();
  }

  /// Cast velocities and tags to write buffer
  inline void cast_v_data(double **host_ptr, const tagint *host_tag) {
    if (_v_avail==false) {
      double t=MPI_Wtime();
      #ifdef GPU_CAST
      memcpy(v_cast.host.begin(),host_ptr[0],_nall*3*sizeof(double));
      memcpy(tag_cast.host.begin(),host_tag,_nall*sizeof(int));
      #else
      int wl=0;
      for (int i=0; i<_nall; i++) {
        v[wl]=host_ptr[i][0];
        v[wl+1]=host_ptr[i][1];
        v[wl+2]=host_ptr[i][2];
        v[wl+3]=host_tag[i];
        wl+=4;
      }
      #endif
      _time_cast+=MPI_Wtime()-t;
    }
  }

  /// Copy velocities and tags to device asynchronously
  /** Copies nall() elements **/
  inline void add_v_data(double **host_ptr, tagint *host_tag) {
    time_vel.start();
    if (_v_avail==false) {
      #ifdef GPU_CAST
      v_cast.update_device(_nall*3,true);
      tag_cast.update_device(_nall,true);
      int block_size=64;
      int GX=static_cast<int>(ceil(static_cast<double>(_nall)/block_size));
      k_cast_x.set_size(GX,block_size);
      k_cast_x.run(&v, &v_cast, &tag_cast, &_nall);
      #else
      v.update_device(_nall*4,true);
      #endif
      _v_avail=true;
    }
    time_vel.stop();
  }

  /// Calls cast_v_data and add_v_data and times the routines
  inline void cast_copy_v(double **host_ptr, tagint *host_tag) {
    cast_v_data(host_ptr,host_tag);
    add_v_data(host_ptr,host_tag);
  }

  /// Add in casting time from additional data (seconds)
  inline void add_cast_time(double t) { _time_cast+=t; }

  /// Add in transfer time from additional data (ms)
  inline void add_transfer_time(double t) { _time_transfer+=t; }

  /// Return the maximum number of bytes used on the device (zeros the max)
  inline double max_gpu_bytes()
    { double m=_max_gpu_bytes; _max_gpu_bytes=0.0; return m; }

  /// Returns true if the device is addressing memory on the host
  inline bool host_view() { return _host_view; }
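
  // ------------------------------------------------------------------------
  // Illustrative sketch only: a typical per-timestep sequence on the host,
  // assuming an instance named "atom" and LAMMPS-style arrays x, type and q.
  // Exact call sites vary between styles; this is not prescriptive.
  //
  //   atom.data_unavail();                // host data changed this step
  //   bool success=true;
  //   atom.resize(nlocal+nghost,success); // grow device storage if needed
  //   atom.cast_copy_x(x,type);           // pack + async copy positions/types
  //   if (atom.charge()) {
  //     atom.cast_q_data(q);
  //     atom.add_q_data();                // async copy charges
  //   }
  //   // ... launch kernels, copy results back ...
  //   atom.acc_timers();
  // ------------------------------------------------------------------------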

  // ------------------------------ DATA ------------------------------------

  /// Atom coordinates and types ([0] is x, [1] is y, [2] is z, [3] is type)
  UCL_Vector<numtyp,numtyp> x;
  /// Charges
  UCL_Vector<numtyp,numtyp> q;
  /// Quaternions
  UCL_Vector<numtyp,numtyp> quat;
  /// Velocities ([3] is the atom tag)
  UCL_Vector<numtyp,numtyp> v;

  #ifdef GPU_CAST
  UCL_Vector<double,double> x_cast;
  UCL_Vector<int,int> type_cast;
  #endif

  /// Cell list identifiers for device nbor builds
  UCL_D_Vec<unsigned> dev_cell_id;
  /// Particle identifiers for device nbor builds
  UCL_D_Vec<int> dev_particle_id;
  /// Atom tag information for device nbor builds
  UCL_D_Vec<tagint> dev_tag;
  /// Cell list identifiers for hybrid nbor builds
  UCL_H_Vec<int> host_cell_id;
  /// Particle identifiers for hybrid nbor builds
  UCL_H_Vec<int> host_particle_id;

  /// Device timers
  UCL_Timer time_pos, time_q, time_quat, time_vel;

  /// Geryon device
  UCL_Device *dev;

 private:
  #ifdef GPU_CAST
  UCL_Program *atom_program;
  UCL_Kernel k_cast_x;
  void compile_kernels(UCL_Device &dev);
  #endif

  bool _compiled;

  // True if data has been copied to device already
  bool _x_avail, _q_avail, _quat_avail, _v_avail, _resized;

  bool alloc(const int nall);

  bool _allocated, _rot, _charge, _bonds, _vel, _other;
  int _max_atoms, _nall, _gpu_nbor;
  bool _host_view;
  double _time_cast, _time_transfer;

  double _max_gpu_bytes;

  #ifdef USE_CUDPP
  CUDPPConfiguration sort_config;
  CUDPPHandle sort_plan;
  #endif
};

}

#endif
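
/* ---------------------------------------------------------------------------
   Illustrative sketch only (not part of this header): minimal host-side setup,
   assuming a UCL_Device named "gpu" that has already been configured and an
   initial local+ghost atom count "nall".  PRECISION and ACC_PRECISION are
   assumed to be the precision macros provided by lal_precision.h.

     LAMMPS_AL::Atom<PRECISION,ACC_PRECISION> atom;
     bool ok=atom.init(nall,true,false,gpu,1);  // charges, no quats, gpu nbor
     ...
     atom.clear();                              // release host+device memory
--------------------------------------------------------------------------- */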