#ifndef __UTILITIES_H__ #define __UTILITIES_H__ #include #include #include #include #include #include #include #include "cublas_v2.h" #include "cusparse_v2.h" #include #define PI 3.141592653589793 #define RADIAN2ARCSEC (180*3600/PI) #define MIN(a, b) (((a) < (b)) ? (a) : (b)) #define MAX(a, b) (((a) > (b)) ? (a) : (b)) #define CUBLAS_VERBOSE 0 #define RAND_SEED 2013 #define N_THREAD 16 #define N_THREAD2 256 #define SQRT_DBL_MAX 1.3407807929942596e+154 //#define M_PI 3.14159265358979323846264338328 /* pi */ #define DBL_EPSILON 2.2204460492503131e-16 static void Error( const char *errMsg, const char *file, int line ) { fprintf(stderr,"\n\x1B[31m@(CEO)>ERROR: %s in %s at line %d\x1B[0m\n", errMsg, file, line ); exit( EXIT_FAILURE ); } #define ERROR( errMsg ) (Error( errMsg, __FILE__, __LINE__ )) #ifdef SILENT #define INFO( infoMsg, args... ); #else #define INFO( infoMsg , args...) fprintf(stdout,infoMsg, ##args); #endif static void HandleError( cudaError_t err, const char *file, int line ) { if (err != cudaSuccess) { fprintf(stderr,"\n\x1B[31m@(CEO)>ERROR: %s in %s at line %d\x1B[0m\n", cudaGetErrorString( err ), file, line ); exit( EXIT_FAILURE ); } } #define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ )) static void HandleErrorCUFFT( cufftResult_t err, const char *msg, const char *file, int line ) { if (err != CUFFT_SUCCESS) { fprintf(stderr,"\n\x1B[31m@(CEO)>CUFFT ERROR: %s in %s at line %d\x1B[0m\n", msg, file, line ); exit( EXIT_FAILURE ); } } #define HANDLE_ERROR_CUFFT( err, msg ) (HandleErrorCUFFT( err, msg, __FILE__, __LINE__ )) static void HandleErrorCUSPARSE( cusparseStatus_t err, const char *msg, const char *file, int line ) { if (err != CUSPARSE_STATUS_SUCCESS) { fprintf(stderr,"\n\x1B[31m@(CEO)>CUSPARSE ERROR: %s in %s at line %d\x1B[0m\n", msg, file, line ); exit( EXIT_FAILURE ); } } #define HANDLE_ERROR_CUSPARSE( err, msg ) (HandleErrorCUSPARSE( err, msg, __FILE__, __LINE__ )) static void HandleErrorCURAND( curandStatus_t err, const char *msg, const char *file, int line ) { if (err != CURAND_STATUS_SUCCESS) { fprintf(stderr,"\n\x1B[31m@(CEO)>CURAND ERROR: %s in %s at line %d\x1B[0m\n", msg, file, line ); exit( EXIT_FAILURE ); } } #define HANDLE_ERROR_CUSOLVER( err ) (HandleErrorCUSOLVER( err, __FILE__, __LINE__ )) static void HandleErrorCUSOLVER( cusolverStatus_t err, const char *file, int line ) { if (err != CUSOLVER_STATUS_SUCCESS) { if (err==CUSOLVER_STATUS_NOT_INITIALIZED) fprintf(stderr,"\n\x1B[31m@(CEO)>CUSOLVER ERROR: %s in %s at line %d:.\x1B[0m\n", "the library was not initialized", file, line ); else if (err==CUSOLVER_STATUS_INVALID_VALUE) fprintf(stderr,"\n\x1B[31m@(CEO)>CUSOLVER ERROR: %s in %s at line %d\x1B[0m\n", "invalid parameters were passed (m,n<0 or ldaCUSOLVER ERROR: %s in %s at line %d\x1B[0m\n", "the device only supports compute capability 2.0 and above", file, line ); else fprintf(stderr,"\n\x1B[31m@(CEO)>CURAND ERROR: %s in %s at line %d\x1B[0m\n", "an internal operation failed.", file, line ); exit( EXIT_FAILURE ); } } #define HANDLE_ERROR_CURAND( err, msg ) (HandleErrorCURAND( err, msg, __FILE__, __LINE__ )) inline float sign(float x) { return (float) ( (x > 0) - (x < 0) ); } int round_up_to_nhp2(int n); void set_device(int id); int get_device_count(); #define ARCSEC(a) (a*PI/180/3600) typedef double rtd; struct vector{ rtd x; rtd y; rtd z; __host__ __device__ vector(rtd x=0.0, rtd y=0.0, rtd z=0.0) : x(x), y(y), z(z) { } __host__ __device__ rtd rho2(void); __host__ __device__ rtd rho2_shift(rtd x0, rtd y0); __host__ __device__ rtd mag(void); __host__ __device__ rtd mag(rtd R); __host__ __device__ rtd angle(void); __host__ __device__ rtd norm(void); __host__ __device__ rtd dot(vector *u); __host__ __device__ void unit(void); __host__ __device__ void left_cross(vector *w, vector *u); __host__ __device__ void right_cross(vector *w, vector *u); __host__ __device__ vector& operator=(const vector& v){ x = v.x; y = v.y; z = v.z; return *this; } __host__ __device__ vector& operator-=(const vector& rhs){ x -= rhs.x; y -= rhs.y; z -= rhs.z; return *this; } __host__ __device__ vector& operator+=(const vector& rhs){ x += rhs.x; y += rhs.y; z += rhs.z; return *this; } __host__ __device__ vector operator-(const vector& rhs) const { return vector(x-rhs.x,y-rhs.y,z-rhs.z); } __host__ __device__ vector operator+(const vector& rhs) const { return vector(x+rhs.x,y+rhs.y,z+rhs.z); } __host__ __device__ rtd operator*(const vector& rhs) const { return x*rhs.x + y*rhs.y + z*rhs.z; } __host__ __device__ vector operator*(const rtd& rhs) const { return vector(x*rhs,y*rhs,z*rhs); } }; __device__ inline int lenslet2array(int i_px_lenslet, int j_px_lenslet, int n_px_lenslet, int i_lenslet, int j_lenslet, int n_lenslet, int i_source) { int index; index = i_lenslet*n_px_lenslet + i_px_lenslet; index += i_source*n_px_lenslet*n_lenslet; index *= n_lenslet*n_px_lenslet; index += j_lenslet*n_px_lenslet + j_px_lenslet; return index; } __device__ inline int lenslet2array_overlap(int i_px_lenslet, int j_px_lenslet, int n_px_lenslet, int i_lenslet, int j_lenslet, int n_lenslet, int i_source) { int index; index = i_lenslet*(n_px_lenslet-1) + i_px_lenslet; index += i_source*((n_px_lenslet-1)*n_lenslet+1); index *= n_lenslet*(n_px_lenslet-1)+1; index += j_lenslet*(n_px_lenslet-1) + j_px_lenslet; return index; } __device__ inline int lenslet2row(int i_px_lenslet, int j_px_lenslet, int n_px_lenslet, int i_lenslet, int j_lenslet, int n_lenslet, int i_source) { int index; index = i_lenslet*n_lenslet + j_lenslet; index += i_source*n_lenslet*n_lenslet; index *= n_px_lenslet; index += i_px_lenslet; index *= n_px_lenslet; index += j_px_lenslet; return index; } __device__ inline char threads2lenslet(dim3 threadIdx, dim3 blockIdx, dim3 blockDim, int *i_px_lenslet, int *j_px_lenslet, int n_px_lenslet, int *i_lenslet, int *j_lenslet, int n_lenslet) { *i_px_lenslet = threadIdx.x + blockIdx.x * blockDim.x; *j_px_lenslet = threadIdx.y + blockIdx.y * blockDim.y; *i_lenslet = *i_px_lenslet/n_px_lenslet; if (*i_lenslet>=n_lenslet) return 0; *j_lenslet = *j_px_lenslet/n_px_lenslet; if (*j_lenslet>=n_lenslet) return 0; *i_px_lenslet -= *i_lenslet*n_px_lenslet; *j_px_lenslet -= *j_lenslet*n_px_lenslet; return 1; } #ifdef SILENT struct stopwatch{ float elapsedTime; void tic(void) { elapsedTime=0.0; } void toc(void) {} void toc(const char *message) {} void toc(float *elapsedTime, const char *message) {} void toc(float *elapsedTime) {} }; #else struct stopwatch{ float elapsedTime; cudaEvent_t start, stop; void tic(void) { HANDLE_ERROR( cudaEventCreate( &start ) ); HANDLE_ERROR( cudaEventCreate( &stop ) ); HANDLE_ERROR( cudaEventRecord( start, 0 ) ); } void toc(void) { HANDLE_ERROR( cudaEventRecord( stop, 0 ) ); HANDLE_ERROR( cudaEventSynchronize( stop ) ); HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime, start, stop ) ); printf("\x1B[33mElapsed time: %8.2E ms\x1B[0m\n", elapsedTime ); HANDLE_ERROR( cudaEventDestroy( start ) ); HANDLE_ERROR( cudaEventDestroy( stop ) ); } void toc(const char *message) { HANDLE_ERROR( cudaEventRecord( stop, 0 ) ); HANDLE_ERROR( cudaEventSynchronize( stop ) ); HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime, start, stop ) ); printf("\x1B[33m%s: Elapsed time: %8.2E ms\x1B[0m\n", message, elapsedTime ); HANDLE_ERROR( cudaEventDestroy( start ) ); HANDLE_ERROR( cudaEventDestroy( stop ) ); } void toc(float *elapsedTime, const char *message) { HANDLE_ERROR( cudaEventRecord( stop, 0 ) ); HANDLE_ERROR( cudaEventSynchronize( stop ) ); HANDLE_ERROR( cudaEventElapsedTime( elapsedTime, start, stop ) ); printf("\x1B[33m%s: Elapsed time: %8.2E ms\x1B[0m\n", message, *elapsedTime ); HANDLE_ERROR( cudaEventDestroy( start ) ); HANDLE_ERROR( cudaEventDestroy( stop ) ); } void toc(float *elapsedTime) { HANDLE_ERROR( cudaEventRecord( stop, 0 ) ); HANDLE_ERROR( cudaEventSynchronize( stop ) ); HANDLE_ERROR( cudaEventElapsedTime( elapsedTime, start, stop ) ); HANDLE_ERROR( cudaEventDestroy( start ) ); HANDLE_ERROR( cudaEventDestroy( stop ) ); } }; #endif void CUBLAS_ERROR(cublasStatus_t status); struct mask { char *m; float *f; int *idx; int size_px[2]; int nel; float nnz; float size_m[2]; float area; float delta[2]; cublasHandle_t handle; int *d__piston_mask; void setup(int n); void setup(int n, float L); void setup(int n, float L, int i_s, int j_s, int n_out); void setup(float n, float L, float i_0, float j_0, float theta, float i_s, float j_s, int n_out ); void setup_circular(int n); void setup_circular(int n, float D); void setup_circular(int n, float D, float scale); void setup_GMT(int n, float S); void set_filter(void); void set_filter_quiet(void); void set_index(void); void set_gmt_piston(float *phase, float *d__p); void alter(float *filter); void add(mask *other); void add(char *other, int other_nel); void add(mask *other, int offset); void add(char *other, int other_nel, int offset); void reset(void); void cleanup(void); }; struct stats { cublasHandle_t handle; cublasStatus_t status; void setup(void); void cleanup(void); float mean(const float *data, int n_data); float mean(const float *data, mask *M, int n_data); float var(const float *data, int n_data); float std(const float *data, int n_data); float diff_var(const float *data_1, const float *data_2, int n_data); float diff_std(const float *data_1, const float *data_2, int n_data); float var(const float *data1, mask *M, int n_data); float std(const float *data1, mask *M, int n_data); float diff_var(const float *data1, const float *data2, mask *M, int n_data); float diff_std(const float *data1, const float *data2, mask *M, int n_data); }; __device__ inline int lenset2actuator(int iL, int jL, int NL, int io, int jo) { int iL_p, jL_p, iL_pp, jL_pp, iA, jA; iL_p = 2*iL + 1; jL_p = 2*jL + 1; iL_pp = iL_p + io; jL_pp = jL_p + jo; iA = iL_pp/2; jA = jL_pp/2; return iA*(NL+1) + jA; } __global__ void fill_ones_char(char *ones, int n_data) ; __global__ void circular_pupil(char *pupil, int N, float scale); __global__ void square_pupil(char *pupil, int N, int N_S, int I_S, int J_S); __global__ void rot_square_pupil(char *pupil, int N, float N_S, float I_0, float J_0, float theta, float I_S, float J_S); __global__ void GMT_pupil(char *pupil, int *piston_mask , const int N, const float S, float Dsh, const float c, const char v); //__global__ void valid_lenslet(char *mask, int N_pupil, float threshold); __global__ void fried_geometry(char *lenslet_mask, char *actuator_mask, int NL, int n, float threshold); __global__ void fried_geometry_vs_pupil(char *lenslet_mask, char *actuator_mask, const int NL, const int n, const float threshold, const char *pupil); __global__ void wavefront_finite_difference_kernel(float *sx, float *sy, int NL, float *phi, int n, float d); __global__ void wavefront_finite_difference_kernel_masked( float *sx, float *sy, int NL, float *phi, int n, float d, char *M); __global__ void wavefront_finite_difference_sparse_matrix_kernel( float *csrValH, int *csrColIndH, int *csrRowPtrH, const int NL, const int n, const float d); __global__ void set_gmt_piston_kernel(float *phase, float *p, char *m, int *piston_mask, const int N); __global__ void double2float_kern(float *d__single_data, double *d__double_data, const int N); char fried_geometry_setup(char *lenslet_mask, char *actuator_mask, int NL, int n, float threshold); char fried_geometry_setup_vs_pupil(char *lenslet_mask, char *actuator_mask, int NL, int n, float threshold, char *pupil); void dev2file(const char* filename, float* d__data, int n_data); void dev2file(const char* filename, float* d__data, int n_data, int n_data_page, int n_page); void dev2file(const char* filename, float* d__data, int n_data, int *n_data_page, int n_page); void dev2file(const char* filename, int* d__data, int n_data); void dev2file(const char* filename, float2* d__data, int n_data); void dev2file(const char* filename, char* d__data, int n_data); double atmosphere_refractive_index(float wavelength, float altitude, float temperature, float humidity); double atmospheric_dispersion(float wavelength, float delta_wavelength, float zenith, float altitude, float temperature, float humidity); void dev2host( float *host_data, float *dev_data, int N); void dev2host_int( int *host_data, int *dev_data, int N); void host2dev( float *dev_data, float *host_data, int N); void host2dev_char( char *dev_data, char *host_data, int N); void freedev( float **dev_data ); //template struct gpu_float { float *dev_data, *host_data, *d_tau; int N, nb; stats S; cublasStatus_t stat; cublasHandle_t handle; cusolverDnHandle_t cusolverH; void setup(void); void setup(int N_T); void dev_malloc(void); void free_dev(void); void dev2host(void); void host2dev(void); void reset(void); void qr(int m); void dev2dev(gpu_float *other); void axpy(const gpu_float *x, const float alpha); void scale(const float alpha); void mv(gpu_float *y, const gpu_float *x); void qr_solve(gpu_float *x, gpu_float *b); }; struct gpu_double { double *dev_data, *host_data; int N, nb; stats S; cublasStatus_t stat; cublasHandle_t handle; cusolverDnHandle_t cusolverH; void setup(void); void setup(int N_T); void dev_malloc(void); void free_dev(void); void dev2host(void); void host2dev(void); void reset(void); void qr(int m); void mv(gpu_float *y, const gpu_float *x); }; void geqrf(float *d_tau, float *a, int m, int n); void ormqr(float *b, int m, float *q, float *tau, int n); #if CUDA_VERSION < 11000 struct sparseMatrix { int n_row, n_col; int nnz; float *csrValH; int *csrColIndH; int *csrRowPtrH; float alpha, beta; cudaError_t cudaStat; cusparseStatus_t status; cusparseHandle_t handle; cusparseMatDescr_t descr; void setup(const int _n_row_, const int _n_col_, const int _nnz_); void gradient( int NL, int N_LENSLET_PX, float d); void interpolation(int NI, int NP); void interpolation(int NI, int NP, mask *pupil,float i0, float j0); void cleanup(void); void MVM(float *y, float *x); void add(sparseMatrix *C, sparseMatrix *B); }; #endif void wavefront_finite_difference(float *sx, float *sy, int NL, float *phi, int n, float d, int N_GS); void wavefront_finite_difference(float *sx, float *sy, int NL, float *phi, int n, float d, mask *valid_lenset, int N_GS); struct cheb_series_struct { double * c; /* coefficients */ int order; /* order of expansion */ double a; /* lower interval point */ double b; /* upper interval point */ int order_sp; /* effective single precision order */ }; typedef struct cheb_series_struct cheb_series; __host__ __device__ void cheb_eval_e(const cheb_series * cs, const double x, double * result); __host__ __device__ void temme_gamma(const double nu, double * g_1pnu, double * g_1mnu, double * g1, double * g2); __host__ __device__ void K_scaled_temme(const double nu, const double x, double * K_nu, double * K_nup1, double * Kp_nu); __host__ __device__ void K_scaled_steed_temme_CF2(const double nu, const double x, double * K_nu, double * K_nup1, double * Kp_nu); __host__ __device__ double _K_nu_(const double nu, const double x); __host__ __device__ int polywind(double Px, double Py, double *Vx, double *Vy, int NV); void polywinds(int *W, double *Px, double *Py, int NP, double *Vx, double *Vy, int NV); #endif // __UTILITIES_H__