% -*- mode: Noweb; noweb-code-mode: c-mode -*- @ \index{centroiding} \section{The files} \subsection{Header} <>= #ifndef __CENTROIDING_H__ #define __CENTROIDING_H__ #ifndef __UTILITIES_H__ #include "utilities.h" #endif #ifndef __IMAGING_H__ #include "imaging.h" #endif struct centroiding { <> void setup(int _N_SIDE_LENSLET, int _N_SOURCE); void cleanup(void); void get_data(imaging *wfs); void get_data(float *frame, int N); void get_data(float *frame, int N, float cx0, float cy0, float units); void get_data(float *frame, int N, float *cx0, float *cy0, float units, char *valid_lenset); void show_centroids(char *filename); void show_flux(char *filename); void fried_geometry(mask *dm, mask *pupil, int n, float threshold); void filter(float *F); void noisify(float stddev); void reset(void); }; #endif // __CENTROIDING_H__ @ \subsection{Source} <>= #include "centroiding.h" <> <> <> <> <> <> <> <> <> <> <> <> <> <> @ \section{Parameters} \label{sec:prms} \index{centroiding!centroiding} The parameters are the centroid vector [[d__c]] with $2\times[[n_data]]$ elements. The first half contains the X axis centroids [[d__cx]] and the other half contains the Y axis centroids [[d__cy]]. The lenslet intensity is stored in the vector [[d__mass]] of length [[n_data]]. <>= int _N_SIDE_LENSLET_, N_LENSLET, N_SOURCE; float *d__c, *d__cx, *d__cy, *d__mass; char *lenslet_mask, MASK_SET; int n_data, DEV_SHARED_MEM, DEV_MAX_THREADS; cublasHandle_t handle; cublasStatus_t status; @ \section{Functions} \label{sec:functions} \subsection{Setup \& Cleanup} \label{sec:setup--cleanup} Parameters are allocated with the [[setup]] routines \index{centroiding!centroiding!setup} <>= void centroiding::setup(int _N_SIDE_LENSLET, int _N_SOURCE) { _N_SIDE_LENSLET_ = _N_SIDE_LENSLET; N_LENSLET = _N_SIDE_LENSLET_*_N_SIDE_LENSLET_; N_SOURCE = _N_SOURCE; HANDLE_ERROR( cudaMalloc( (void**)&d__c , sizeof(float)*N_LENSLET*N_SOURCE*2 ) ); HANDLE_ERROR( cudaMemset( d__c, 0, sizeof(float)*N_LENSLET*N_SOURCE*2 ) ); d__cx = d__c; d__cy = d__c + N_LENSLET; HANDLE_ERROR( cudaMalloc( (void**)&d__mass , sizeof(float) *N_LENSLET*N_SOURCE ) ); HANDLE_ERROR( cudaMalloc( (void**)&lenslet_mask , sizeof(char) *N_LENSLET ) ); MASK_SET = 0; <> } @ and de--allocated with the [[cleanup]] routine \index{centroiding!centroiding!cleanup} <>= void centroiding::cleanup(void) { INFO("@(CEO)>centroiding: freeing memory!\n"); HANDLE_ERROR( cudaFree( d__c ) ); HANDLE_ERROR( cudaFree( d__mass ) ); if (lenslet_mask) { HANDLE_ERROR( cudaFree( lenslet_mask ) ); } } @ The centroiding device kernel is allocating shared memory. One needs to check that the required shared memory does not exceed the maximum shared memory per block. <>= cudaDeviceProp dev_prop; int dev_id; HANDLE_ERROR( cudaGetDevice( &dev_id ) ); HANDLE_ERROR( cudaGetDeviceProperties( &dev_prop, dev_id ) ); DEV_SHARED_MEM = dev_prop.sharedMemPerBlock; DEV_MAX_THREADS = dev_prop.maxThreadsPerBlock; //printf("Maximum shared memory: %d\n",DEV_SHARED_MEM); //printf("Maximum threads per block: %d\n",DEV_MAX_THREADS); @ The valid lenslets mask is computed for a given pupil mask sampled with $N=[[N_SIDE_LENSLET]]\times n$ pixel across with $n$ the number of pixel per lenslet: \index{centroiding!centroiding!fried\_geometry} <>= void centroiding::fried_geometry(mask *dm, mask *pupil, int n, float threshold) { MASK_SET = fried_geometry_setup_vs_pupil(lenslet_mask, dm->m, _N_SIDE_LENSLET_, n, threshold, pupil->m); printf("Setting filter!\n"); dm->set_filter(); dm->area = pupil->area; } @ The centroids are reset to 0 with \index{centroiding!centroiding!reset} <>= void centroiding::reset(void) { HANDLE_ERROR( cudaMemset( d__c, 0, sizeof(float)*N_LENSLET*N_SOURCE*2 ) ); } @ \subsection{Data processing} \label{sec:data-processing} The framelets are processed to retrieve centroids and flux \index{centroiding!centroiding!get\_data} <>= void centroiding::get_data(imaging *wfs) { int N_PX_CAMERA, N_PX_CAMERA_BYTE; float cx0, cy0, units, *frame; cx0 = cy0 = 0.0; units = 1.0;\ frame = wfs->d__frame; N_PX_CAMERA = wfs->N_PX_CAMERA; N_PX_CAMERA_BYTE = sizeof(float)*N_PX_CAMERA*N_PX_CAMERA*3; <> } @ \index{centroiding!centroiding!get\_data} <>= void centroiding::get_data(float *frame, int N_PX_CAMERA) { int N_PX_CAMERA_BYTE = sizeof(float)*N_PX_CAMERA*N_PX_CAMERA*3; float cx0, cy0, units; cx0 = cy0 = 0.0; units = 1.0; <> } @ \index{centroiding!centroiding!get\_data} <>= void centroiding::get_data(float *frame, int N_PX_CAMERA, float cx0, float cy0, float units) { int N_PX_CAMERA_BYTE = sizeof(float)*N_PX_CAMERA*N_PX_CAMERA*3; <> } @ \index{centroiding!centroiding!get\_data} <>= void centroiding::get_data(float *frame, int N_PX_CAMERA, float *cx0, float *cy0, float units, char *valid_lenslet) { int N_PX_CAMERA_BYTE = sizeof(float)*N_PX_CAMERA*N_PX_CAMERA*3; <> } @ <>= if (N_PX_CAMERA>DEV_MAX_THREADS) { fprintf(stdout,"\n\x1B[31m@(CEO)>centroiding: The required number thread (%d) is larger\n that the maximum number of threads per block (%d)\x1B[0m\n",N_PX_CAMERA,DEV_MAX_THREADS); exit( EXIT_FAILURE ); } //printf("Required shared memory: %d\n",N_PX_CAMERA_BYTE); dim3 blockGrid(_N_SIDE_LENSLET_,_N_SIDE_LENSLET_,N_SOURCE); dim3 threadGrid(N_PX_CAMERA,1); if (N_PX_CAMERA_BYTE>DEV_SHARED_MEM) { // fprintf(stdout,"\n\x1B[31m@(CEO)>centroiding: The required shared memory (%dbytes) is larger\n that the maximum shared memory per block (%dbytes)\x1B[0m\n",N_PX_CAMERA_BYTE,DEV_SHARED_MEM); float *shared; HANDLE_ERROR( cudaMalloc( (void**)&shared , N_PX_CAMERA_BYTE ) ); centroidingEngineSh LLL blockGrid , threadGrid RRR (d__cx, d__cy, d__mass, frame, cx0, cy0, units, _N_SIDE_LENSLET_, N_SOURCE, N_PX_CAMERA, shared); HANDLE_ERROR( cudaFree( shared )); } else { centroidingEngine LLL blockGrid , threadGrid , N_PX_CAMERA_BYTE RRR (d__cx, d__cy, d__mass, frame, cx0, cy0, units, _N_SIDE_LENSLET_, N_SOURCE, N_PX_CAMERA); } @ <>= if (N_PX_CAMERA>DEV_MAX_THREADS) { fprintf(stdout,"\n\x1B[31m@(CEO)>centroiding: The required number thread (%d) is larger\n that the maximum number of threads per block (%d)\x1B[0m\n",N_PX_CAMERA,DEV_MAX_THREADS); exit( EXIT_FAILURE ); } //printf("Required shared memory: %d\n",N_PX_CAMERA_BYTE); dim3 blockGrid(_N_SIDE_LENSLET_,_N_SIDE_LENSLET_,N_SOURCE); dim3 threadGrid(N_PX_CAMERA,1); if (N_PX_CAMERA_BYTE>DEV_SHARED_MEM) { // fprintf(stdout,"\n\x1B[31m@(CEO)>centroiding: The required shared memory (%dbytes) is larger\n that the maximum shared memory per block (%dbytes)\x1B[0m\n",N_PX_CAMERA_BYTE,DEV_SHARED_MEM); float *shared; HANDLE_ERROR( cudaMalloc( (void**)&shared , N_PX_CAMERA_BYTE ) ); centroidingEngineGTSh LLL blockGrid , threadGrid RRR (d__cx, d__cy, d__mass, frame, cx0, cy0, units, _N_SIDE_LENSLET_, N_SOURCE, N_PX_CAMERA, valid_lenslet, shared); HANDLE_ERROR( cudaFree( shared )); } else { centroidingEngineGT LLL blockGrid , threadGrid , N_PX_CAMERA_BYTE RRR (d__cx, d__cy, d__mass, frame, cx0, cy0, units, _N_SIDE_LENSLET_, N_SOURCE, N_PX_CAMERA, valid_lenslet); } @ The framelets are processed with the kernel: <>= __global__ void centroidingEngine(float *cx, float *cy, float *flux, const float *frame, float cx0, float cy0, float units, int _N_SIDE_LENSLET_, int N_SOURCE, int N_PX_CAMERA) { <> extern __shared__ float shared[]; <> // CENTROIDING <> __syncthreads(); if (u<1) { <> if (flux[kp]>0) // NORMALIZATION { cx[k] /= flux[kp]; cy[k] /= flux[kp]; } else { cx[k] = cy[k] = 0.5*(N_PX_CAMERA-1); } cx[k] -= cx0; cx[k] *= units; cy[k] -= cy0; cy[k] *= units; } } <>= __global__ void centroidingEngineSh(float *cx, float *cy, float *flux, const float *frame, float cx0, float cy0, float units, int _N_SIDE_LENSLET_, int N_SOURCE, int N_PX_CAMERA, float *shared) { <> <> // CENTROIDING <> __syncthreads(); if (u<1) { <> if (flux[kp]>0) // NORMALIZATION { cx[k] /= flux[kp]; cy[k] /= flux[kp]; } else { cx[k] = cy[k] = 0.5*(N_PX_CAMERA-1); } cx[k] -= cx0; cx[k] *= units; cy[k] -= cy0; cy[k] *= units; } } <>= __global__ void centroidingEngineGT(float *cx, float *cy, float *flux, const float *frame, float *cx0, float *cy0, float units, int _N_SIDE_LENSLET_, int N_SOURCE, int N_PX_CAMERA, char *valid_lenslet) { <> int kLenslet; kLenslet = (iLenslet + iSource*_N_SIDE_LENSLET_)*_N_SIDE_LENSLET_ + jLenslet; if (valid_lenslet[kLenslet]) { extern __shared__ float shared[]; <> // CENTROIDING <> __syncthreads(); if (u<1) { <> if (flux[kp]>0) // NORMALIZATION { cx[k] /= flux[kp]; cy[k] /= flux[kp]; } else { cx[k] = cy[k] = 0.5*(N_PX_CAMERA-1); } cx[k] -= cx0[k]; cx[k] *= units; cy[k] -= cy0[k]; cy[k] *= units; } } } <>= __global__ void centroidingEngineGTSh(float *cx, float *cy, float *flux, const float *frame, float *cx0, float *cy0, float units, int _N_SIDE_LENSLET_, int N_SOURCE, int N_PX_CAMERA, char *valid_lenslet, float *shared) { <> int kLenslet; kLenslet = (iLenslet + iSource*_N_SIDE_LENSLET_)*_N_SIDE_LENSLET_ + jLenslet; if (valid_lenslet[kLenslet]) { <> // CENTROIDING <> __syncthreads(); if (u<1) { <> if (flux[kp]>0) // NORMALIZATION { cx[k] /= flux[kp]; cy[k] /= flux[kp]; } else { cx[k] = cy[k] = 0.5*(N_PX_CAMERA-1); } cx[k] -= cx0[k]; cx[k] *= units; cy[k] -= cy0[k]; cy[k] *= units; } } } @ The centroiding engine is performing the following tasks: \begin{enumerate} \item variables definition and allocation <>= int u, k, kp, i, ij, ij_inc, iLenslet,jLenslet, N_PX_CAMERA2, iSource, kSource; float *buffer0, *buffer1, *buffer2; u = threadIdx.x; iLenslet = blockIdx.x; jLenslet = blockIdx.y; iSource = blockIdx.z; @ \item shared memory allocation <>= N_PX_CAMERA2 = N_PX_CAMERA*N_PX_CAMERA; buffer0 = shared; buffer1 = shared + N_PX_CAMERA2; buffer2 = shared + N_PX_CAMERA2*2; @ \item summing pixels along rows <>= if (u>= k = kp = blockIdx.x*_N_SIDE_LENSLET_ + blockIdx.y; kSource = iSource*_N_SIDE_LENSLET_*_N_SIDE_LENSLET_; k += 2*kSource; kp += kSource; flux[kp] = cx[k] = cy[k] = 0.0; for (i=0;i>= void centroiding::filter(float *F) { int k, n_byte, N; float alpha, beta; float *c; n_byte = sizeof(float)*N_LENSLET*N_SOURCE*2; HANDLE_ERROR( cudaMalloc( (void**)&c , n_byte ) ); HANDLE_ERROR( cudaMemcpy( c, d__c, n_byte, cudaMemcpyDeviceToDevice) ); cublasCreate(&handle); alpha = 1; beta = 0; N = N_LENSLET*2; for (k=0;k>= void centroiding::noisify(float stddev) { cublasHandle_t handle; curandGenerator_t gen; float *d__n, alpha; int N; alpha = 1.0; N = N_LENSLET*N_SOURCE*2; HANDLE_ERROR( cudaMalloc( (void**)&d__n , sizeof(float)*N ) ); cublasCreate(&handle); HANDLE_ERROR_CURAND(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT), "Failed to create pseudo-random number generator!"); HANDLE_ERROR_CURAND(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL), "Failed to set seed!"); HANDLE_ERROR_CURAND(curandGenerateNormal(gen, d__n, N, 0, stddev), "Failed to generate variates on device!"); CUBLAS_ERROR( cublasSaxpy(handle, N, &alpha, d__n, 1, d__c, 1) ); HANDLE_ERROR_CURAND(curandDestroyGenerator(gen), "Failed to the cleanup!"); cublasDestroy(handle); HANDLE_ERROR( cudaFree( d__n ) ); } @ \subsection{Input/Output} \label{sec:inputoutput} The centroids and flux are displayed with plot.ly. <>= void centroiding::show_centroids(char *filename) { int nbyte; float *data, alpha; plotly_properties prop; stats S; S.setup(); alpha = RADIAN2ARCSEC; nbyte = N_LENSLET*N_SOURCE*2; CUBLAS_ERROR( cublasSscal(S.handle, nbyte, &alpha, d__c, 1) ); S.cleanup(); nbyte *= sizeof(float); HANDLE_ERROR( cudaHostAlloc( (void**)&data, nbyte, cudaHostAllocDefault ) ); HANDLE_ERROR( cudaMemcpy( data, d__c,nbyte, cudaMemcpyDeviceToHost ) ); prop.set("ztitle","[arcsec]"); prop.set("filename",filename); prop.aspect_ratio = 2*N_SOURCE; prop.set("zdata",data, _N_SIDE_LENSLET_, _N_SIDE_LENSLET_*N_SOURCE*2); imagesc(&prop); HANDLE_ERROR( cudaFreeHost( data ) ); } @ <>= void centroiding::show_flux(char *filename) { int nbyte; float *data; plotly_properties prop; nbyte = sizeof(float) *N_LENSLET*N_SOURCE; HANDLE_ERROR( cudaHostAlloc( (void**)&data, nbyte, cudaHostAllocDefault ) ); HANDLE_ERROR( cudaMemcpy( data, d__mass,nbyte, cudaMemcpyDeviceToHost ) ); prop.set("ztitle","[au]"); prop.set("filename",filename); prop.aspect_ratio = N_SOURCE; prop.set("zdata",data, _N_SIDE_LENSLET_, _N_SIDE_LENSLET_*N_SOURCE); imagesc(&prop); HANDLE_ERROR( cudaFreeHost( data ) ); }