% -*- mode: Noweb; noweb-code-mode: c-mode -*-

@ [[BTBT]] contains a structure and the routines to implement a block matrix with Toeplitz--Block--Toeplitz blocks.
\index{BTBT}

\section{The files}

\subsection{Header}

<>=
#ifndef __BTBT_H__
#define __BTBT_H__

#ifndef __UTILITIES_H__
#include "utilities.h"
#endif

#ifndef __ATMOSPHERE_H__
#include "atmosphere.h"
#endif

//#define BTBT_DEBUG

struct BTBT {

  <>

  void setup(int n_x);
  void setup(int M_, int N_, int NT_, float *d__cov_);
  void setup(int M_, int N_, int MT_, int NT_, float *d__cov_);
  void cleanup(void);
  void info(void);
  void MVM(float *y, float *x);
  void MVM_i(float *y, float *x);
  void MVM_ii(float *y, float *x);
  void MVM_iii(float *y, float *x);
  void MVM_iv(float *y, float *x);
  void MVM_v(float *y, float *x);
};

#endif // __BTBT_H__

@
\subsection{Source}

<>=
#include "BTBT.h"

<>
<>
<>
<>
<>
<>
<>
<>
<>
<>
<>
<< MVM >>

@
\section{Parameters}
\label{sec:parameters}

\index{BTBT!BTBT}

A BTBT matrix is an $M\times N$ block matrix, with $M$ and $N$ the number of block rows and columns, respectively.
Each block is a matrix of type Toeplitz--Block--Toeplitz, meaning that each block contains $M_T\times N_T$ blocks of size $M_T\times N_T$.
Both matrix levels are Toeplitz.
Thanks to this peculiar structure, the matrix has a total of $MN(M_T+N_T-1)^2$ unique elements, compared to the $MNM_T^2N_T^2$ elements of the full matrix.
The matrix is entirely defined by $M$, $N$, $M_T$ and $N_T$, and by the $(M_T+N_T-1)^2\times MN$ matrix of unique elements.

The parameters of the [[BTBT]] structure are:
<>=
int M, N, MT, MT2, NT, NT2, NU, NU2, NDFT, HALF_NDFT, NU_TOTAL, NF, NF2, ind_size, cov_size;
char *mask;
float2 *d__cov, *d__b, *d__c;
float *d__alpha, *d__beta, n_full, n_comp, b_full, b_comp, cov_eval_et;
unsigned int *d__mu, *d__xi;
cufftHandle raster_plan, MVM_input_plan, MVM_output_plan;
#ifdef BTBT_DEBUG
float2 *cov;
unsigned int *mu, *xi;
#endif
/* cublasHandle_t handle; */
/* cuComplex **Aptr, **Bptr, **Cptr; */

@
\section{Functions}
\label{sec:functions}

\subsection{Setup \& Cleanup}
\label{sec:setup--cleanup}

A [[BTBT]] structure is initialized with the number of blocks $[[M]]\times[[N]]$, the size of each block $[[MT]]\times[[NT]]$ and the covariance.
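As an illustration, here is a minimal host-side usage sketch of the interface declared in the header above; it is not part of the tangled source, the sizes and device-array names are hypothetical, and CUDA error checking is omitted:
\begin{verbatim}
// Illustrative only: drives the BTBT API declared in BTBT.h.
// d_cov must hold the (MT+NT-1)^2 x (M*N) unique covariance elements
// on the device; x and y are device vectors of length N*NT*NT and M*MT*MT.
#include "BTBT.h"

void btbt_example(float *d_cov, float *x, float *y)
{
  BTBT C;
  C.setup(2, 2, 64, d_cov); // M=N=2 blocks, each a 64x64 Toeplitz-Block-Toeplitz block
  C.info();                 // print sizes and compression factors
  C.MVM(y, x);              // y = C x, computed with FFT-based convolution
  C.cleanup();              // release FFT plans and device memory
}
\end{verbatim}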
\index{BTBT!BTBT!setup}
<>=
void BTBT::setup(int M_, int N_, int MT_, int NT_, float *d__cov_)
{
  M  = M_;
  N  = N_;
  MT = MT_;
  MT2 = MT*MT;
  NT = NT_;
  NT2 = NT*NT;
  NU = MT+NT-1;
  NU2 = NU*NU;
  NDFT = round_up_to_nhp2(NU2);
  HALF_NDFT = NDFT/2 + 1;
  NU_TOTAL = NU2*M*N;
  mask = NULL;
  <>
  info();
  <>
  stopwatch tid;
  tid.tic();
  <>
  tid.toc(&cov_eval_et,"Covariance raster DFT");
}
@
For square blocks, [[BTBT]] is initialized with:
\index{BTBT!BTBT!setup}
<>=
void BTBT::setup(int M_, int N_, int NT_, float *d__cov_)
{
  M = M_;
  N = N_;
  MT = NT = NT_;
  MT2 = NT2 = NT*NT;
  NU = 2*NT-1;
  NU2 = NU*NU;
  NDFT = round_up_to_nhp2(NU2);
  HALF_NDFT = NDFT/2 + 1;
  NU_TOTAL = NU2*M*N;
  mask = NULL;
  <>
  info();
  <>
  stopwatch tid;
  tid.tic();
  <>
  tid.toc(&cov_eval_et,"Covariance raster DFT");
}
@
The memory is allocated with:
<>=
ind_size = sizeof(int);
cov_size = sizeof(float2)*NU2*N*M;
HANDLE_ERROR( cudaMalloc((void**)&d__mu, ind_size*NT*NT ) );
HANDLE_ERROR( cudaMalloc((void**)&d__xi, ind_size*MT*MT ) );
HANDLE_ERROR( cudaMalloc((void**)&d__b, N*sizeof(float2)*NDFT ) );
HANDLE_ERROR( cudaMemset(d__b, 0, N*sizeof(float2)*NDFT ) );
HANDLE_ERROR( cudaMalloc((void**)&d__c, M*sizeof(float2)*NDFT ) );
HANDLE_ERROR( cudaMalloc((void**)&d__cov, M*N*sizeof(float2)*NDFT ) );
HANDLE_ERROR( cudaMemset(d__cov, 0, M*N*sizeof(float2)*NDFT ) );
for (int k=0;k<(M*N);k++)
  HANDLE_ERROR( cudaMemcpy( (float *)(d__cov + k*HALF_NDFT), d__cov_ + k*NU2,
                            NU2*sizeof(float), cudaMemcpyDeviceToDevice) );
int BATCH = M*N;
INFO("\n@(CEO)>BTBT: Creating a 1D covariance FFT plan\n");
HANDLE_ERROR_CUFFT( cufftPlan1d(&raster_plan, NDFT, CUFFT_R2C, BATCH),
                    "Unable to create plan");
/*HANDLE_ERROR_CUFFT( cufftSetCompatibilityMode(raster_plan, CUFFT_COMPATIBILITY_FFTW_PADDING),
  "Unable to set compatibility mode to native");*/
INFO("\n@(CEO)>BTBT: Creating a 1D MVM input FFT plan\n");
BATCH = N;
HANDLE_ERROR_CUFFT( cufftPlan1d(&MVM_input_plan, NDFT, CUFFT_R2C, BATCH),
                    "Unable to create plan");
/*HANDLE_ERROR_CUFFT( cufftSetCompatibilityMode(MVM_input_plan, CUFFT_COMPATIBILITY_FFTW_PADDING),
  "Unable to set compatibility mode to native");*/
INFO("\n@(CEO)>BTBT: Creating a 1D MVM output FFT plan\n");
BATCH = M;
HANDLE_ERROR_CUFFT( cufftPlan1d(&MVM_output_plan, NDFT, CUFFT_C2R, BATCH),
                    "Unable to create plan");
/*HANDLE_ERROR_CUFFT( cufftSetCompatibilityMode(MVM_output_plan, CUFFT_COMPATIBILITY_FFTW_PADDING),
  "Unable to set compatibility mode to native");*/
//cublasCreate(&handle);

@
<>=
cudaMalloc((void**)&Aptr, M * sizeof(cuComplex *));
cudaMalloc((void**)&Bptr, M * sizeof(cuComplex *));
cudaMalloc((void**)&Cptr, M * sizeof(cuComplex *));
pointer_allocation LLL M,1 RRR (Aptr, Bptr, Cptr,
                                d__c, HALF_NDFT,
                                d__cov, HALF_NDFT*N,
                                d__b, N);
@
<>=
__global__ void pointer_allocation(cuComplex **Aptr, cuComplex **Bptr, cuComplex **Cptr,
                                   float2 *c, int n_c,
                                   float2 *cov, int n_cov,
                                   float2 *b, int n_b)
{
  int k;
  k = blockIdx.x;
  Aptr[k] = (cuComplex *) (c+k*n_c);
  Bptr[k] = (cuComplex *) (cov+k*n_cov);
  Cptr[k] = (cuComplex *) (b+k*n_b);
}
@
The multiplication $Cs$ is efficiently performed as follows:
\begin{enumerate}
\item Construct the polynomials $C(t)$ and $S(t)$ from the matrix $C$ and the vector $s$:
  \begin{equation}
    \label{eq:33}
    C(t) = \sum_{i_1=-(M_l-1)}^{N_l-1}\sum_{i_2=-(M_l-1)}^{N_l-1} c_{{i_1}{i_2}}t^{\lambda_{{i_1}{i_2}}},
  \end{equation}
  where $c_{{i_1}{i_2}}$ is an entry of $C$ and $\lambda_{{i_1}{i_2}}=(M_l+i_1-1)(M_l+N_l-1)+(M_l+i_2-1)$, and
  \begin{equation}
    \label{eq:34}
    S(t) = \sum_{j_1=1}^{N_l}\sum_{j_2=1}^{N_l} s_{{j_1}{j_2}}t^{\mu_{{j_1}{j_2}}},
  \end{equation}
  where
  $\mu_{{j_1}{j_2}}=(M_l+N_l)(N_l-1)-j_1(M_l+N_l-1)-j_2$.
\item Compute $P(t)=C(t)\times S(t)$ using the Discrete Fourier Transform.
\item The entry $b_{{j_1}{j_2}}$ of the vector $b$ is $b_{{j_1}{j_2}}=p_{\xi_{{j_1}{j_2}}}$, where
  $\xi_{{j_1}{j_2}}=(M_l+N_l)(M_l+N_l-1)-(j_1+1)(M_l+N_l-1)-(j_2+1).$
\end{enumerate}
with $(j_1,j_2)\in [0,\dots,[[N]]-1]$.

The ordering indices are computed next:
<>=
dim3 blockDim(16,16);
dim3 gridDim( 1+MT/16 , 1+MT/16 );
ordering LLL gridDim,blockDim RRR (d__mu, d__xi, MT, NT);
#ifdef BTBT_DEBUG
mu = (unsigned int *)malloc( ind_size );
HANDLE_ERROR( cudaMemcpy( mu, d__mu, ind_size, cudaMemcpyDeviceToHost) );
xi = (unsigned int *)malloc( ind_size );
HANDLE_ERROR( cudaMemcpy( xi, d__xi, ind_size, cudaMemcpyDeviceToHost) );
INFO("\n|( i) mu xi|\n");
for (int i=0;i>=
__global__ void ordering(unsigned int *mu, unsigned int *xi, int m, int n)
{
  int j1, j2, k;
  j1 = blockIdx.x * blockDim.x + threadIdx.x;
  j2 = blockIdx.y * blockDim.y + threadIdx.y;
  if ( (j1>=
<>
#ifdef BTBT_DEBUG
cov = (float2*)malloc(cov_size);
HANDLE_ERROR( cudaMemcpy( cov, d__cov, cov_size, cudaMemcpyDeviceToHost) );
INFO("\n@(CEO)>BTBT: raster covariance\n");
for (int k=0;k>=
HANDLE_ERROR_CUFFT( cufftExecR2C(raster_plan, (cufftReal*)d__cov, d__cov),
                    "Unable to execute plan!");
//HANDLE_ERROR( cudaThreadSynchronize() );
@
Memory is freed with:
\index{BTBT!BTBT!cleanup}
<>=
void BTBT::cleanup(void)
{
  INFO("@(CEO)>BTBT: freeing memory!\n");
  cufftDestroy(raster_plan);
  cufftDestroy(MVM_input_plan);
  cufftDestroy(MVM_output_plan);
  HANDLE_ERROR( cudaFree( d__mu ) );
  HANDLE_ERROR( cudaFree( d__xi ) );
  HANDLE_ERROR( cudaFree( d__b ) );
  HANDLE_ERROR( cudaFree( d__c ) );
  HANDLE_ERROR( cudaFree( d__cov ) );
#ifdef BTBT_DEBUG
  free(mu);
  free(xi);
#endif
  /* cublasDestroy(handle); */
}
@
\subsection{Input/Output}
\label{sec:inputoutput}

The main parameters of [[BTBT]] are displayed with the [[info]] routine:
\index{BTBT!BTBT!info}
<>=
void BTBT::info(void)
{
  INFO("\n\x1B[1;42m@(CEO)>BTBT:\x1B[;42m\n");
  <>
  INFO("----------------------------------------------------\x1B[0m\n");
}
<>=
INFO(" . number of outermost blocks : %dX%d\n",M,N);
INFO(" . size of outermost blocks : %dX%d\n",MT,NT);
INFO(" . size of innermost blocks : %dX%d\n",MT,NT);
n_full = powf(NT,4)*M*N;
INFO(" . DFT length : %d\n",NDFT);
INFO(" . full matrix elements # : %.3E\n",n_full);
n_comp = M*N*NU2;
INFO(" . compressed matrix elements # : %.3E\n",n_comp);
INFO(" . compression factor : %4.0f \n",n_full/n_comp);
float mb = powf(2,20);
b_full = n_full*sizeof(float);
if (b_full>mb) {
  INFO(" . full matrix storage [MB] : %6.1f\n",b_full/mb);
} else {
  INFO(" . full matrix storage [KB] : %6.1f\n",b_full/1024.0);
}
b_comp = n_comp*sizeof(float2);
if (b_comp>mb) {
  INFO(" . compressed matrix storage [MB]: %6.1f\n",b_comp/mb);
} else {
  INFO(" . compressed matrix storage [KB]: %6.1f\n",b_comp/1024.0);
}
INFO(" . compression factor : %4.0f \n",b_full/b_comp);
@
\subsection{Matrix--to--vector product}
\label{sec:matr-vect-prod}

The MVM routine computes the matrix--to--vector multiplication $y=C_{\vec\alpha\cdot\vec\beta}s$.
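Before the CUDA implementation, the following self-contained host-side sketch illustrates the polynomial-product idea for a single Toeplitz level; it is illustrative only, the identifiers are not those of the tangled source, and the direct $O(n^2)$ product stands in for the DFT that the GPU code uses on two Toeplitz levels:
\begin{verbatim}
#include <stdio.h>
#include <stdlib.h>

/* y = T x for an n x n Toeplitz matrix T with T[i][j] = t[i - j + n - 1];
 * the 2n-1 unique elements are stored in t[0 .. 2n-2].                   */
static void toeplitz_mvm(float *y, const float *t, const float *x, int n)
{
  int np = 3*n - 2;                        /* length of the product polynomial */
  float *p = (float *)calloc(np, sizeof(float));
  /* direct polynomial product; the GPU code performs it with a DFT */
  for (int k = 0; k < 2*n - 1; k++)
    for (int j = 0; j < n; j++)
      p[k + j] += t[k]*x[j];
  /* keep only the useful coefficients: y[i] = p[i + n - 1] */
  for (int i = 0; i < n; i++)
    y[i] = p[i + n - 1];
  free(p);
}

int main(void)
{
  float t[5] = {1, 2, 3, 4, 5};            /* 3x3 Toeplitz: rows (3 2 1), (4 3 2), (5 4 3) */
  float x[3] = {1, 1, 1};
  float y[3];
  toeplitz_mvm(y, t, x, 3);
  printf("%g %g %g\n", y[0], y[1], y[2]);  /* prints: 6 9 12 */
  return 0;
}
\end{verbatim}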
\index{BTBT!BTBT!MVM}
<< MVM >>=
void BTBT::MVM(float *y, float *x)
{
  /* stopwatch tid; */
  /* tid.tic(); */
  <>
  /* tid.toc("STEP 1"); */
  /* tid.tic(); */
  <>
  /* tid.toc("STEP 2"); */
  /* tid.tic(); */
  <>
  /* tid.toc("STEP 3"); */
  /* HANDLE_ERROR( cudaThreadSynchronize() ); */
  /* float2 *b; */
  /* b = (float2*)malloc(M*sizeof(float2)*NDFT); */
  /* HANDLE_ERROR( cudaMemcpy( b, d__c, M*sizeof(float2)*NDFT, cudaMemcpyDeviceToHost) ); */
  /* for (int i=0;i<(10);i++) { */
  /*   fprintf(stdout,"|(%3d) [%+6.4E;%+6.4E]\n",i,b[i].x,b[i].y); */
  /* } */
  /* free(b); */
  /* tid.tic(); */
  <>
  /* float2 *c; */
  /* c = (float2*)malloc(M*sizeof(float2)*NDFT); */
  /* HANDLE_ERROR( cudaMemcpy( c, d__c, M*sizeof(float2)*NDFT, cudaMemcpyDeviceToHost) ); */
  /* for (int i=0;i<(NDFT);i++) { */
  /*   printf("|(%3d) [%+6.4E;%+6.4E]\n",i,c[i].x,c[i].y); */
  /* } */
  /* free(c); */
  /* tid.toc("STEP 4"); */
  /* tid.tic(); */
  if (mask==NULL) {
    <>
  } else {
    <>
  }
  /* tid.toc("STEP 5"); */
}
void BTBT::MVM_i(float *y, float *x)
{
  <>
  float *b;
  b = (float*)malloc(N*sizeof(float2)*NDFT);
  HANDLE_ERROR( cudaMemcpy( b, (float *)d__b, N*sizeof(float2)*NDFT, cudaMemcpyDeviceToHost) );
  /*
  for (int i=0;i<(N*NDFT);i++) {
    fprintf(stdout,"|(%3d) [%+6.4E;%+6.4E]\n",i,b[i].x,b[i].y);
  }
  */
  float s;
  for (int i=0;i>
}
void BTBT::MVM_iii(float *y, float *x)
{
  dim3 blockDim(256,1);
  dim3 gridDim( 1+NT2/256,N);
  <>
}
void BTBT::MVM_iv(float *y, float *x)
{
  dim3 blockDim(256,1);
  dim3 gridDim( 1+NT2/256,N);
  <>
}
void BTBT::MVM_v(float *y, float *x)
{
  dim3 blockDim(256,1);
  dim3 gridDim( 1+NT2/256,N);
  if (mask==NULL) {
    <>
  } else {
    <>
  }
}
@
The vector $s$ is made of $N$ components of length $[[NT]]^2$:
$$
s=
\left[
  \begin{array}{c}
    s_x \\
    s_y
  \end{array}
\right]
$$
and let us define another complex vector $b$, also made of two components but of length [[NU2]]:
$$
b=
\left[
  \begin{array}{c}
    b_x \\
    b_y
  \end{array}
\right]
$$
The matrix--to--vector multiplication $y=C_{\vec\alpha\cdot\vec\beta}s$ is derived through the following steps:
\begin{enumerate}
\item input allocation and ordering:
  \begin{itemize}
  \item the $s_x$ component of $s$ is allocated into the real part of the complex vector $b_x$ according to the ordering in vector $\mu$, i.e. $b_x[\mu].x=s_x$,
  \item the $s_y$ component of $s$ is allocated into the real part of the complex vector $b_y$ according to the ordering in vector $\mu$, i.e. $b_y[\mu].x=s_y$,
  \end{itemize}
<>=
dim3 blockDim(256,1);
dim3 gridDim( 1+NT2/256,N);
mvm_input_order LLL gridDim,blockDim RRR ( d__b, HALF_NDFT, x, NT2, d__mu);
@
using the kernel:
<>=
__global__ void mvm_input_order(float2 *x_out, int n_x_out, float *x_in, int n_x_in, unsigned int *ind)
{
  int k, n;
  float *real_x_out;
  k = blockIdx.x * blockDim.x + threadIdx.x;
  n = blockIdx.y;
  if (k>=
HANDLE_ERROR_CUFFT( cufftExecR2C(MVM_input_plan, (cufftReal*)d__b, d__b),
                    "Unable to execute plan forward FT with MVM plan");
//HANDLE_ERROR( cudaThreadSynchronize() );
@
\item $\tilde b$ and $\tilde T$ are multiplied element-wise, i.e.
$$
\tilde c = \tilde b\tilde T =
\left[
  \begin{array}{c}
    \tilde b_x.x\tilde T_{xx}.x - \tilde b_x.y\tilde T_{xx}.y + \tilde b_y.x\tilde T_{xy}.x - \tilde b_y.y\tilde T_{xy}.y \\
    \tilde b_x.x\tilde T_{xx}.y + \tilde b_x.y\tilde T_{xx}.x + \tilde b_y.x\tilde T_{xy}.y + \tilde b_y.y\tilde T_{xy}.x
  \end{array}
\right]
$$
<>=
blockDim = dim3(256,1);
gridDim = dim3(ceilf(HALF_NDFT/256.0),1);
cpx_mult LLL gridDim,blockDim RRR (d__c, d__cov, d__b, HALF_NDFT, M, N);
@
using the kernel:
<>=
__global__ void cpx_mult(float2 *c, float2 *x1, float2 *x2, int n_x, int m_in, int n_in)
{
  int k, l, i, j, p, q;
  k = blockIdx.x * blockDim.x + threadIdx.x;
  if (k>=
cuComplex alpha, beta;
alpha.x = 1.0;
alpha.y = 0.0;
beta.x  = 1.0;
beta.y  = 0.0;
cublasCgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                   HALF_NDFT, 1, N,
                   &alpha, (const cuComplex **)Aptr, HALF_NDFT,
                   (const cuComplex **)Bptr, N,
                   &beta, (cuComplex **)Cptr, HALF_NDFT, M);
@
\item the inverse Fourier transform of $\tilde c$ is computed, i.e. $c=\mathcal F^{-1} [\tilde c]$,
<>=
HANDLE_ERROR_CUFFT( cufftExecC2R(MVM_output_plan, d__c, (cufftReal*)d__c),
                    "Unable to execute inverse FT with MVM plan");
//HANDLE_ERROR( cudaThreadSynchronize() );
@
\item the real part of $c$ is assigned to the vector $y$ according to the ordering in vector $\xi$, i.e. $y=c[\xi].x$.
<>=
blockDim = dim3(256,1);
gridDim = dim3( 1+MT2/256,M);
mvm_output_order LLL gridDim,blockDim RRR (y, MT2, d__c, HALF_NDFT, d__xi);
@
using the kernel:
<>=
__global__ void mvm_output_order(float *x_out, int n_x_out, float2 *x_in, int n_x_in, unsigned int *ind)
{
  int k, m, ndft;
  float *real_x_in;
  k = blockIdx.x * blockDim.x + threadIdx.x;
  m = blockIdx.y;
  if (k>=
blockDim = dim3(256,1);
gridDim = dim3( 1+MT2/256,M);
mvm_output_order_mask LLL gridDim,blockDim RRR (y, MT2, d__c, HALF_NDFT, d__xi, mask);
@
using the kernel:
<>=
__global__ void mvm_output_order_mask(float *x_out, int n_x_out, float2 *x_in, int n_x_in, unsigned int *ind, char *mask)
{
  int k, m, ndft;
  float *real_x_in;
  k = blockIdx.x * blockDim.x + threadIdx.x;
  m = blockIdx.y;
  if (k0) ? real_x_in[ind[k]]/ndft : 0;
  }
}
@
\end{enumerate}

New matrix--to--vector multiplication routines are defined for test purposes.
The next routine implements the MVM for an identity matrix:
<< MVM (Test 1)>>=
void BTBT::MVM(float *y, float *x)
{
  printf("@(CEO)>BTBT: MVM (Test 1)\n");
  cublasHandle_t handle;
  cublasCreate(&handle);
  cublasScopy(handle, 10, x, 1, y, 1);
  cublasDestroy(handle);
}
<< MVM (Test 2)>>=
void BTBT::MVM(float *y, float *x)
{
  printf("@(CEO)>BTBT: MVM (Test 2)\n");
  cublasHandle_t handle;
  cublasCreate(&handle);
  for (int k=0;k>=
void BTBT::setup(int n_x)
{
  N = n_x;
}
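As a closing remark, the two rows written out for $\tilde c$ in the element-wise product above are simply the real and imaginary parts of the complex sum $\tilde b_x\tilde T_{xx}+\tilde b_y\tilde T_{xy}$. The following standalone host snippet (illustrative only, not part of the tangled source; the numerical values are arbitrary) checks that expansion with [[cuComplex]] arithmetic:
\begin{verbatim}
// Illustrative only: verifies that the expanded real/imaginary expressions
// equal the complex product/sum bx*Txx + by*Txy computed with cuComplex.
#include <stdio.h>
#include <cuComplex.h>

int main(void)
{
  cuComplex bx  = make_cuComplex( 1.0f,  2.0f);  /* arbitrary test values */
  cuComplex by  = make_cuComplex(-0.5f,  3.0f);
  cuComplex Txx = make_cuComplex( 0.25f,-1.0f);
  cuComplex Txy = make_cuComplex( 2.0f,  0.5f);

  cuComplex c = cuCaddf(cuCmulf(bx, Txx), cuCmulf(by, Txy));

  float cx = bx.x*Txx.x - bx.y*Txx.y + by.x*Txy.x - by.y*Txy.y; /* real part      */
  float cy = bx.x*Txx.y + bx.y*Txx.x + by.x*Txy.y + by.y*Txy.x; /* imaginary part */

  printf("cuComplex: %+g %+g\n", cuCrealf(c), cuCimagf(c));
  printf("expanded : %+g %+g\n", cx, cy);
  return 0;
}
\end{verbatim}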