% -*- mode: Noweb; noweb-code-mode: c-mode -*- @ [[GBTBT]] contains a structure and the routines to implement a block matrix with Toeplitz--Block--Toeplitz blocks. \index{GBTBT} \section{The files} \subsection{Header} <>= #ifndef __GBTBT_H__ #define __GBTBT_H__ #ifndef __UTILITIES_H__ #include "utilities.h" #endif #ifndef __ATMOSPHERE_H__ #include "atmosphere.h" #endif //#define GBTBT_DEBUG struct GBTBT { <> void setup(int n_x); void setup(int M_, int N_, int NT_, const float *d__cov_); void setup(int M_, int N_, int MT_, int NT_, const float *d__cov_); void setup(int M_, int N_, int *MT_, int NT_, const float *d__cov_); void cleanup(void); void info(void); void MVM(float *y, float *x); }; #endif // __GBTBT_H__ @ \subsection{Source} <>= #include "GBTBT.h" <> <> <> <> <> <> <> <> <> <> << MVM >> @ A GBTBT matrix is a $M\times N$ block matrix with $M$ and $N$ the number of block rows and columns respectively. Each block is a matrix of type Toeplitz--Block--Toeplitz meaning that each block contains $M_{T,i}\times N_T$ blocks of size $M_T\times N_T$ with $i \in [0,M-1]$. Both matrix level are Toeplitz. Thanks to the peculiar structure of the matrix, there is a total of $N\sum_{i=0}^{M-1}(M_{T,i}+N_T-1)^2$ unique elements, to compare to the number of elements in the full matrix, $N\sum_{i=0}^{M-1}M_{T,i}^2N_T^2$. The matrix is entirely defined with $M$, $N$, $M_{T,i}$ and $N_T$ and the $(M_{T,i}+N_T-1)^2\times MN$ matrix of unique elements. \section{Parameters} \label{sec:parameters} \index{GBTBT!GBTBT} The parameters of the [[GBTBT]] structure are: <>= int M, N, NT, NT2, NDFT, HALF_NDFT, NU_TOTAL, NF, NF2, ind_size, cov_size, MT2_TOTAL, MT_size, MAX_MT; int *MT, *MT2, *NU, *NU2, *CS_MT2, *d__MT, *d__MT2, *d__NU, *d__NU2, *d__CS_MT2; char *mask; float2 *d__cov, *d__b, *d__c; float *d__alpha, *d__beta, n_full, n_comp, b_full, b_comp, cov_eval_et; unsigned int *d__mu, *d__xi; cufftHandle raster_plan, MVM_input_plan, MVM_output_plan; #ifdef GBTBT_DEBUG float2 *cov; unsigned int *mu, *xi; #endif @ \section{Functions} \label{sec:functions} \subsection{Setup \& Cleanup} \label{sec:setup--cleanup} A [[GBTBT]] structure is initialized with the number of blocks $[[M]]\times[[N]]$, the size of each block $[[MT]]\times[[NT]]$ and the covariance: \begin{itemize} \item here we pass the vector $M_{T,i}$ \index{GBTBT!GBTBT!setup} <>= void GBTBT::setup(int M_, int N_, int *MT_, int NT_, const float *d__cov_) { M = M_; N = N_; MT_size = sizeof(int)*M; MT = (int *)malloc(MT_size); for (int k=0;k> info(); stopwatch tid; tid.tic(); <> tid.toc(&cov_eval_et,"Covariance raster DFT"); } @ \item all the blocks have the same size $M_{T,i} \equiv M_T, \forall i \in [0,M-1]$: \index{GBTBT!GBTBT!setup} <>= void GBTBT::setup(int M_, int N_, int MT_, int NT_, const float *d__cov_) { M = M_; N = N_; MT_size = sizeof(int)*M; MT = (int *)malloc(MT_size); for (int k=0;k> info(); stopwatch tid; tid.tic(); <> tid.toc(&cov_eval_et,"Covariance raster DFT"); } @ \item all the blocks are square $M_{T,i} \equiv M_T \equiv N_T, \forall i \in [0,M-1]$: \index{GBTBT!GBTBT!setup} <>= void GBTBT::setup(int M_, int N_, int NT_, const float *d__cov_) { M = M_; N = N_; NT = NT_; MT_size = sizeof(int)*M; MT = (int *)malloc(MT_size); for (int k=0;k> info(); stopwatch tid; tid.tic(); <> tid.toc(&cov_eval_et,"Covariance raster DFT"); } @ \end{itemize} The memory is allocated with: <>= HANDLE_ERROR( cudaMalloc((void**)&d__MT, MT_size) ); MT2 = (int *)malloc(MT_size); HANDLE_ERROR( cudaMalloc((void**)&d__MT2, MT_size) ); CS_MT2 = (int *)malloc(MT_size); HANDLE_ERROR( cudaMalloc((void**)&d__CS_MT2, MT_size) ); NU = (int *)malloc(MT_size); HANDLE_ERROR( cudaMalloc((void**)&d__NU, MT_size) ); NU2 = (int *)malloc(MT_size); HANDLE_ERROR( cudaMalloc((void**)&d__NU2, MT_size) ); NU_TOTAL = 0; MT2_TOTAL = 0; MAX_MT = 0; int _NDFT; NDFT = 0; //printf("@(CEO)>GBTBT\n"); for (int k=0; kNDFT) ? _NDFT : NDFT; MAX_MT = (MAX_MTGBTBT: Creating a 1D covariance FFT plan\n"); HANDLE_ERROR_CUFFT( cufftPlan1d(&raster_plan, NDFT, CUFFT_R2C, BATCH), "Unable to create plan"); /*HANDLE_ERROR_CUFFT( cufftSetCompatibilityMode(raster_plan, CUFFT_COMPATIBILITY_FFTW_PADDING), "Unable to set compatibility mode to native");*/ INFO("\n@(CEO)>GBTBT: Creating a 1D MVM input FFT plan\n"); BATCH = M*N; HANDLE_ERROR_CUFFT( cufftPlan1d(&MVM_input_plan, NDFT, CUFFT_R2C, BATCH), "Unable to create plan"); /*HANDLE_ERROR_CUFFT( cufftSetCompatibilityMode(MVM_input_plan, CUFFT_COMPATIBILITY_FFTW_PADDING), "Unable to set compatibility mode to native");*/ INFO("\n@(CEO)>GBTBT: Creating a 1D MVM output FFT plan\n"); BATCH = M; HANDLE_ERROR_CUFFT( cufftPlan1d(&MVM_output_plan, NDFT, CUFFT_C2R, BATCH), "Unable to create plan"); /*HANDLE_ERROR_CUFFT( cufftSetCompatibilityMode(MVM_output_plan, CUFFT_COMPATIBILITY_FFTW_PADDING), "Unable to set compatibility mode to native");*/ @ The multiplication $Cs$ are efficiently performed as follows \begin{enumerate} \item Construct polynomials $C(t)$ and $s(t)$ from the matrix $C$ and the vector $s$: \begin{equation} \label{eq:33} C(t) = \sum_{i_1=-(M_l-1)}^{N_l-1}\sum_{i_2=-(M_l-1)}^{N_l-1} c_{{i_1}{i_2}}t^{\lambda_{{i_1}{i_2}}}, \end{equation} where $c_{{i_1}{i_2}}$ is an entry of $C$ and $\lambda_{{i_1}{i_2}}=(M_l+i_1-1)(M_l+N_l-1)+(M_l+i_2-1)$ and \begin{equation} \label{eq:34} S(t) = \sum_{j_1=1}^{N_l}\sum_{j_2=1}^{N_l} s_{{j_1}{j_2}}t^{\mu_{{j_1}{j_2}}}, \end{equation} where $\mu_{{j_1}{j_2}}=(M_l+N_l)(N_l-1)-j_1(M_l+N_l-1)-j_2$. \item Compute $P(t)=C(t)\times S(t)$ using the Discrete Fourier Transform. \item The entry $b_{{j_1}{j_2}}$ of the vector $b$ is $b_{{j_1}{j_2}}=p_{\xi_{{j_1}{j_2}}}$, where $\xi_{{j_1}{j_2}}=(M_l+N_l)(M_l+N_l-1)-(j_1+1)(2N_l-1)-(j_2+1).$ \end{enumerate} with $(j_1,j_2)\in [0,\dots,[[N]]-1]$. @ The 1D Fourier transform is now applied to the raster covariance <>= <> #ifdef GBTBT_DEBUG cov = (float2*)malloc(cov_size); HANDLE_ERROR( cudaMemcpy( cov, d__cov, cov_size, cudaMemcpyDeviceToHost) ); INFO("\n@(CEO)>GBTBT: raster covariance\n"); for (int k=0;k>= HANDLE_ERROR_CUFFT( cufftExecR2C(raster_plan, (cufftReal*)d__cov, d__cov), "Unable to execute plan!"); //HANDLE_ERROR( cudaThreadSynchronize() ); @ Memory is freed with: \index{GBTBT!GBTBT!cleanup} <>= void GBTBT::cleanup(void) { INFO("@(CEO)>GBTBT: freeing memory!\n"); cufftDestroy(raster_plan); cufftDestroy(MVM_input_plan); cufftDestroy(MVM_output_plan); free( MT ); HANDLE_ERROR( cudaFree( d__MT ) ); free( MT2 ); HANDLE_ERROR( cudaFree( d__MT2 ) ); free( CS_MT2 ); HANDLE_ERROR( cudaFree( d__CS_MT2 ) ); free( NU ); HANDLE_ERROR( cudaFree( d__NU ) ); free( NU2 ); HANDLE_ERROR( cudaFree( d__NU2 ) ); HANDLE_ERROR( cudaFree( d__mu ) ); HANDLE_ERROR( cudaFree( d__xi ) ); HANDLE_ERROR( cudaFree( d__b ) ); HANDLE_ERROR( cudaFree( d__c ) ); HANDLE_ERROR( cudaFree( d__cov ) ); #ifdef GBTBT_DEBUG free(mu); free(xi); #endif } @ \subsection{Input/Output} \label{sec:inputoutput} The main parameters of [[GBTBT]] are displayed with the [[info]] routine: \index{GBTBT!GBTBT!info} <>= void GBTBT::info(void) { INFO("\n\x1B[1;42m@(CEO)>GBTBT:\x1B[;42m\n"); <> INFO("----------------------------------------------------\x1B[0m\n"); } <>= INFO(" . number of outermost blocks : %dX%d\n",M,N); INFO(" . size of outermost blocks : %dX%d\n",MT[0],NT); INFO(" . size of innermost blocks : %dX%d\n",MT[0],NT); n_full = powf(NT,4)*M*N; INFO(" . DFT length : %d\n",NDFT); INFO(" . full matrix elements # : %.3E\n",n_full); n_comp = NU_TOTAL; INFO(" . compressed matrix elements # : %.3E\n",n_comp); INFO(" . compression factor : %4.0f \n",n_full/n_comp); float mb = powf(2,20); b_full = n_full*sizeof(float); if (b_full>mb) { INFO(" . full matrix storage [MB] : %6.1f\n",b_full/mb); } else { INFO(" . full matrix storage [KB] : %6.1f\n",b_full/1024.0); } b_comp = n_comp*sizeof(float2); if (b_comp>mb) { INFO(" . compressed matrix storage [MB]: %6.1f\n",b_comp/mb); } else { INFO(" . compressed matrix storage [KB]: %6.1f\n",b_comp/1024.0); } INFO(" . compression factor : %4.0f \n",b_full/b_comp); @ \subsection{Matrix--to--vector product} \label{sec:matr-vect-prod} The MVM routine computes the matrix--to--vector multiplication $y=C_{\vec\alpha\cdot\vec\beta}s$. \index{GBTBT!GBTBT!MVM} << MVM >>= void GBTBT::MVM(float *y, float *x) { // stopwatch tid; // tid.tic(); <> // tid.toc("STEP 1"); // tid.tic(); <> // tid.toc("STEP 2"); // tid.tic(); <> // tid.toc("STEP 3"); // tid.tic(); <> // tid.toc("STEP 4"); // tid.tic(); if (mask==NULL) { <> } else { <> } // tid.toc("STEP 5"); } @ The vector $s$ is made of $N$ components of length $[[NT]]^2$: $$s= \left[ \begin{array}{c} s_x \\ s_y \end{array} \right] $$ and lets define another complex vector $b$ also made of two components but of length [[NU2]]: $$b=\left[ \begin{array}{c} b_x \\ b_y \end{array} \right] $$ The matrix--to--vector multiplication $y=C_{\vec\alpha\cdot\vec\beta}s$ is derived through the following steps: \begin{enumerate} \item input allocation and ordering: \begin{itemize} \item the $s_x$ components of $s$ is allocated into the real part of the complex vector $b_x$ according to the ordering in vector $\mu$ (\ref{BTBT.eq:34}) i.e. $b_x[\mu].x=s_x$, \item the $s_y$ components of $s$ is allocated into the real part of the complex vector $b_y$ according to the ordering in vector $\mu$ i.e. $b_y[\mu].x=s_y$, \end{itemize} <>= dim3 blockDim(16,16); dim3 gridDim( 1+NT/16,1+NT/16,N); gmvm_input_order LLL gridDim,blockDim RRR ( d__b, HALF_NDFT, x, NT, d__MT, M, N); @ using the kernel: <>= __global__ void gmvm_input_order(float2 *x_out, int n_x_out, float *x_in, int n_x_in, int *m_in, int N_MT, int N_NT) { int j1, j2, k, col, ind, n, m, l; float *real_x_out; j1 = blockIdx.x * blockDim.x + threadIdx.x; j2 = blockIdx.y * blockDim.y + threadIdx.y; col = blockIdx.z; n = n_x_in; if ( (j1>= HANDLE_ERROR_CUFFT( cufftExecR2C(MVM_input_plan, (cufftReal*)d__b, d__b), "Unable to execute plan forward FT with MVM plan"); //HANDLE_ERROR( cudaThreadSynchronize() ); @ \item $\tilde b$ and $\tilde T$ are multiplied element wise i.e. $$\tilde c = \tilde b\tilde T = \left[ \begin{array}{c} \tilde b_x.x\tilde T_{xx}.x - \tilde b_x.y\tilde T_{xx}.y + \tilde b_y.x\tilde T_{xy}.x - \tilde b_y.y\tilde T_{xy}.y \\ \tilde b_x.x\tilde T_{xx}.y + \tilde b_x.y\tilde T_{xx}.x + \tilde b_y.x\tilde T_{xy}.y + \tilde b_y.y\tilde T_{xy}.x \end{array} \right] $$ <>= blockDim = dim3(256,1); gridDim = dim3(ceilf(HALF_NDFT/256.0),M); gcpx_mult LLL gridDim,blockDim RRR (d__c, d__cov, d__b, HALF_NDFT, M, N); @ using the kernel: <>= __global__ void gcpx_mult(float2* c, float2 *x1, float2*x2, int n_x, int m_in, int n_in) { int k, l, i, j, p, q; k = blockIdx.x * blockDim.x + threadIdx.x; j = blockIdx.y; if (k>= HANDLE_ERROR_CUFFT( cufftExecC2R(MVM_output_plan, d__c, (cufftReal*)d__c), "Unable to execute inverse FT with MVM plan"); //HANDLE_ERROR( cudaThreadSynchronize() ); @ \item the real part of $c$ is affected into vector $y$ according to the ordering in vector $\xi$ (\ref{BTBT.eq:34}) i.e. $y=c[\xi].x$. <>= blockDim = dim3(16,16); gridDim = dim3( 1+MAX_MT/16, 1+MAX_MT/16, M); gmvm_output_order LLL gridDim,blockDim RRR (y, d__MT, d__CS_MT2, d__c, HALF_NDFT, NT); @ using the kernel: <>= __global__ void gmvm_output_order(float *x_out, int *m_in, int*cs_m2_in, float2 *x_in, int n_x_in, int n) { int j1, j2, k, row, ind, m, ndft; float *real_x_in; j1 = blockIdx.x * blockDim.x + threadIdx.x; j2 = blockIdx.y * blockDim.y + threadIdx.y; row = blockIdx.z; m = m_in[row]; if ( (j1>= blockDim = dim3(16,16); gridDim = dim3( 1+MAX_MT/16, 1+MAX_MT/16, M); gmvm_output_order_mask LLL gridDim,blockDim RRR (y, d__MT, d__CS_MT2, d__c, HALF_NDFT, NT, mask); @ using the kernel: <>= __global__ void gmvm_output_order_mask(float *x_out, int *m_in, int*cs_m2_in, float2 *x_in, int n_x_in, int n, char *mask) { int j1, j2, k, row, ind, m, ndft; float *real_x_in; j1 = blockIdx.x * blockDim.x + threadIdx.x; j2 = blockIdx.y * blockDim.y + threadIdx.y; row = blockIdx.z; m = m_in[row]; if ( (j10) ? real_x_in[ind]/ndft : 0; } } @ \end{enumerate} New matrix--to--vector multiplication routines are defined for test purposes. The next routine implement the MVM for an identity matrix << MVM (Test 1)>>= void GBTBT::MVM(float *y, float *x) { printf("@(CEO)>GBTBT: MVM (Test 1)\n"); cublasHandle_t handle; cublasCreate(&handle); cublasScopy(handle, 10, x, 1, y, 1); cublasDestroy(handle); } << MVM (Test 2)>>= void GBTBT::MVM(float *y, float *x) { printf("@(CEO)>GBTBT: MVM (Test 2)\n"); cublasHandle_t handle; cublasCreate(&handle); for (int k=0;k>= void GBTBT::setup(int n_x) { N = n_x; }