% -*- mode: Noweb; noweb-code-mode: c-mode -*- @ [[aaStats]] contains a structure and the routines to compute the covariance matrix of the angle--of--arrival. \index{aaStats} \section{The files} \subsection{Header} <>= #ifndef __AASTATS_H__ #define __AASTATS_H__ #ifndef __UTILITIES_H__ #include "utilities.h" #endif #ifndef __ATMOSPHERE_H__ #include "atmosphere.h" #endif //#define AASTATS_DEBUG //#define PASTATS_DEBUG <> <> @ The structures are : <>= struct aaStats { <> void setup(int N, const atmosphere *atm, float lenslet_pitch, const source *src, int N_SRC); void cleanup(void); void info(int kappa, float d); float variance(void); void toFile(char *filename); }; @ and <>= struct paStats { <> void setup(int M_, int N_, int osf_, const atmosphere *atm, float lenslet_pitch, source *phase_src, int N_P_SRC, const source *slopes_src, int N_S_SRC); void setup(int M_, int N_, int osf_, const atmosphere *atm, float lenslet_pitch, const source *slopes_src, int N_S_SRC, float z_radius); void cleanup(void); void info(int kappa, float d); /* void MVM(float *in_vector, float *out_vector, float d1, float g1, int N1, float d2, float g2, int N2); */ void MVM(float *out_vector, float *in_vector, float d1, float g1, int N1, float d2, int N2, atmosphere *atm, source *phase_src, int N_P_SRC, source *slopes_src, int N_S_SRC) ; void toFile(char *filename); void variance(void); }; #endif // __AASTATS_H__ @ \subsection{Source} <>= #include "aaStats.h" <> <> <> <> <> <> <> <> <> <> <> <> <> <> <> <> <> <> <> <> @ \section{The theory} Shack--Hartmann WFS centroids are a measure of the spatial derivatives of the wavefront averaged on each subaperture. These derivatives are often referred as angle of arrivals $\vec \alpha=(\alpha_x\alpha_y,)$: \begin{eqnarray} \label{eq:1} \alpha_x &=& {\lambda\over 2\pi d^2} {\partial\varphi\over\partial x} \ast \Pi\left(x\over d\right)\Pi\left(y\over d\right),\\ \alpha_y &=& {\lambda\over 2\pi d^2} {\partial\varphi\over\partial y} \ast \Pi\left(x\over d\right)\Pi\left(y\over d\right), \end{eqnarray} where $\lambda$ is the sensing wavelength and $d$ in the size of one subaperture. The covariance is derived from the power spectrum density of the angle of arrivals $W_{\vec \alpha \cdot\ \vec \beta}(\vec f)$: \begin{equation} \label{eq:2} W_{\vec \alpha\cdot\vec \beta}(\vec f) = \lambda^2\left( \vec f \cdot \vec\alpha \right)\left( \vec f \cdot \vec\beta \right) W_\varphi(\vec f) G^2(\vec f) \end{equation} $W_\varphi(f)$ is the wavefront power spectrum density given by \begin{equation} \label{eq:3} W_\varphi(f) = 0.0229 r_0^{-5/3} \left( f^2 + {1\over\mathcal L_0^2} \right)^{-11/6}. \end{equation} $G(\vec f)$ is the point spread function of a subaperture of the WFS, \begin{equation} \label{eq:4} G(\vec f) = {\sin(\pi d f_x) \over \pi d f_x }{ \sin(\pi d f_y) \over \pi d f_y } . \end{equation} The covariance $C_{\vec \alpha \cdot\ \vec \beta}(\vec\rho)$ is derived from the Wiener--Khinchine theorem: \begin{equation} \label{eq:5} C_{\vec \alpha \cdot\ \vec \beta}(\vec\rho) = \mathcal F^{-1} \left[ W_{\vec \alpha\cdot\vec \beta}(\vec f) \right] (\vec\rho) \end{equation} where $\mathcal F^{-1}$ stands for the inverse Fourier transform. The covariance between $\varphi$ and $\vec \alpha$ is derived from the cross--power spectrum density given by: \begin{equation} \label{eq:6} W_{\varphi\vec \alpha}(\vec f) = -\iota\lambda\left( \vec f \cdot \vec\alpha \right) W_\varphi(\vec f) G(\vec f) \end{equation} $W_\varphi(f)$ is the wavefront power spectrum density given by \begin{equation} \label{eq:7} W_\varphi(f) = 0.0229 r_0^{-5/3} \left( f^2 + {1\over\mathcal L_0^2} \right)^{-11/6}. \end{equation} $G(\vec f)$ is the point spread function of a subaperture of the WFS, \begin{equation} \label{eq:8} G(\vec f) = {\sin(\pi d f_x) \over \pi d f_x }{ \sin(\pi d f_y) \over \pi d f_y } . \end{equation} The covariance $C_{\varphi \vec \alpha}(\vec\rho)$ is derived from the Wiener--Khinchine theorem: \begin{equation} \label{eq:9} C_{\varphi \vec \alpha}(\vec\rho) = \mathcal F^{-1} \left[ W_{\varphi\vec \alpha}(\vec f) \right] (\vec\rho) \end{equation} where $\mathcal F^{-1}$ stands for the inverse Fourier transform. \section{Parameters} \label{sec:parameters} The parameters of the [[aaStats]] structure are: <>= int N, N2, NU, NU2, NF, NF2, psd_size, cov_size, ind_size; float2 *d__psd; float *d__cov, *d__alpha, *d__beta, n_full, n_comp, b_full, b_comp, cov_eval_et, sampling; cufftHandle plan; #if defined(AASTATS_DEBUG) || defined(PASTATS_DEBUG) float2 *psd, *cov; #endif int N_SRC2; @ The parameters of the [[paStats]] structure are: <>= int osf, M, shift; int *M_LAYER; <> @ \section{Angle--of--arrival covariance matrix functions} \label{sec:aa-functions} \index{aaStats!aaStats} The covariance matrix is computed for a square lenslet array of size $[[N]]\times[[N]]$ and of pitch [[sampling]]. The covariance matrix depends on the atmosphere parameters and altitude profile. The covariance matrix is computed for a vector of [[N_SRC]] sources. The covariance matrix is written: \begin{eqnarray} \label{eq:10} \Theta = \left[ \begin{array}{ccc} \Theta_{11}&\cdots&\Theta_{1[[N_SRC]]} \\ \vdots &\ddots & \vdots \\ \Theta_{[[N_SRC]]1} & \vdots & \Theta_{[[N_SRC]][[N_SRC]]} \end{array} \right] \end{eqnarray} with \begin{eqnarray} \label{eq:11} \Theta_{ij} = \left[ \begin{array}{cc} \Theta_{{x_i}{x_j}} & \Theta_{{x_i}{y_j}} \\ \Theta_{{y_i}{x_j}} & \Theta_{{y_i}{y_j}} \end{array} \right]. \end{eqnarray} Each matrix $\Theta_{{x_i}{x_j}}$ is a two--level recursive block Toeplitz (2RBT) $[[N]]^2\times[[N]]^2$ matrix. Toeplitz matrices \cite{GolubVanLoan} are fully defined with their first row and column. So the matrix $\Theta_{{x_i}{x_j}}$ is reduced to a $[[NU]]=(2[[N]]-1)^2$ elements vector $t_\Theta^{{x_i}{x_j}}$ and the matrix $\Theta_{ij}$ becomes the matrix \begin{equation} \label{eq:12} T_{ij} = \left[ t_\Theta^{{x_i}{x_j}} t_\Theta^{{x_i}{y_j}} t_\Theta^{{y_i}{x_j}} t_\Theta^{{y_i}{y_j}} \right]. \end{equation} Finally the Toeplitz compressed version of the matrix $\Theta$ is written: \begin{equation} \label{eq:13} T = \left[ t_\Theta^{{x_1}{x_1}} t_\Theta^{{x_1}{y_1}} \cdots t_\Theta^{{x_1}{x_[[N_SRC]]}} t_\Theta^{{x_1}{y_[[N_SRC]]}} \cdots t_\Theta^{{y_[[N_SRC]]}{x_1}} t_\Theta^{{y_[[N_SRC]]}{y_1}} \cdots t_\Theta^{{y_[[N_SRC]]}{x_[[N_SRC]]}} t_\Theta^{{y_[[N_SRC]]}{y_[[N_SRC]]}} \right]. \end{equation} @ \subsection{Setup \& Cleanup} \label{sec:setup--cleanup} \index{aaStats!aaStats!setup} <>= void aaStats::setup(int N_, const atmosphere *atm, float lenslet_pitch, const source *src, int N_SRC) { N_SRC2 = N_SRC*N_SRC; int BATCH = 1; <> psd_size = sizeof(float2)*NF2; <> stopwatch tid; tid.tic(); <> tid.toc(&cov_eval_et,"AA covariance evaluation"); info(kappa, sampling); } @ The parameters are initialized in the setup function: <>= float alpha[2] = {1,0}, beta[2] = {0,1}, kappa; N = N_; N2 = N*N; NU = 2*N-1; NU2 = NU*NU; NF = 4096; NF2 = NF*NF; kappa = 4; sampling = lenslet_pitch; psd_size = sizeof(float2)*NF2*4; cov_size = sizeof(float)*NU2*4*N_SRC2; ind_size = sizeof(int)*N2; @ where the memory is also allocated: <>= HANDLE_ERROR( cudaMalloc((void**)&d__psd, psd_size ) ); HANDLE_ERROR( cudaMalloc((void**)&d__alpha, 2*sizeof(float) ) ); HANDLE_ERROR( cudaMalloc((void**)&d__beta, 2*sizeof(float) ) ); HANDLE_ERROR( cudaMemcpy( d__alpha, alpha, 2*sizeof(float), cudaMemcpyHostToDevice) ); HANDLE_ERROR( cudaMemcpy( d__beta, beta, 2*sizeof(float), cudaMemcpyHostToDevice) ); HANDLE_ERROR( cudaMalloc((void**)&d__cov, cov_size ) ); INFO("@(CEO)>aaStats: Creating 2D covariance FFT plan\n"); int n_DFT[2] = {NF, NF}; int iodist = NF2; /* Create a 2D FFT plan. */ HANDLE_ERROR_CUFFT(cufftPlanMany(&plan, 2, n_DFT, NULL, 1, iodist, NULL, 1, iodist, CUFFT_C2C,BATCH), "Unable to create plan"); /* HANDLE_ERROR_CUFFT(cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_NATIVE), */ /* "Unable to set compatibility mode to native"); */ @ The covariance is derived from Eq.~(\ref{eq:5}) for 2 sources in the directions $\vec\theta_1$ and $\vec\theta_2$ propagated through a [[N_LAYER]] atmospheric profile. It is given by \begin{equation} \label{eq:14} C_{\vec \alpha \cdot\ \vec \beta}^{12}(\vec\rho) = \sum_{l=1}^{[[N]]} C_{\vec \alpha \cdot\ \vec \beta}^l(\vec\rho_l) \end{equation} where \begin{equation} \label{eq:15} \vec \rho_l = \gamma_l\vec\rho + h_l(\vec\theta_2-\vec\theta_1) \end{equation} with $\gamma_l=1-h_l/z_\ast$, $h_l$ is the atmosphere layer altitude and $z_\ast$ is the sources height. Invoking the Wiener--Khinchine theorem, the translation and scaling properties of the Fourier transform and using Eq.~(\ref{eq:2}), the covariance for a single atmospheric layer at altitude $h_l$ is written: \begin{equation} \label{eq:16} C_{\vec \alpha \cdot\ \vec \beta}^{12,l}(\vec\rho_l) = \mathcal F^{-1} \left[ \lambda^2 \left( \vec f \cdot \vec\alpha \right)\left( \vec f \cdot \vec\beta \right) W_\varphi(\vec f) G^2(\gamma_l\vec f) \exp\left( 2\iota\pi h_l\vec f \cdot (\vec\theta_2-\vec\theta_1) \right) \right] (\gamma_l\vec\rho) \end{equation} Setting \begin{equation} \label{eq:17} W_{\vec \alpha\cdot\vec \beta}^{l,12}(\vec f) = \lambda^2 \left( \vec f \cdot \vec\alpha \right)\left( \vec f \cdot \vec\beta \right) W_\varphi(\vec f) G^2(\gamma_l\vec f) \exp\left( 2\iota\pi h_l\vec f \cdot (\vec\theta_2-\vec\theta_1) \right), \end{equation} Eq.~(\ref{eq:14}) is now given by \begin{equation} \label{eq:18} C_{\vec \alpha \cdot\ \vec \beta}^{12}(\vec\rho) = \sum_{l=1}^{[[N]]} \mathcal F^{-1} \left[ W_{\vec \alpha\cdot\vec \beta}^{l,12}(\vec f) \right] (\gamma_l\vec\rho). \end{equation} Calling $\delta=d/\kappa$ the sampling step of the vector $\vec\rho$ with the integer $\kappa\geq 1$, $\vec\rho_l$ is given by \begin{equation} \label{eq:19} \vec \rho_l = \gamma_l\delta(k\vec x + l\vec y),\quad\forall l,k\in \left[0,\cdots,[[NF]]-1\right], \end{equation} $\vec x$ and $\vec y$ are the unit vectors of the X and Y axis, respectively. Eq.~(\ref{eq:18}) becomes \begin{equation} \label{eq:20} C_{\vec \alpha \cdot\ \vec \beta}^{12}(k,l) = \sum_{l=1}^{[[N]]} \iint {\mathrm d^2}\vec f W_{\vec \alpha\cdot\vec \beta}^{l,12}(\vec f) \exp\left( 2\iota\pi \gamma_l \delta (k f_x + l f_y) \right) \end{equation} Discretizing Eq.~(\ref{eq:20}) with \begin{equation} \label{eq:21} \gamma_l\delta f_x = n/[[NF]],\quad \gamma_l\delta f_y = m/[[NF]],\quad \forall n,m\in \left[0,\cdots,[[NF]]-1\right], \end{equation} Eq.~(\ref{eq:20}) is transformed into \begin{eqnarray} \label{eq:22} C_{\vec \alpha \cdot\ \vec \beta}^{12}(k,l) &=& \sum_{l=1}^{[[N]]} \sum_{n=0}^{[[NF]]-1} \sum_{m=0}^{[[NF]]-1} \left( 1\over\gamma_l\delta [[NF]]\right)^2 W_{\vec \alpha\cdot\vec \beta}^{l,12}\left({n\over\gamma_l\delta [[NF]]},{m\over\gamma_l\delta [[NF]]}\right) \exp\left( 2\iota\pi {(k n + l m)\over [[NF]]} \right) \\ &=& \sum_{n=0}^{[[NF]]-1} \sum_{m=0}^{[[NF]]-1} W_{\vec \alpha\cdot\vec \beta}^{12}(n,m) \exp\left( 2\iota\pi {(k n + l m)\over [[NF]]} \right) \end{eqnarray} with \begin{equation} \label{eq:23} W_{\vec \alpha\cdot\vec \beta}^{12}(n,m) = \sum_{l=1}^{[[N]]} \left( 1\over\gamma_l\delta [[NF]]\right)^2 W_{\vec \alpha\cdot\vec \beta}^{l,12}\left({n\over\gamma_l\delta [[NF]]},{m\over\gamma_l\delta [[NF]]}\right) \end{equation} <>= dim3 blockDimPsd(16,16); dim3 gridDimPsd( 1+NF/16 , 1+NF/16 ); dim3 blockDimCov(16,16); dim3 gridDimCov( 1+NU/16 , 1+NU/16 ); int i_SRC, j_SRC, offset0, offset; INFO("Covariance %dx%d block:\n",N_SRC,N_SRC); for (i_SRC=0;i_SRCwavelength, atm->r0, atm->turbulence.L0, atm->d__layers, atm->N_LAYER, (src->dev_ptr) + i_SRC, (src->dev_ptr) + j_SRC, d__alpha, d__alpha, kappa); <> float2 *b; b = (float2*)malloc(psd_size); HANDLE_ERROR( cudaMemcpy( b, d__psd, psd_size, cudaMemcpyDeviceToHost) ); /* for (int i=0;i<100;i++) { INFO("|(%3d) [%+6.6g;%+6.6g]\n",i,b[i].x,b[i].y); } */ free(b); <> offset = offset0 + (2*j_SRC+1); //INFO(",%2d",offset); aaPowerSpectrum LLL gridDimPsd,blockDimPsd RRR ( d__psd, NF, lenslet_pitch, atm->wavelength, atm->r0, atm->turbulence.L0, atm->d__layers, atm->N_LAYER, (src->dev_ptr) + i_SRC, (src->dev_ptr) + j_SRC, d__alpha, d__beta, kappa); <> <> offset = offset0 + 2*(N_SRC+j_SRC); //INFO(",%2d",offset); aaPowerSpectrum LLL gridDimPsd,blockDimPsd RRR ( d__psd, NF, lenslet_pitch, atm->wavelength, atm->r0, atm->turbulence.L0, atm->d__layers, atm->N_LAYER, (src->dev_ptr) + i_SRC, (src->dev_ptr) + j_SRC, d__beta, d__alpha, kappa); <> <> offset = offset0 + (2*(N_SRC+j_SRC)+1); //INFO(",%2d)",offset); aaPowerSpectrum LLL gridDimPsd,blockDimPsd RRR ( d__psd, NF, lenslet_pitch, atm->wavelength, atm->r0, atm->turbulence.L0, atm->d__layers, atm->N_LAYER, (src->dev_ptr) + i_SRC, (src->dev_ptr) + j_SRC, d__beta, d__beta, kappa); <> <> } INFO("\n"); } @ The Wiener-Khinchin theorem is applied on the power spectrum with the following: <>= HANDLE_ERROR_CUFFT(cufftExecC2C(plan, d__psd, d__psd, CUFFT_FORWARD), "Unable to execute plan"); //HANDLE_ERROR(cudaThreadSynchronize()); @ The power spectrum, given by Eq.~(\ref{eq:23}) and Eq.~(\ref{eq:17}), must be sampled such as its Fourier transform gives unbiased values of the covariance. The largest spatial frequency is $f_{max}=\kappa/2\gamma_l\delta$ with $\kappa\ge 1$. The numyber of sample is given by $[[N_F]]\ge [[N_SIDE_LENSLET]]$. The spatial frequencies are given by \begin{eqnarray} \label{eq:24} f_{x,y} &=& (i,j){2\over [[NF]] }{\kappa\over 2\gamma_l\delta}, (i,j) \in \left[ 0,\dots,[[NF]]/2-1\right] \\ f_{x,y} &=& (i,j){2\over [[NF]] }{\kappa\over 2\gamma_l\delta}, (i,j) \in \left[ [[NF]]/2,\dots,[[NF]]-1\right] \end{eqnarray} Note that [[NF]] must be even. <>= __global__ void aaPowerSpectrum(float2 *d__psd, int NF, float d, float wavelength, float r0, float L0, layer *turb, int N_LAYER, source *src1, source *src2, float *alpha, float *beta, float kappa) { int i, j, k; float fx, fy, f_square, fs, gl, red, c_red, s_red; i = blockIdx.x * blockDim.x + threadIdx.x; j = blockIdx.y * blockDim.y + threadIdx.y; k = i*NF + j; if ( (iheight); red = 2*PI*turb[k_LAYER].altitude; fs = (2.0/NF)*(kappa/(2*d*gl)); fx = (float) i; fy = (float) j; if (i>=(NF/2)) fx = fx - NF; if (j>=(NF/2)) fy = fy - NF; fx *= fs; fy *= fs; f_square = fx*fx + fy*fy; fs *= fs; red *= fx*(src2->theta_x - src1->theta_x) + fy*(src2->theta_y - src1->theta_y); sincosf( red, &s_red, &c_red); red = (k==0) ? 0 : turb[k_LAYER].xi0* fs*wavelength*(fx*alpha[0] + fy*alpha[1])*(fx*beta[0] + fy*beta[1])* powf(r0,-5.0/3.0)* phasePowerSpectrum(f_square, 1.0/L0)* gatePSF(gl*fx*d,gl*fy*d); d__psd[k].x += red*c_red; d__psd[k].y += red*s_red; } } } @ The covariance derived from the power spectrum is sampled every $\delta=d/\kappa$. The covariance derived from the slopes of the lenslet array is sampled every $d$. Consequently the lenslet--array--slope--covariance is extracted from the power--spectrum--covariance at $(i\kappa,j\kappa)$. Due to the symmetry of the Fourier transform output, the subscripts in the power--spectrum--covariance are in fact given by \begin{eqnarray} \label{eq:25} (i,j)\kappa, (i,j) \in \left[ 0,\dots,[[N]]-1 \right] \\ \left[(i,j)-[[NU]]\right]\kappa + [[NF]], (i,j) \in \left[ [[N]],\dots,[[NU]]-1 \right] \end{eqnarray} The lenslet--array--slope--covariance is shifted such as the baseline coordinate $(0,0$) is a the center of the array i.e. \begin{equation} \label{eq:26} (i,j) \leftarrow \left[ (i,j) + (2[[N]]-2)/2 \right] \mod ( 2[[N]]-1 ) \end{equation} <>= covariance_extraction LLL gridDimCov,blockDimCov RRR( d__cov + offset*NU2, NU, d__psd, NF, kappa); @ The extraction of the covariance is done with the kernel: <>= __global__ void covariance_extraction(float *cov_out, int NC_out, float2 *cov_in, int NC_in, float kappa) { int i, j, k_out, k_in, h; i = blockIdx.x * blockDim.x + threadIdx.x; j = blockIdx.y * blockDim.y + threadIdx.y; if ( (i=((NC_out+1)/2)) i = kappa*(i - NC_out) + NC_in; else i *= kappa; if (j>=(NC_out+1)/2) j = kappa*(j - NC_out) + NC_in; else j *= kappa; k_in = i*NC_in + j; cov_out[k_out] = cov_in[k_in].x; } } @ The memory is freed with the following routine: \index{aaStats!aaStats!cleanup} <>= void aaStats::cleanup(void) { INFO("@(CEO)>aaStats: freeing memory!\n"); <> } <>= cufftDestroy(plan); HANDLE_ERROR( cudaFree( d__psd ) ); HANDLE_ERROR( cudaFree( d__alpha ) ); HANDLE_ERROR( cudaFree( d__beta ) ); HANDLE_ERROR( cudaFree( d__cov ) ); #ifdef AASTATS_DEBUG free(psd); free(cov); #endif @ \subsection{Input/Output} \label{sec:inputoutput} The main parameters of [[aaStats]] are displayed with the [[info]] routine: \index{aaStats!aaStats!info} <>= void aaStats::info(int kappa, float d) { INFO("\n\x1B[1;42m@(CEO)>aaStats:\x1B[;42m\n"); <> INFO("----------------------------------------------------\x1B[0m\n"); } <>= INFO(" . sampling [m] : %.2f\n",d); INFO(" . size of covariance matrix block: %dX%d\n",NU,NU); INFO(" . spectrum resolution : %d\n",NF); INFO(" . covariance oversampling factor : %d\n",kappa); INFO(" . spectrum highest frequency : %4.2E\n",(2.0/NF)*(kappa/(2*d))); @ The covariance can be written to a file with: \index{aaStats!aaStats!toFile} <>= void aaStats::toFile(char *filename) { dev2file(filename, d__cov, cov_size/sizeof(float)); } @ \subsection{Miscellaneous} \label{sec:miscellaneous} From the power spectrum, once can derive the variance: \index{aaStats!aaStats!variance} <>= float aaStats::variance(void) { float res; HANDLE_ERROR( cudaMemcpy( &res, d__cov + (N-1)*(NU+1), sizeof(float), cudaMemcpyDeviceToHost) ); return res; } @ \section{Phase and Angle--of--arrival covariance matrix functions} \label{sec:pa-functions} \index{aaStats!paStats} The covariance matrix is defined for a square lenslet array of size $N_L\times N_L$ and of pitch [[sampling]]. %The covariance sampling is set with the over--sampling factor [[osf]]. The covariance matrix depends on the atmosphere parameters and altitude profile. The covariance matrix is computed for vectors of [[N_P_SRC]] estimation sources and [[N_S_SRC]] guide star sources. The covariance matrix is written: \begin{eqnarray} \label{eq:27} \Xi = \left[ \begin{array}{ccc} \Xi_{11}&\cdots&\Xi_{1[[N__S_SRC]]} \\ \vdots &\ddots & \vdots \\ \Xi_{[[N_P_SRC]]1} & \vdots & \Xi_{[[N_P_SRC]][[N_S_SRC]]} \end{array} \right] \end{eqnarray} with \begin{eqnarray} \label{eq:28} \Xi_{ij} = \left[ \begin{array}{cc} \Xi_{{\phi_i}{x_j}} & \Xi_{{\phi_i}{y_j}} \end{array} \right]. \end{eqnarray} Each matrix $\Xi_{{\phi_i}{x_j}}$ is a two--level recursive block Toeplitz (2RBT) $[[M]]^2\times[[N]]^2$ matrix.% with $[[N]]=N_L$ if $[[osf]]=1$ or $[[N]]=[[osf]] \times N_L+1$ if $[[osf]]\ge 1$. Toeplitz matrices \cite{GolubVanLoan} are fully defined with their first row and column. So the matrix $\Xi_{{phi_i}{x_j}}$ is reduced to a $[[NU]]=([[M]][[N]]-1)^2$ elements vector $t_\Xi^{{\phi_i}{x_j}}$ and the matrix $\Xi_{ij}$ becomes the matrix \begin{equation} \label{eq:29} T_{ij} = \left[ t_\Xi^{{\phi_i}{x_j}} t_\Xi^{{\phi_i}{y_j}} \right]. \end{equation} Finally the Toeplitz compressed version of the matrix $\Xi$ is written: \begin{equation} \label{eq:30} T = \left[ t_\Xi^{{\phi_1}{x_1}} t_\Xi^{{\phi_1}{y_1}} \cdots t_\Xi^{{\phi_1}{x_[[N_S_SRC]]}} t_\Xi^{{\phi_1}{y_[[N_S_SRC]]}} \cdots t_\Xi^{{\phi_[[N_P_SRC]]}{x_1}} t_\Xi^{{\phi_[[N_P_SRC]]}{y_1}} \cdots t_\Xi^{{\phi_[[N_P_SRC]]}{x_[[N_S_SRC]]}} t_\Xi^{{\phi_[[N_P_SRC]]}{y_[[N_S_SRC]]}} \right]. \end{equation} @ \subsection{Setup \& Cleanup} \label{sec:setup--cleanup-1} \index{aaStats!paStats!setup} <>= void paStats::setup(int M_, int N_, int osf_, const atmosphere *atm, float lenslet_pitch, source *phase_src_in, int N_P_SRC, const source *slopes_src, int N_S_SRC) { N_SRC2 = N_P_SRC*N_S_SRC; float alpha[2] = {1,0}, beta[2] = {0,1}, kappa; osf = osf_; N = N_; M = M_; shift = (N==M) ? 0 : 1; N2 = M*N; NU = M+N-1; NU2 = NU*NU; NF = 4096; NF2 = NF*NF; kappa = 4; sampling = lenslet_pitch/osf; source *phase_src; if (phase_src_in[0].height==slopes_src[0].height) { phase_src = phase_src_in; M_LAYER = (int *)malloc(sizeof(int)); M_LAYER[0] = M; psd_size = sizeof(float2)*NF2; cov_size = sizeof(float)*NU2*2*N_SRC2; ind_size = sizeof(int)*N2; int BATCH = 1; <> stopwatch tid; tid.tic(); <> tid.toc(&cov_eval_et,"PA covariance evaluation"); } else { <> } info(kappa,sampling); } <>= void paStats::setup(int M_, int N_, int osf_, const atmosphere *atm, float lenslet_pitch, const source *slopes_src, int N_S_SRC, float z_radius) { int N_P_SRC = 1; N_SRC2 = N_P_SRC*N_S_SRC; float alpha[2] = {1,0}, beta[2] = {0,1}, kappa; osf = osf_; N = N_; M = M_; shift = (N==M) ? 0 : 1; N2 = M*N; NU = M+N-1; NU2 = NU*NU; NF = 4096; NF2 = NF*NF; kappa = 4; sampling = lenslet_pitch/osf; M_LAYER = (int *)malloc(sizeof(int)); M_LAYER[0] = M; psd_size = sizeof(float2)*NF2; cov_size = sizeof(float)*NU2*2*N_SRC2; ind_size = sizeof(int)*N2; int BATCH = 1; <> stopwatch tid; tid.tic(); <> tid.toc(&cov_eval_et,"PA covariance evaluation"); info(kappa,sampling); } @ The phase source is duplicated and set to the same height than the slope source: <>= float *zenith, *azimuth; zenith = (float*)malloc(sizeof(float)*N_P_SRC); azimuth = (float*)malloc(sizeof(float)*N_P_SRC); source __src; for (int k_P_SRC=0;k_P_SRCdev_ptr + k_P_SRC, sizeof(source), cudaMemcpyDeviceToHost ) ); zenith[k_P_SRC] = __src.zenith; azimuth[k_P_SRC] = __src.azimuth; } source clone_phase_src; clone_phase_src.setup(phase_src_in[0].photometric_band, zenith, azimuth, slopes_src[0].height,N_P_SRC,0); free(zenith); free(azimuth); @ The covariance is computed for each layers independently in the following: <>= /* source clone_phase_src; clone_phase_src.setup(phase_src_in[0].photometric_band, phase_src_in[0].zenith, phase_src_in[0].azimuth, slopes_src[0].height,0); */ <> phase_src = &clone_phase_src; float g; int o; NU2 = 0; M_LAYER = (int *)malloc(sizeof(int)*atm->N_LAYER); for (int k_LAYER=0;k_LAYERN_LAYER;k_LAYER++) { g = 1 - atm->turbulence.altitude[k_LAYER]/slopes_src[0].height; o = (int) ceilf( osf*0.5*N*(1-g)/g ); M_LAYER[k_LAYER] = M + o*2; NU = M_LAYER[k_LAYER]+N-1; NU2 += NU*NU; // printf("@(CEO)>paStats: layer #%d - NU=%d\n",k_LAYER,NU); } psd_size = sizeof(float2)*NF2; cov_size = sizeof(float)*NU2*2*N_SRC2; int BATCH = 1; <> stopwatch tid; tid.tic(); <> tid.toc(&cov_eval_et,"PA covariance evaluation"); clone_phase_src.cleanup(); <>= int covariance_sampling(float h, float z, int osf, int M, int N) { float g; int o, M_LAYER, NU; g = 1 - h/z; o = (int) ceilf( osf*0.5*N_SIDE_LENSLET*(1-g)/g ); M_LAYER = M + o*2; NU = M_LAYER+N-1; return NU*NU; } @ Memory is allocated with: <>= HANDLE_ERROR( cudaMalloc((void**)&d__psd, psd_size ) ); HANDLE_ERROR( cudaMalloc((void**)&d__alpha, 2*sizeof(float) ) ); HANDLE_ERROR( cudaMalloc((void**)&d__beta, 2*sizeof(float) ) ); HANDLE_ERROR( cudaMemcpy( d__alpha, alpha, 2*sizeof(float), cudaMemcpyHostToDevice) ); HANDLE_ERROR( cudaMemcpy( d__beta, beta, 2*sizeof(float), cudaMemcpyHostToDevice) ); HANDLE_ERROR( cudaMalloc((void**)&d__cov, cov_size ) ); INFO("\n@(CEO)>paStats: Creating 2D covariance FFT plan\n"); int n_DFT[2] = {NF, NF}; int iodist = NF2; /* Create a 2D FFT plan. */ HANDLE_ERROR_CUFFT( cufftPlanMany(&plan, 2, n_DFT, NULL, 1, iodist, NULL, 1, iodist, CUFFT_C2C,BATCH), "Unable to create plan" ); /* HANDLE_ERROR_CUFFT( cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_NATIVE), */ /* "Unable to set compatibility mode to native" ); */ @ The covariance is derived from Eq.~(\ref{eq:5}) for 2 sources in the directions $\vec\theta_1$ and $\vec\theta_2$ propagated through a [[N_LAYER]] atmospheric profile. It is given by \begin{equation} \label{eq:31} C_{\varphi \vec \alpha}^{12}(\vec\rho) = \sum_{l=1}^{[[N]]} C_{\varphi \vec \alpha}^l(\vec\rho_l) \end{equation} where \begin{equation} \label{eq:32} \vec \rho_l = \gamma_l\vec\rho + h_l(\vec\theta_2-\vec\theta_1) \end{equation} with $\gamma_l=1-h_l/z_\ast$, $h_l$ is the atmosphere layer altitude and $z_\ast$ is the sources height. Invoking the Wiener--Khinchine theorem, the translation and scaling properties of the Fourier transform and using Eq.~(\ref{eq:2}), the covariance for a single atmospheric layer at altitude $h_l$ is written: \begin{equation} \label{eq:33} C_{\varphi \vec \alpha}^{12,l}(\vec\rho_l) = \mathcal F^{-1} \left[ -\iota\lambda\left( \vec f \cdot \vec\alpha \right) W_\varphi(\vec f) G(\gamma_l \vec f) \exp\left( 2\iota\pi h_l\vec f \cdot (\vec\theta_2-\vec\theta_1) \right) \right] (\gamma_l\vec\rho) \end{equation} Setting \begin{equation} \label{eq:34} W_{\varphi \vec \alpha}^{l,12}(\vec f) = -\iota\lambda\left( \vec f \cdot \vec\alpha \right) W_\varphi(\vec f) G(\gamma_l \vec f) \exp\left( 2\iota\pi h_l\vec f \cdot (\vec\theta_2-\vec\theta_1) \right), \end{equation} Eq.~(\ref{eq:31}) is now given by \begin{equation} \label{eq:35} C_{\varphi \vec \alpha}^{12}(\vec\rho) = \sum_{l=1}^{[[N]]} \mathcal F^{-1} \left[ W_{\varphi \vec \alpha}^{l,12}(\vec f) \right] (\gamma_l\vec\rho). \end{equation} Calling $\delta=d/\kappa$ the sampling step of the vector $\vec\rho$ with the integer $\kappa\geq 1$, $\vec\rho_l$ is given by \begin{equation} \label{eq:36} \vec \rho_l = \gamma_l\delta(k\vec x + l\vec y),\quad\forall l,k\in \left[0,\cdots,[[NF]]-1\right], \end{equation} $\vec x$ and $\vec y$ are the unit vectors of the X and Y axis, respectively. Eq.~(\ref{eq:35}) becomes \begin{equation} \label{eq:37} C_{\varphi \vec \alpha}^{12}(k,l) = \sum_{l=1}^{[[N]]} \iint {\mathrm d^2}\vec f W_{\varphi \vec \alpha}^{l,12}(\vec f) \exp\left( 2\iota\pi \gamma_l \delta (k f_x + l f_y) \right) \end{equation} Discretizing Eq.~(\ref{eq:37}) with \begin{equation} \label{eq:38} \gamma_l\delta f_x = n/[[NF]],\quad \gamma_l\delta f_y = m/[[NF]],\quad \forall n,m\in \left[0,\cdots,[[NF]]-1\right], \end{equation} Eq.~(\ref{eq:37}) is transformed into \begin{eqnarray} \label{eq:39} C_{\varphi \vec \alpha}^{12}(k,l) &=& \sum_{l=1}^{[[N]]} \sum_{n=0}^{[[NF]]-1} \sum_{m=0}^{[[NF]]-1} \left( 1\over\gamma_l\delta [[NF]]\right)^2 W_{\varphi \vec \alpha}^{l,12}\left({n\over\gamma_l\delta [[NF]]},{m\over\gamma_l\delta [[NF]]}\right) \exp\left( 2\iota\pi {(k n + l m)\over [[NF]]} \right) \\ &=& \sum_{n=0}^{[[NF]]-1} \sum_{m=0}^{[[NF]]-1} W_{\varphi \vec \alpha}^{12}(n,m) \exp\left( 2\iota\pi {(k n + l m)\over [[NF]]} \right) \end{eqnarray} with \begin{equation} \label{eq:40} W_{\varphi \vec \alpha}^{12}(n,m) = \sum_{l=1}^{[[N]]} \left( 1\over\gamma_l\delta [[NF]]\right)^2 W_{\varphi \vec \alpha}^{l,12}\left({n\over\gamma_l\delta [[NF]]},{m\over\gamma_l\delta [[NF]]}\right) \end{equation} %% Rectangular 2RBT <>= dim3 blockDimPsd(16,16); dim3 gridDimPsd( 1+NF/16 , 1+NF/16 ); dim3 blockDimCov(16,16); dim3 gridDimCov( 1+NU/16 , 1+NU/16 ); int i_P_SRC, j_S_SRC, offset0, offset; INFO("Covariance %dx%d block:\n",N_P_SRC,N_S_SRC); for (i_P_SRC=0;i_P_SRCwavelength, atm->r0, atm->turbulence.L0, atm->d__layers, atm->N_LAYER, (phase_src->dev_ptr) + i_P_SRC, (slopes_src->dev_ptr) + j_S_SRC, d__alpha, kappa, shift); <> <> offset = offset0 + (2*j_S_SRC+1); //INFO(",%2d)",offset); paPowerSpectrum LLL gridDimPsd,blockDimPsd RRR (d__psd, NF, osf, lenslet_pitch, atm->wavelength, atm->r0, atm->turbulence.L0, atm->d__layers, atm->N_LAYER, (phase_src->dev_ptr) + i_P_SRC, (slopes_src->dev_ptr) + j_S_SRC, d__beta, kappa, shift); <> <> } INFO("\n"); } @ The power spectrum of the average of the covariance over a continous sample of sources within a circle of angular radius $\zeta$ is given by: \begin{equation} \label{eq:45} W_{\varphi \vec \alpha}^{l,\zeta 2}(\vec f) = {1\over \pi\zeta^2} \int_0^\zeta {\mathrm d}z z \int_0^{2\pi}{\mathrm d} \theta -\iota\lambda\left( \vec f \cdot \vec\alpha \right) W_\varphi(\vec f) G(\gamma_l \vec f) \exp\left( 2\iota\pi h_l\vec f \cdot (\vec\theta_2-\vec\theta) \right), \end{equation} Performing the integration leads to: \begin{equation} \label{eq:45} W_{\varphi \vec \alpha}^{l,\zeta 2}(\vec f) = -\iota\lambda\left( \vec f \cdot \vec\alpha \right) W_\varphi(\vec f) G(\gamma_l \vec f) {2J_1(2\pi h_lf\zeta)\over 2\pi h_lf\zeta} \exp\left( 2\iota\pi h_l\vec f \cdot \vec\theta_2 \right), \end{equation} <>= dim3 blockDimPsd(16,16); dim3 gridDimPsd( 1+NF/16 , 1+NF/16 ); dim3 blockDimCov(16,16); dim3 gridDimCov( 1+NU/16 , 1+NU/16 ); int i_P_SRC, j_S_SRC, offset0, offset; INFO("Polar average covariance %dx%d block:\n",N_P_SRC,N_S_SRC); for (i_P_SRC=0;i_P_SRCwavelength, atm->r0, atm->turbulence.L0, atm->d__layers, atm->N_LAYER, (slopes_src->dev_ptr) + j_S_SRC, d__alpha, kappa, shift, z_radius); <> <> offset = offset0 + (2*j_S_SRC+1); //INFO(",%2d)",offset); paPolarPowerSpectrum LLL gridDimPsd,blockDimPsd RRR (d__psd, NF, osf, lenslet_pitch, atm->wavelength, atm->r0, atm->turbulence.L0, atm->d__layers, atm->N_LAYER, (slopes_src->dev_ptr) + j_S_SRC, d__beta, kappa, shift, z_radius); <> <> } INFO("\n"); } @ The power spectrum is computed with the following kernels: <>= __global__ void paPowerSpectrum(float2 *d__psd, int NF, int osf, float d, float wavelength, float r0, float L0, layer *turb, int N_LAYER, source *src1, source *src2, float *alpha, float kappa, int shift) { int i, j, k; float fx, fy, f_square, fs, gl, red, c_red, s_red, wavenumber, src_x, src_y; i = blockIdx.x * blockDim.x + threadIdx.x; j = blockIdx.y * blockDim.y + threadIdx.y; k = i*NF + j; if ( (iheight); red = 2*PI; src_x = src2->theta_x - src1->theta_x; src_x *= turb[k_LAYER].altitude; src_y = src2->theta_y - src1->theta_y; src_y *= turb[k_LAYER].altitude; fs = osf*(2.0/NF)*(kappa/(2*d*gl)); fx = (float) i; fy = (float) j; if (i>=(NF/2)) fx = fx - NF; if (j>=(NF/2)) fy = fy - NF; fx *= fs; fy *= fs; f_square = fx*fx + fy*fy; fs *= fs; red *= fx*(src_x + 0.5*d*shift) + fy*(src_y + 0.5*d*shift); sincosf( red, &s_red, &c_red); red = -wavenumber*turb[k_LAYER].xi0*fs*wavelength*(fx*alpha[0] + fy*alpha[1])* powf(r0,-5.0/3.0)* phasePowerSpectrum(f_square, 1.0/L0)* gateAmp(gl*fx*d,gl*fy*d); d__psd[k].x += -red*s_red; d__psd[k].y += red*c_red; } } } @ The average of the power spectrum on an angular disc of radius $[z_radius]$ is <>= __global__ void paPolarPowerSpectrum(float2 *d__psd, int NF, int osf, float d, float wavelength, float r0, float L0, layer *turb, int N_LAYER, source *src2, float *alpha, float kappa, int shift, float z_radius) { int i, j, k; float fx, fy, f_square, f, fs, gl, red, c_red, s_red, wavenumber, src_x, src_y, b; i = blockIdx.x * blockDim.x + threadIdx.x; j = blockIdx.y * blockDim.y + threadIdx.y; k = i*NF + j; if ( (itheta_x; src_x *= turb[k_LAYER].altitude; src_y = src2->theta_y; src_y *= turb[k_LAYER].altitude; fs = osf*(2.0/NF)*(kappa/(2*d*gl)); fx = (float) i; fy = (float) j; if (i>=(NF/2)) fx = fx - NF; if (j>=(NF/2)) fy = fy - NF; fx *= fs; fy *= fs; f = hypot(fx,fy); f_square = f*f; fs *= fs; b = 2*turb[k_LAYER].altitude*f*z_radius; red *= fx*(src_x + 0.5*d*shift) + fy*(src_y + 0.5*d*shift); sincosf( red, &s_red, &c_red); red = -wavenumber*turb[k_LAYER].xi0*fs*wavelength*(fx*alpha[0] + fy*alpha[1])* powf(r0,-5.0/3.0)* phasePowerSpectrum(f_square, 1.0/L0)* gateAmp(gl*fx*d,gl*fy*d)* somb(b); d__psd[k].x += -red*s_red; d__psd[k].y += red*c_red; } } } @ <>= pa_covariance_extraction LLL gridDimCov,blockDimCov RRR( d__cov + offset*NU2, NU, d__psd, NF, kappa); @ The extraction of the covariance is done with the kernel: <>= __global__ void pa_covariance_extraction(float *cov_out, int NC_out, float2 *cov_in, int NC_in, float kappa) { int i, j, k_out, k_in, h; i = blockIdx.x * blockDim.x + threadIdx.x; j = blockIdx.y * blockDim.y + threadIdx.y; if ( (i(NC_out/2)) i = kappa*(i - NC_out) + NC_in; else i *= kappa; if (j>(NC_out/2)) j = kappa*(j - NC_out) + NC_in; else j *= kappa; k_in = i*NC_in + j; cov_out[k_out] = cov_in[k_in].x; } } @ %% Rectangular 2RBT <>= dim3 blockDimPsd(16,16); dim3 gridDimPsd( 1+NF/16 , 1+NF/16 ); dim3 blockDimCov(16,16); dim3 gridDimCov( 1+NU/16 , 1+NU/16 ); int i_P_SRC, j_S_SRC, k_LAYER; INFO("Covariance %dx%d block:\n",N_P_SRC*atm->N_LAYER,N_S_SRC); int CNU2 = 0; for (i_P_SRC=0;i_P_SRCN_LAYER;k_LAYER++) { NU = M_LAYER[k_LAYER]+N-1; for (j_S_SRC=0;j_S_SRCwavelength, atm->r0, atm->turbulence.L0, atm->d__layers, atm->N_LAYER, k_LAYER, (phase_src->dev_ptr) + i_P_SRC, (slopes_src->dev_ptr) + j_S_SRC, d__alpha, kappa,shift); <> pa_covariance_extraction LLL gridDimCov,blockDimCov RRR( d__cov + CNU2, NU, d__psd, NF, kappa); CNU2 += NU*NU; //INFO(",%2d)",CNU2); paPowerSpectrum LLL gridDimPsd,blockDimPsd RRR ( d__psd, NF, osf, lenslet_pitch, atm->wavelength, atm->r0, atm->turbulence.L0, atm->d__layers, atm->N_LAYER, k_LAYER, (phase_src->dev_ptr) + i_P_SRC, (slopes_src->dev_ptr) + j_S_SRC, d__beta, kappa,shift); <> pa_covariance_extraction LLL gridDimCov,blockDimCov RRR( d__cov + CNU2, NU, d__psd, NF, kappa); CNU2 += NU*NU; } INFO("\n"); } } <>= __global__ void paPowerSpectrum(float2 *d__psd, int NF, int osf, float d, float wavelength, float r0, float L0, layer *turb, int N_LAYER, int k_LAYER, source *src1, source *src2, float *alpha, float kappa, int shift) { int i, j, k; float fx, fy, f_square, fs, gl, red, c_red, s_red, wavenumber, src_x, src_y; i = blockIdx.x * blockDim.x + threadIdx.x; j = blockIdx.y * blockDim.y + threadIdx.y; k = i*NF + j; if ( (iheight); red = 2*PI; src_x = src2->theta_x - src1->theta_x; src_x *= turb[k_LAYER].altitude; src_y = src2->theta_y - src1->theta_y; src_y *= turb[k_LAYER].altitude; fs = osf*(2.0/NF)*(kappa/(2*d*gl)); fx = (float) i; fy = (float) j; if (i>=(NF/2)) fx = fx - NF; if (j>=(NF/2)) fy = fy - NF; fx *= fs; fy *= fs; f_square = fx*fx + fy*fy; fs *= fs; red *= fx*(src_x + 0.5*d*shift) + fy*(src_y + 0.5*d*shift); sincosf( red, &s_red, &c_red); red = -wavenumber*turb[k_LAYER].xi0*fs*wavelength*(fx*alpha[0] + fy*alpha[1])* powf(r0,-5.0/3.0)* phasePowerSpectrum(f_square, 1.0/L0)* gateAmp(gl*fx*d,gl*fy*d); d__psd[k].x += -red*s_red; d__psd[k].y += red*c_red; } } @ The memory is freed with the following routine: \index{aaStats!paStats!cleanup} <>= void paStats::cleanup(void) { INFO("@(CEO)>paStats: freeing memory!\n"); free(M_LAYER); <> } @ \subsection{Input/Output} \label{sec:inputoutput-1} The main parameters of [[paStats]] are displayed with the info routine: \index{aaStats!paStats!info} <>= void paStats::info(int kappa, float d) { INFO("\n\x1B[1;42m@(CEO)>paStats:\x1B[;42m\n"); <> INFO(" . phase oversampling factor : %d\n",osf); INFO("----------------------------------------------------\x1B[0m\n"); } @ The [[paStats]] covariance can be written to a file with: \index{aaStats!paStats!toFile} <>= void paStats::toFile(char *filename) { dev2file(filename, d__cov, cov_size/sizeof(float)); } @ \subsection{Miscellaneous} \label{sec:miscellaneous-1} The phase power spectrum is given by: <>= __device__ float phasePowerSpectrum(float f_square, float f0) { return 0.022895587108555*powf(f_square + f0*f0,-11.0/6.0); } @ The subaperture Fourier transform is given by \begin{equation} \label{eq:41} {\mathrm sinc}(f_x){\mathrm sinc}(f_y) \end{equation} and written in <>= __device__ float gateAmp(float fx, float fy) { float out; out = sinc(fx)*sinc(fy); return out; } @ The subaperture PSF is given by \begin{equation} \label{eq:42} {\mathrm sinc}^2(f_x){\mathrm sinc}^2(f_y) \end{equation} and written in <>= __device__ float gatePSF(float fx, float fy) { float out; out = sinc(fx)*sinc(fy); return out*out; } @ that depends on the $\mathrm sinc$ function: \begin{equation} \label{eq:43} {\sin(\pi x)\over \pi x} \end{equation} <>= __device__ float sinc(float x) { return (x==0) ? 1.0 : sinf( PI*x) / (PI*x) ; } @ The Sombrero function is defined as: \begin{equation} \label{eq:44} {\mathrm somb}(x) = {2J_1(x)\over x} \end{equation} <>= __device__ float somb(float x) { return (x==0) ? 1.0 : 2 * j1(PI*x) / (PI*x); } @ \section{Tests} \label{sec:tests} The test routine is written: <>= #include #include #include using namespace std; #include "cublas_v2.h" #include "h" #include "aaStats.h" #include "BTBT.h" using namespace std; #define N_RUN 5 #define N_SAMPLE 100 __global__ void fill(float *A, int m_a, int n_a) { int i, j, k; float n; i = blockIdx.x * blockDim.x + threadIdx.x; j = blockIdx.y * blockDim.y + threadIdx.y; k = i*m_a +j; n = (float) m_a*n_a; if (k>> ERROR! CUBLAS FAILED! <<<\n"); HANDLE_ERROR( cudaFree( d__A ) ); } else aa_full_mvm_et[k_N]=0; aa.cleanup(); C.cleanup(); HANDLE_ERROR( cudaFree( d__x ) ); HANDLE_ERROR( cudaFree( d__y ) ); free(x); free(y); } for (k_N=0;k_N>> ERROR! CUBLAS FAILED! <<<\n"); HANDLE_ERROR( cudaFree( d__A ) ); } else pa_full_mvm_et[k_N]=0; pa.cleanup(); S.cleanup(); HANDLE_ERROR( cudaFree( d__x ) ); HANDLE_ERROR( cudaFree( d__y ) ); free(x); free(y); } for (k_N=0;k_N>= kern = parallel.gpu.CUDAKernel('aaStats.ptx','aaStats.cu','MVM_kernel'); kern.ThreadBlockSize = [16,16]; kern.GridSize = [2,2]; d1 = 2/3; d2 = 2/3; NU = 7; N1 = 4; N2 = 4; g1 = 1; g2 = 1; [x,y] = meshgrid( linspace(-1,1,4) ); z = x(:) + 1i*y(:); c = abs( bsxfun( @minus, z , z.') ); idx = [13 9 5 1 2 3 4]; cs = mat2cell( c , ones(1,4)*4,ones(1,4)*4); cov = cell2mat(cellfun( @(x) x(idx)', cs(idx) , 'UniformOutput', false)); vout = gpuArray.zeros(16^2,1,'single'); vin = gpuArray.ones(16^2,1,'single'); sampling = 2/3;