% -*- mode: Noweb; noweb-code-mode: c-mode -*- @ \index{iterativeSolvers} \section{The files} \subsection{Header} <>= #ifndef __ITERATIVESOLVERS_H__ #define __ITERATIVESOLVERS_H__ #ifndef __UTILITIES_H__ #include "utilities.h" #endif #ifndef __BTBT_H__ #include "BTBT.h" #endif //#define MINRES_DEBUG struct iterativeSolvers { <> void cg_setup(int n_vector); void pcg_setup(int n_vector); void minres_setup(int n_vector); void cleanup(void); void cg(float *x, BTBT* A, float *b, int max_it, float* x0); void pcg(float *x, BTBT* A, float *b, int max_it, float* x0, float ip); void sym_ortho(float *c,float *s,float *r, float *a,float *b); void lanczos_step(float *alpha, float *beta, float *nu_kp1, BTBT *A, float *nu_k, float *nu_km1, float sigma); void minres_vorst(float *x, BTBT* A, float *b, float* x0); void minres_vorst(float *x, BTBT* A, float *b, int max_it, float* x0); void minres_vorst(float *x, BTBT* A, float *b, float rtol, float* x0); void pminres_vorst(float *x, BTBT* A, float *b, int max_it, float* x0, float ip); void minres_vorst(float *x, float *res, BTBT* A, float *b, int max_it, float* x0); void minres_choi(float *x, BTBT* A, float *b, int max_it, float* x0); // void info(void); }; #endif // __ITERATIVESOLVERS_H__ @ \subsection{Source} <>= #include "iterativeSolvers.h" <> <> <> <> <> <> <> <> <> <> <> <> <> @ \section{Parameters} \index{iterativeSolvers!iterativeSolvers} The structure parameters are <>= float *d__vectors, *q, *x, *r, *p, *z, *nu_i, *nu_im1, *nu_ip1, *w_i, *w_im1, *w_im2; float rnorm, rel_rnorm, mean_time_per_iteration, RTOL, ATOL; int N, N_ITERATION, cvgce_iteration; stopwatch tid; cublasHandle_t handle; cublasStatus_t status; char VERBOSE; @ \section{Functions} \subsection{Setup \& Cleanup} The iterativeSolvers structure gather several routines implementing iterative methods to solve the linear system: $Ax=b$. The setup method is allocated the required memory depending of the iterative method type: \begin{itemize} \item \index{iterativeSolvers!iterativeSolvers!cg\_setup} <>= void iterativeSolvers::cg_setup(int n_vector) { INFO("@(CEO)>iterativeSolvers: CONJUGATE GRADIENT\n"); N = n_vector; HANDLE_ERROR( cudaMalloc((void**)&d__vectors, sizeof(float)*N*4 ) ); q = d__vectors; x = d__vectors + N; r = d__vectors + 2*N; p = d__vectors + 3*N; cublasCreate(&handle); } <>= void iterativeSolvers::pcg_setup(int n_vector) { INFO("@(CEO)>iterativeSolvers: CONJUGATE GRADIENT\n"); N = n_vector; HANDLE_ERROR( cudaMalloc((void**)&d__vectors, sizeof(float)*N*5 ) ); q = d__vectors; x = d__vectors + N; r = d__vectors + 2*N; p = d__vectors + 3*N; z = d__vectors + 4*N; cublasCreate(&handle); } @ \item \index{iterativeSolvers!iterativeSolvers!minres\_setup} <>= void iterativeSolvers::minres_setup(int n_vector) { INFO("@(CEO)>iterativeSolvers: MINRES\n"); N = n_vector; HANDLE_ERROR( cudaMalloc((void**)&d__vectors, sizeof(float)*N*6 ) ); nu_i = d__vectors; nu_im1 = d__vectors + N; nu_ip1 = d__vectors + 2*N; w_i = d__vectors + 3*N; w_im1 = d__vectors + 4*N; w_im2 = d__vectors + 5*N; cublasCreate(&handle); VERBOSE = 0; ATOL = 0; RTOL = 5E-2; N_ITERATION = 100; } @ \end{itemize} Memory is freed with \index{iterativeSolvers!iterativeSolvers!cleanup} <>= void iterativeSolvers::cleanup(void) { INFO("@(CEO)>iterativeSolvers: freeing memory!\n"); HANDLE_ERROR( cudaFree( d__vectors ) ); cublasDestroy(handle); } @ \subsection{Conjugate gradient} \label{sec:conjugate-gradient} The next routine implements the conjugate gradient iterative method: \index{iterativeSolvers!iterativeSolvers!cg} <>= void iterativeSolvers::cg(float *x, BTBT* A, float *b, int max_it, float* x0) { float alpha, beta, gamma; int k; <> tid.tic(); for (k=0;k> if (k<(max_it-1)) { <> } } tid.toc(&mean_time_per_iteration); mean_time_per_iteration /= max_it; } @ The conjugate gradient algorithm is written: \begin{itemize} \item initialization: \begin{enumerate} \item $r=Ax_0$ <>= A->MVM(r , x0); @ \item $r_0 = b - r_0$ <>= beta = -1; cublasSscal(handle, N, &beta, r, 1); alpha = 1; cublasSaxpy(handle, N, &alpha, b, 1, r, 1); @ \item $p_0=r_0$ <>= cublasScopy(handle, N, r, 1, p, 1); @ \end{enumerate} \item loop: \begin{enumerate} \item $q=Ap_k$ <>= A->MVM(q , p); @ \item $\gamma=r_k^Tr_k$ <>= cublasSdot(handle, N, r, 1, r, 1, &gamma); //printf("r norm=%6.2E\n",sqrt(gamma)); @ \item $\left\| r \right\|_2=\sqrt\gamma$ <>= rnorm = sqrt(gamma); if (gamma==0) break; @ \item $\alpha_k = {\displaystyle \gamma\over p_K^Tq}$ <>= cublasSdot(handle, N, p, 1, q, 1, &alpha); alpha = gamma/alpha; @ \item $x_{k+1} = x_k + \alpha_kp_k$ <>= cublasSaxpy(handle, N, &alpha, p, 1, x, 1); @ \item $r_{k+1} = r_k - \alpha_kq$ <>= alpha = -alpha; cublasSaxpy(handle, N, &alpha, q, 1, r, 1); @ \item $\beta_k = {\displaystyle r_{k+1}^Tr_{k+1} \over \gamma} $ <>= cublasSdot(handle, N, r, 1, r, 1, &beta); beta = beta/gamma; @ \item $p_{k+1} = r_{k+1} + \beta_k p_k$ <>= cublasSscal(handle, N, &beta, p, 1); alpha = 1; cublasSaxpy(handle, N, &alpha, r, 1, p, 1); @ \end{enumerate} \end{itemize} \subsection{Preconditioned conjugate gradient} \label{sec:precond-conjugate-gradient} The next routine implements the conjugate gradient iterative method with Jacobi preconditioner. The Jacobi preconditioner $M$ is given by \begin{equation} \label{eq:1} M = \operatorname{diag}\left( A \right) = \sigma_A^2 I, \end{equation} and its inverse is simply \begin{equation} \label{eq:2} M^{-1} = \sigma_A^{-2} I = [[ip]] I. \end{equation} \index{iterativeSolvers!iterativeSolvers!pcg} <>= void iterativeSolvers::pcg(float *x, BTBT* A, float *b, int max_it, float* x0, float ip) { float alpha, beta, gamma; int k; <> tid.tic(); for (k=0;k> if (k<(max_it-1)) { <> } } tid.toc(&mean_time_per_iteration); mean_time_per_iteration /= max_it; } @ The preconditioned conjugate gradient algorithm is written: \begin{itemize} \item initialization: \begin{enumerate} \item $r=Ax_0$ <>= A->MVM(r , x0); @ \item $r_0 = b - r_0$ <>= beta = -1; cublasSscal(handle, N, &beta, r, 1); alpha = 1; cublasSaxpy(handle, N, &alpha, b, 1, r, 1); @ \item $z_0=M^{-1}r_0$ <>= cublasScopy(handle, N, r, 1, z, 1); cublasSscal(handle, N, &ip, z, 1); @ \item $p_0=z_0$ <>= cublasScopy(handle, N, z, 1, p, 1); @ \end{enumerate} \item loop: \begin{enumerate} \item $q=Ap_k$ <>= A->MVM(q , p); @ \item $\gamma=r_k^Tz_k$ <>= cublasSdot(handle, N, r, 1, z, 1, &gamma); //printf("r norm=%6.2E\n",sqrt(gamma)); @ \item $\left\| r \right\|_2=\sqrt\gamma$ <>= rnorm = sqrt(gamma); if (gamma==0) break; @ \item $\alpha_k = {\displaystyle \gamma\over p_K^Tq}$ <>= cublasSdot(handle, N, p, 1, q, 1, &alpha); alpha = gamma/alpha; @ \item $x_{k+1} = x_k + \alpha_kp_k$ <>= cublasSaxpy(handle, N, &alpha, p, 1, x, 1); @ \item $r_{k+1} = r_k - \alpha_kq$ <>= alpha = -alpha; cublasSaxpy(handle, N, &alpha, q, 1, r, 1); @ \item $z_{k+1} = M^{-1}r_{k+1}$ <>= cublasScopy(handle, N, r, 1, z, 1); cublasSscal(handle, N, &ip, z, 1); @ \item $\beta_k = {\displaystyle r_{k+1}^Tz_{k+1} \over \gamma} $ <>= cublasSdot(handle, N, r, 1, z, 1, &beta); beta = beta/gamma; @ \item $p_{k+1} = z_{k+1} + \beta_k p_k$ <>= cublasSscal(handle, N, &beta, p, 1); alpha = 1; cublasSaxpy(handle, N, &alpha, z, 1, p, 1); @ \end{enumerate} \end{itemize} \subsection{MINRES} \label{sec:minres} \subsubsection{Vorst implementation} \label{sec:vorst-implementation} The MINRES iterative solver~\cite{PS75,Vorst03,Choi06} method is implemented in the next routine: \index{iterativeSolvers!iterativeSolvers!minres\_vorst} <>= void iterativeSolvers::minres_vorst(float *x, BTBT* A, float *b, float* x0) { float alpha, beta, gamma_i, gamma_im1, sigma_i, sigma_im1, eta, delta, rho1, rho2, rho3, rnorm0; int k; <> // tid.tic(); for (k=0;k> } // tid.toc(&mean_time_per_iteration); // mean_time_per_iteration /= max_it; cvgce_iteration = k; INFO("MINRES converged at iteration %d to a solution with relative residual %.2E.\n", k,rel_rnorm); } <>= void iterativeSolvers::minres_vorst(float *x, BTBT* A, float *b, int max_it, float* x0) { float alpha, beta, gamma_i, gamma_im1, sigma_i, sigma_im1, eta, delta, rho1, rho2, rho3, rnorm0; int k; <> // tid.tic(); for (k=0;k> } // tid.toc(&mean_time_per_iteration); // mean_time_per_iteration /= max_it; INFO("MINRES converged at iteration %d to a solution with relative residual %.2E.\n", k,rel_rnorm); } <>= void iterativeSolvers::minres_vorst(float *x, BTBT* A, float *b, float rtol, float* x0) { float alpha, beta, gamma_i, gamma_im1, sigma_i, sigma_im1, eta, delta, rho1, rho2, rho3, rnorm0; int k; <> RTOL = rtol; // tid.tic(); for (k=0;k> } // tid.toc(&mean_time_per_iteration); // mean_time_per_iteration /= max_it; INFO("MINRES converged at iteration %d to a solution with relative residual %.2E.\n", k,rel_rnorm); } <>= void iterativeSolvers::minres_vorst(float *x, float *res, BTBT* A, float *b, int max_it, float* x0) { float alpha, beta, gamma_i, gamma_im1, sigma_i, sigma_im1, eta, delta, rho1, rho2, rho3, res_cst, result, rnorm0; int k; float *residual; HANDLE_ERROR( cudaMalloc( (void**)&residual, sizeof(float)*N ) ); <> tid.tic(); for (k=0;k> A->MVM(residual , x); res_cst = -1; cublasSscal(handle, N, &res_cst, residual, 1); res_cst = 1; cublasSaxpy(handle, N, &res_cst, b, 1, residual, 1); cublasSnrm2(handle, N, residual, 1, &result); res[k] = result; INFO("(%3d): Residue norm: %.2E\n",k,result); } tid.toc(&mean_time_per_iteration); mean_time_per_iteration /= max_it; HANDLE_ERROR( cudaFree( residual) ); } @ The MINRES algorithm is written: \begin{itemize} \item initialization: \begin{enumerate} \item $r=Ax_0$ <>= A->MVM(nu_i , x0); @ \item $r_0 = b - r_0$ <>= beta = -1; cublasSscal(handle, N, &beta, nu_i, 1); alpha = 1; cublasSaxpy(handle, N, &alpha, b, 1, nu_i, 1); #ifdef MINRES_DEBUG cublasSnrm2(handle, N, b, 1, &beta); INFO("beta=%.2E\n",beta); #endif @ \item $\beta_1=\left\| \nu_1 \right\|_2=\left\| r \right\|_2$ <>= //cublasSnrm2(handle, N, nu_i, 1, &beta); cublasSdot(handle, N, nu_i, 1, nu_i, 1, &beta); beta = sqrtf(beta); rnorm = beta; //rnorm0 = rnorm; cublasSnrm2(handle, N, b, 1, &rnorm0); rel_rnorm = rnorm/rnorm0; if (VERBOSE) INFO("#%3d: r norm=%6.2E - rel. tol.=%6.2E\n",0,rnorm,rnorm/rnorm0); if ( (rnorm>= eta = beta; gamma_i = gamma_im1 = 1; sigma_i = sigma_im1 = 0; @ \item $\nu_0=0$, $w_0=w_{-1}=0$ <>= HANDLE_ERROR( cudaMemset(nu_im1, 0, sizeof(float)*N ) ); HANDLE_ERROR( cudaMemset(w_im1 , 0, sizeof(float)*N ) ); HANDLE_ERROR( cudaMemset(w_im2 , 0, sizeof(float)*N ) ); @ \end{enumerate} \item loop: \begin{enumerate} \item $\nu_i = \nu_i / \beta_i$ <>= rho1 = 1/beta; cublasSscal(handle, N, &rho1, nu_i, 1); @ \item $\nu_{i+1}= A\nu_i$ <>= A->MVM(nu_ip1 , nu_i); @ \item $\alpha_i = \nu_i^T\nu_{i+1}$ <>= cublasSdot(handle, N, nu_i, 1, nu_ip1, 1, &alpha); #ifdef MINRES_DEBUG INFO("alpha=%.2E\n",alpha); #endif @ \item $\nu_{i+1} = \nu_{i+1} - \alpha_i\nu_i - \beta_i\nu_{i-1}$ <>= alpha = -alpha; beta = -beta; cublasSaxpy(handle, N, &alpha, nu_i, 1, nu_ip1, 1); cublasSaxpy(handle, N, &beta, nu_im1, 1, nu_ip1, 1); alpha = -alpha; beta = -beta; @ \item $\delta = \gamma_i\alpha_i - \gamma_{i-1}\sigma_i\beta_i$ <>= delta = gamma_i*alpha - gamma_im1*sigma_i*beta; #ifdef MINRES_DEBUG INFO("delta=%.2E\n",delta); #endif @ \item $\rho_2 = \sigma_i\alpha_i + \gamma_{i-1}\gamma_i\beta_i$ <>= rho2 = sigma_i*alpha + gamma_im1*gamma_i*beta; #ifdef MINRES_DEBUG INFO("rho2=%.2E\n",rho2); #endif @ \item $\rho_3 = \sigma_{i-1}\beta_i$ <>= rho3 = sigma_im1*beta; #ifdef MINRES_DEBUG INFO("rho3=%.2E\n",rho3); #endif @ \item $\beta_{i+1}=\left\| \nu_{i+1} \right\|_2$ <>= //cublasSnrm2(handle, N, nu_ip1, 1, &beta); cublasSdot(handle, N, nu_ip1, 1, nu_ip1, 1, &beta); beta = sqrtf(beta); #ifdef MINRES_DEBUG INFO("beta=%.2E\n",beta); #endif @ \item $\rho_1 = \sqrt{\delta^2+\beta_{i+1}^2}$ <>= rho1 = hypotf(delta,beta); #ifdef MINRES_DEBUG INFO("rho1=%.2E\n",rho1); #endif @ \item $\gamma_{i+1} = \delta / \rho_1$ <>= gamma_im1 = gamma_i; gamma_i = delta/rho1; #ifdef MINRES_DEBUG INFO("gamma_im1=%.2E\n",gamma_im1); INFO("gamma_i=%.2E\n",gamma_i); #endif @ \item $\sigma_{i+1} = \beta_{i+1} / \rho_1$ <>= sigma_im1 = sigma_i; sigma_i = beta/rho1; #ifdef MINRES_DEBUG INFO("sigma_im1=%.2E\n",sigma_im1); INFO("sigma_i=%.2E\n",sigma_i); #endif @ \item $w_i = (\nu_i-\rho_3w_{i-2}-\rho_2w_{i-1})/\rho_1$ <>= cublasScopy(handle, N, nu_i, 1, w_i, 1); rho1 = 1.0/rho1; cublasSscal(handle, N, &rho1, w_i, 1); rho3 = -rho3*rho1; cublasSaxpy(handle, N, &rho3, w_im2, 1, w_i, 1); rho2 = -rho2*rho1; cublasSaxpy(handle, N, &rho2, w_im1, 1, w_i, 1); @ \item $x_i = x_{i-1} + \gamma_{i+1}\eta w_i$ <>= rho1 = gamma_i*eta; cublasSaxpy(handle, N, &rho1, w_i, 1, x, 1); @ \item $\left\| r_i \right\|_2 = \left| \sigma_{i+1} \right|\left\| r_{i-1} \right\|_2 $ <>= rnorm = rnorm*fabs(sigma_i); rel_rnorm = rnorm/rnorm0; if (VERBOSE) INFO("#%3d: r norm=%6.2E - rel. tol.=%6.2E\n",k,rnorm,rnorm/rnorm0); if ( (rnorm>= eta = -sigma_i*eta; #ifdef MINRES_DEBUG INFO("eta=%.2E\n",eta); #endif @ \item \begin{eqnarray} \label{eq:3} \nu_{i-1} &\leftarrow& \nu_i \nonumber\\ \nu_{i} &\leftarrow& \nu_{i+1} \nonumber\\ w_{i-2} &\leftarrow& w_{i-1} \nonumber\\ w_{i-1} &\leftarrow& \nu_{i} \nonumber \end{eqnarray} <>= cublasScopy(handle, N, nu_i , 1, nu_im1, 1); cublasScopy(handle, N, nu_ip1, 1, nu_i , 1); cublasScopy(handle, N, w_im1 , 1, w_im2 , 1); cublasScopy(handle, N, w_i , 1, w_im1 , 1); if (beta==0) break; @ \end{enumerate} \end{itemize} \subsubsection{Choi implementation} \label{sec:choi-implementation} \index{iterativeSolvers!iterativeSolvers!minres\_choi} <>= void iterativeSolvers::minres_choi(float *x, BTBT* A, float *b, int max_it, float* x0) { float a, alpha, beta, gamma, delta1, delta2, phi, tau, c, s, epsilon; int k; <> tid.tic(); for (k=0;k> } tid.toc(&mean_time_per_iteration); mean_time_per_iteration /= max_it; } @ The MINRES algorithm is written: \begin{itemize} \item initialization: \begin{enumerate} \item $\beta_1=\left\| b \right\|_2$ <>= cublasSnrm2(handle, N, b, 1, &beta); @ \item $\nu_0 = 0$ <>= HANDLE_ERROR( cudaMemset(nu_im1, 0, sizeof(float)*N ) ); @ \item $\nu_1 = b/\beta_1$ <>= HANDLE_ERROR( cudaMemset(nu_i, 0, sizeof(float)*N ) ); a = 1/beta; cublasSaxpy(handle, N, &a, b, 1, nu_i, 1); @ \item $\phi=\tau=\beta_1$, $\delta^{(1)}_1=0$, $c_0=-1$, $s_0=0$ <>= phi= tau = beta; delta1 = 0; c = -1; s = 0; @ \item $d_0=d_{-1}=0$ <>= HANDLE_ERROR( cudaMemset(w_im1 , 0, sizeof(float)*N ) ); HANDLE_ERROR( cudaMemset(w_im2 , 0, sizeof(float)*N ) ); @ \end{enumerate} \item loop: \begin{enumerate} \item Lanczos step <>= lanczos_step(&alpha, &beta, nu_ip1, A, nu_i, nu_im1, 0); @ \item $\delta^{(2)}_k=c_{k-1}\delta^{(1)}_k + s_{k-1}\alpha_k$ <>= delta2 = c*delta1 + s*alpha; @ \item $\gamma^{(1)}_k=s_{k-1}\delta^{(1)}_k - c_{k-1}\alpha_k$ <>= gamma = s*delta1 - c*alpha; @ \item $\epsilon_k = s_{k-1}\beta_{k+1}$ <>= epsilon = s*beta; @ \item $\delta^{(1)}_k = -c_{k-1}\beta_{k+1}$ <>= delta1 = -c*beta; @ \item sym--ortho <>= a = gamma; sym_ortho(&c, &s, &gamma, &a, &beta); @ \item $\tau_k = c_k\phi_k$ <>= tau = c*phi; @ \item $\phi_k = s_k\phi_{k-1}$ <>= phi = s*phi; //INFO("r norm=%6.2E\n",phi); @ \item $\left\| r \right\|_2 = \phi_k$ <>= rnorm = phi; @ \item $d_k = (\nu_k-\delta^{(2)}_k d_{k-1}-\epsilon_k d_{k-2})/\gamma_k$ <>= if (gamma==0) break; cublasScopy(handle, N, nu_i, 1, w_i, 1); a = 1/gamma; cublasSscal(handle, N, &a, w_i, 1); a = -delta2/gamma; cublasSaxpy(handle, N, &a, w_im1, 1, w_i, 1); a = -epsilon/gamma; cublasSaxpy(handle, N, &a, w_im2, 1, w_i, 1); @ \item $x_k = x_{k-1} + \tau d_k$ <>= cublasSaxpy(handle, N, &tau, w_i, 1, x, 1); @ \item \begin{eqnarray} \label{eq:1a} \nu_{i-1} &\leftarrow& \nu_i \nonumber\\ \nu_{i} &\leftarrow& \nu_{i+1} \nonumber\\ d_{i-2} &\leftarrow& d_{i-1} \nonumber\\ d_{i-1} &\leftarrow& d_{i} \nonumber \end{eqnarray} <>= cublasScopy(handle, N, nu_i , 1, nu_im1, 1); cublasScopy(handle, N, nu_ip1, 1, nu_i , 1); cublasScopy(handle, N, w_im1 , 1, w_im2 , 1); cublasScopy(handle, N, w_i , 1, w_im1 , 1); @ \end{enumerate} \end{itemize} @ The sym--ortho function is: \index{iterativeSolvers!iterativeSolvers!sym\_ortho} <>= void iterativeSolvers::sym_ortho(float *c,float *s,float *r,float *a,float *b) { float abs_a, abs_b, tau; abs_a = fabs(*a); abs_b = fabs(*b); if (*b==0) { *s = 0; *r = abs_a; *c = (*a==0) ? 1 : sign(*a); } else if (*a==0) { *c = 0; *s = sign(*b); *r = abs_b; } else if (abs_b>abs_a) { tau = *a/(*b); *s = sign(*b)/sqrtf(1+tau*tau); *c = *s*tau; *r = *b/(*s); } else if (abs_a>abs_b) { tau = *b/(*a); *c = sign(*a)/sqrtf(1+tau*tau); *s = *c*tau; *r = *a/(*c); } } @ The Lanzcos step is: \index{iterativeSolvers!iterativeSolvers!lanczos\_step} <>= void iterativeSolvers::lanczos_step(float *alpha, float *beta, float *nu_kp1, BTBT *A, float *nu_k, float *nu_km1, float sigma) { float a; A->MVM(nu_kp1, nu_k); if (sigma!=0) { a = -sigma; cublasSaxpy(handle, N, &a, nu_k, 1, nu_kp1, 1); } cublasSdot(handle, N, nu_k, 1, nu_kp1, 1, alpha); a = -(*alpha); cublasSaxpy(handle, N, &a, nu_k, 1, nu_kp1, 1); a = -(*beta); cublasSaxpy(handle, N, &a, nu_km1, 1, nu_kp1, 1); cublasSnrm2(handle, N, nu_kp1, 1, beta); if (beta!=0) { a = 1/(*beta); cublasSscal(handle, N, &a, nu_kp1, 1); } } @ \section{Tests} \label{sec:tests} The test routine is: <>= #include "cuda_profiler_api.h" #ifndef __CEO_H__ #include "ceo.h" #endif //#define N 10 <> __global__ void fill(float *data, int n_data, float value) { int k = blockIdx.x * blockDim.x + threadIdx.x; if (k> } @ In the following a simple test is performed: <>= /* The test requires to replace the MVM with MVM (Test 1) in the code chunk BTBT.cu in the file BTBT.nw */ float *d__x, *d__b; HANDLE_ERROR( cudaMalloc((void**)&d__x, sizeof(float)*N ) ); HANDLE_ERROR( cudaMemset(d__x, 0, sizeof(float)*N ) ); HANDLE_ERROR( cudaMalloc((void**)&d__b, sizeof(float)*N ) ); float b[N] = {1,2,3,4,5,6,7,8,9,10}; HANDLE_ERROR( cudaMemcpy( d__b, b, N*sizeof(float), cudaMemcpyHostToDevice) ); BTBT A; A.setup(N); iterativeSolvers iSolve; /* iSolve.cg_setup(N); */ /* iSolve.cg(d__x, &A, d__b, 5, d__x); */ iSolve.minres_setup(N); iSolve.minres(d__x, &A, d__b, 5, d__x); iSolve.cleanup(); float *x; x = (float *)malloc(sizeof(float)*N); HANDLE_ERROR( cudaMemcpy( x, d__x, N*sizeof(float), cudaMemcpyDeviceToHost) ); for (int k=0;k>= /* The test requires to replace the MVM with MVM (Test 2) in the code chunk BTBT.cu in the file BTBT.nw */ float *x; x = (float *)malloc(sizeof(float)*N); float *d__x, *d__b; HANDLE_ERROR( cudaMalloc((void**)&d__x, sizeof(float)*N ) ); HANDLE_ERROR( cudaMalloc((void**)&d__b, sizeof(float)*N ) ); for (int k=0;k>= <> // r0 = d; <> <> <> @ In the following, all the variables are declared and set to their default or user given values. <>= int N = _N_LENSLET_*2, NP, PS_E_N_PX, k, osf; float *d__x, *d__phase_est, *phase_screen_est; float D, d, delta, delta_e, r0=0.15; char *d__mask; if (argc != 3) { INFO("Usage: %s D CG or MINRES\n",argv[0]); return 1; } char solver[10]; sprintf(solver,"%s",argv[2]); printf("\n___ SOLVER: %s ___\n",solver); D = atof(argv[1]); d = D/N_SIDE_LENSLET; delta = d/_N_PX_PUPIL_; osf = 1; delta_e = d/osf; NP = osf*N_SIDE_LENSLET+1; PS_E_N_PX = NP; PS_E_N_PX *= PS_E_N_PX; printf("\nd =%.4f\n",d); printf("\ndelta=%.4f\n",delta); printf("\ndelta_e=%.4f\n",delta_e); source src, gs; atmosphere atm; mask pupil_mask; imaging lenslet_array; centroiding cog; aaStats aa; BTBT aaCov; paStats pa; BTBT paCov; iterativeSolvers iSolve; stats S; S.setup(); stopwatch tid; @ Then all the components are initialized and the dynamic arrays are allocated. <>= src.setup(ARCSEC(0) , 0, INFINITY, PS_E_N_PX); gs.setup(ARCSEC(0) , 0, 90e3, PS_E_N_PX); // Atmosphere // Single layer turbulence profile float altitude[] = {0}, xi0[] = {1}, wind_speed[] = {10}, wind_direction[] = {0}; //atm.setup(r0,30,altitude,xi0,wind_speed,wind_direction); // GMT 7 layers turbulence profile atm.gmt_setup(r0,30); //atm.reset(); pupil_mask.setup( (N_SIDE_LENSLET+1)*(N_SIDE_LENSLET+1) ); // SH WFS lenslet_array.setup(_N_PX_PUPIL_,N_SIDE_LENSLET,2,1,1); // Centroid cog.setup(); // covariances aa.setup(N_SIDE_LENSLET,&atm,d, &gs, 1); aa.toFile("aaCovariance.bin"); printf("\n AA variance [rd^2]: %.3E\n",aa.variance()); aaCov.setup(2,2,N_SIDE_LENSLET,aa.d__cov); pa.setup(NP,N_SIDE_LENSLET,osf,&atm,d,&src,1,&gs,1); pa.toFile("paCovariance.bin"); paCov.setup(1,2,NP,N_SIDE_LENSLET,pa.d__cov); HANDLE_ERROR( cudaMalloc((void**)&d__mask, sizeof(char)*_N_LENSLET_ ) ); dim3 blockDim_vl(16,16); dim3 gridDim_vl(N_SIDE_LENSLET/16+1,N_SIDE_LENSLET/16+1); valid_lenslet LLL gridDim_vl,blockDim_vl RRR (d__mask,N_SIDE_LENSLET*_N_PX_PUPIL_,0.5); /* char *L_mask, *A_mask; */ /* HANDLE_ERROR( cudaMalloc((void**)&L_mask, sizeof(char)*_N_LENSLET_ ) ); */ /* HANDLE_ERROR( cudaMemset(L_mask, 0, sizeof(char)*_N_LENSLET_ ) ); */ /* HANDLE_ERROR( cudaMalloc((void**)&A_mask, sizeof(char)*PS_E_N_PX ) ); */ /* HANDLE_ERROR( cudaMemset(A_mask, 0, sizeof(char)*PS_E_N_PX ) ); */ /* fried_geometry LLL gridDim_vl,blockDim_vl RRR (L_mask, A_mask, N_SIDE_LENSLET, 16, 0.5); */ /* dev2file("L_mask.bin",L_mask,_N_LENSLET_); */ /* dev2file("A_mask.bin",A_mask,PS_E_N_PX); */ /* HANDLE_ERROR( cudaFree( L_mask) ); */ /* HANDLE_ERROR( cudaFree( A_mask) ); */ char MASK_SET; MASK_SET = fried_geometry_setup(cog.lenslet_mask, pupil_mask.m, N_SIDE_LENSLET, 16, 0.5); //printf("MASK_SET = %d\n",MASK_SET); pupil_mask.set_filter(); src.wavefront.M = &pupil_mask; src.wavefront.masked(); char A_mask_filename[32]; sprintf(A_mask_filename,"A_mask_%03d.bin",N_SIDE_LENSLET); dev2file(A_mask_filename,pupil_mask.m,PS_E_N_PX); HANDLE_ERROR( cudaMalloc( (void**)&d__phase_est , sizeof(float)*PS_E_N_PX ) ); float *d__phase_screen_low_res, *d__phase_screen_est, *phase_screen_low_res; d__phase_screen_low_res = 0; phase_screen_low_res = 0; phase_screen_est = 0; @ \subsection{Main test} \label{sec:main-test} The [[main test]] is simply a wavefront reconstruction routine. <
>= atm.get_phase_screen(&src,delta_e,NP,delta_e,NP,0); src.phase2file("phaseScreenLowRes.bin"); <> // Iterative solver //iSolve.cg_setup(N); iSolve.minres_setup(N); //aaCov.mask = d__mask; dev2file("mask.bin",d__mask,_N_LENSLET_); char filename[100]; /* for (k=1;k<6;k++) { */ k = 100; sprintf(filename,"MINRES_phaseEst_%d.bin",k); //sprintf(filename,"CG_phaseEst_%d.bin",k); HANDLE_ERROR( cudaMalloc((void**)&d__x, sizeof(float)*N ) ); HANDLE_ERROR( cudaMemset(d__x, 0, sizeof(float)*N ) ); tid.tic(); //cudaProfilerStart(); iSolve.minres_vorst(d__x, &aaCov, cog.d__c, d__x); //iSolve.cg(d__x, &aaCov, cog.d__c, k, d__x); //cudaProfilerStop(); <> tid.toc("WAVEFRONT ESTIMATION"); <> //printf("\nSolver residue norm : %.2E\n",iSolve.rnorm); //printf("\nSolver mean time per iteration: %.2E\n",iSolve.mean_time_per_iteration); /* } */ @ \subsection{Convergence test} \label{sec:convergence-test} The [[convergence test]] reconstructs the wavefront with both iterative solver MINRES and CG at each iteration step. <>= atm.get_phase_screen(&src,delta_e,NP,delta_e,NP,0); char filename[100]; sprintf(filename,"CVGCE_phaseScreenLowRes_%03d.bin",N_SIDE_LENSLET); src.phase2file(filename); <> if (cog.MASK_SET) aaCov.mask = cog.lenslet_mask; sprintf(filename,"bins/CVGCE_%s_phaseEst_200_%03d.bin",solver,N_SIDE_LENSLET); FILE *fid; fid = fopen(filename,"wb"); int n_page = 1, n_data = PS_E_N_PX*200; fwrite(&n_page,sizeof(int),1,fid); fwrite(&n_data,sizeof(int),1,fid); phase_screen_est = (float*)malloc(sizeof(float)*PS_E_N_PX); HANDLE_ERROR( cudaMalloc((void**)&d__x, sizeof(float)*N ) ); // CG if (strcmp(solver,"CG")==0) { iSolve.pcg_setup(N); for (k=1;k<=200;k++) { printf("\n______ ITERATION #: %03d ______\n",k); HANDLE_ERROR( cudaMemset(d__x, 0, sizeof(float)*N ) ); tid.tic(); iSolve.pcg(d__x, &aaCov, cog.d__c, k, d__x, 1./aa.variance()); <> tid.toc("WAVEFRONT ESTIMATION"); HANDLE_ERROR( cudaMemcpy( phase_screen_est, d__phase_est, sizeof(float)*PS_E_N_PX, cudaMemcpyDeviceToHost ) ); fwrite(phase_screen_est,sizeof(float),PS_E_N_PX,fid); printf("\nSolver residue norm : %.2E\n",iSolve.rnorm); printf("\nSolver mean time per iteration: %.2E\n",iSolve.mean_time_per_iteration); printf("\n-------------------------------\n"); } } // MINRES if (strcmp(solver,"MINRES")==0) { iSolve.minres_setup(N); iSolve.RTOL = 1e-6; for (k=1;k<=200;k++) { printf("\n______ ITERATION #: %03d ______\n",k); HANDLE_ERROR( cudaMemset(d__x, 0, sizeof(float)*N ) ); tid.tic(); iSolve.minres_vorst(d__x, &aaCov, cog.d__c, k, d__x); <> tid.toc("WAVEFRONT ESTIMATION"); HANDLE_ERROR( cudaMemcpy( phase_screen_est, d__phase_est, sizeof(float)*PS_E_N_PX, cudaMemcpyDeviceToHost ) ); fwrite(phase_screen_est,sizeof(float),PS_E_N_PX,fid); printf("\nSolver residue norm : %.2E\n",iSolve.rnorm); printf("\nSolver mean time per iteration: %.2E\n",iSolve.mean_time_per_iteration); printf("\n-------------------------------\n"); } } fclose(fid); @ \subsection{Hot convergence test} \label{sec:hot-convergence-test} The [[hot convergence test]] reconstructs the wavefront with MINRES re-using the previous estimate as starting point for the next one. <>= LMMSE lmmse; lmmse.setup(&atm,&gs,1,&src,1,d,N_SIDE_LENSLET,&pupil_mask,"MINRES"); int n_step = 200; HANDLE_ERROR( cudaMalloc( (void**)&d__phase_screen_low_res, sizeof(float)*PS_E_N_PX*n_step ) ); float *d__gs_phase_screen_low_res; HANDLE_ERROR( cudaMalloc( (void**)&d__gs_phase_screen_low_res, sizeof(float)*PS_E_N_PX*n_step ) ); HANDLE_ERROR( cudaMalloc( (void**)&d__phase_screen_est, sizeof(float)*PS_E_N_PX*n_step ) ); //if (cog.MASK_SET) aaCov.mask = cog.lenslet_mask; HANDLE_ERROR( cudaMalloc((void**)&d__x, sizeof(float)*N ) ); HANDLE_ERROR( cudaMemset(d__x, 0, sizeof(float)*N ) ); float time_step,tau=0, tau0=0, delta_tau; time_step = 2; //iSolve.minres_setup(N); lmmse.iSolve.VERBOSE = 0; lmmse.iSolve.RTOL = 5E-2; // First wavefront estimate atm.get_phase_screen_gradient(&cog,N_SIDE_LENSLET,d,&gs,tau); tid.tic(); //iSolve.minres_vorst(d__x, &aaCov, cog.d__c, d__x); lmmse.estimation(&cog); tid.toc(&tau0,"WAVEFRONT ESTIMATION"); tau = tau0; //lmmse.iSolve.N_ITERATION = 5; //lmmse.iSolve.ATOL = lmmse.iSolve.rnorm; // MINRES for (k=1;k<=n_step;k++) { printf("\n______ ITERATION #: %03d ______\n",k); tau += time_step; atm.get_phase_screen_gradient(&cog,N_SIDE_LENSLET,d,&gs,1E-3*tau); // lmmse.d__phase_est_c = d__phase_screen_est + (k-1)*PS_E_N_PX; lmmse.set_phase_est_ptr( d__phase_screen_est + (k-1)*PS_E_N_PX ); tid.tic(); /* iSolve.minres_vorst(d__x, &aaCov, cog.d__c, d__x); */ /* paCov.MVM(d__phase_screen_est + (k-1)*PS_E_N_PX, d__x); */ lmmse.estimation(&cog); tid.toc(&delta_tau, "WAVEFRONT ESTIMATION"); //tau += delta_tau + 0*time_step; src.wavefront.phase = d__phase_screen_low_res + (k-1)*PS_E_N_PX; atm.get_phase_screen(&src,delta_e,NP,delta_e,NP,1E-3*tau); gs.wavefront.phase = d__gs_phase_screen_low_res + (k-1)*PS_E_N_PX; atm.get_phase_screen(&gs,delta_e,NP,delta_e,NP,1E-3*tau); /* HANDLE_ERROR( cudaMemcpy( d__phase_screen_low_res + (k-1)*PS_E_N_PX, src.wavefront.phase, */ /* sizeof(float)*PS_E_N_PX, */ /* cudaMemcpyDeviceToDevice) ); */ //printf("\nSolver residue norm : %.2E\n",iSolve.rnorm); //printf("\nSolver mean time per iteration: %.2E\n",iSolve.mean_time_per_iteration); printf("\n-------------------------------\n"); /* if (k==1) */ /* iSolve.ATOL = iSolve.rnorm; */ //if (k>1) //iSolve.N_ITERATION = 10; } char filename[100]; sprintf(filename,"CVGCE_phaseScreenLowRes_%03d.bin",N_SIDE_LENSLET); dev2file(filename,d__phase_screen_low_res,PS_E_N_PX*n_step); sprintf(filename,"CVGCE_GS_phaseScreenLowRes_%03d.bin",N_SIDE_LENSLET); dev2file(filename,d__gs_phase_screen_low_res,PS_E_N_PX*n_step); sprintf(filename,"CVGCE_%s_phaseEst_%03d_%03d.bin",solver,n_step,N_SIDE_LENSLET); dev2file(filename,d__phase_screen_est,PS_E_N_PX*n_step); lmmse.cleanup(); HANDLE_ERROR( cudaFree( d__gs_phase_screen_low_res ) ); @ \subsection{Statistics test} \label{sec:statistics-test} The [[statistics test]] reconstructs the wavefront n times, each time a new statistically independent phase screen is generated. <>= float *d__phase_screen_low_res; HANDLE_ERROR( cudaMalloc( (void**)&d__phase_screen_low_res, sizeof(float)*PS_E_N_PX ) ); float *phase_screen_low_res; phase_screen_low_res = (float*)malloc(sizeof(float)*PS_E_N_PX); HANDLE_ERROR( cudaMemcpy( phase_screen_low_res, d__phase_screen_low_res, sizeof(float)*PS_E_N_PX, cudaMemcpyDeviceToHost ) ); <> aaCov.mask = d__mask; <<<> FILE *fid0, *fid; char filename[100]; sprintf(filename,"STATS_phaseScreenLowRes_%03d.bin",N_SIDE_LENSLET); fid0 = fopen(filename,"wb"); int nSample = 200, nIt = 200; sprintf(filename,"STATS_%s_phaseEst_%03d_%03d.bin",solver,nSample,N_SIDE_LENSLET); fid = fopen(filename,"wb"); phase_screen_est = (float*)malloc(sizeof(float)*PS_E_N_PX); HANDLE_ERROR( cudaMalloc( (void**)&d__phase_est , sizeof(float)*PS_E_N_PX ) ); // CG if (strcmp(solver,"CG")==0) { iSolve.cg_setup(N); for (k=1;k<=nSample;k++) { printf("\n______ ITERATION #: %03d ______\n",k); <> HANDLE_ERROR( cudaMemset(d__x, 0, sizeof(float)*N ) ); tid.tic(); iSolve.cg(d__x, &aaCov, cog.d__c, nIt, d__x); <> tid.toc("WAVEFRONT ESTIMATION"); HANDLE_ERROR( cudaMemcpy( phase_screen_est, d__phase_est, sizeof(float)*PS_E_N_PX, cudaMemcpyDeviceToHost ) ); fwrite(phase_screen_est,sizeof(float),PS_E_N_PX,fid); printf("\nSolver residue norm : %.2E\n",iSolve.rnorm); printf("\nSolver mean time per iteration: %.2E\n",iSolve.mean_time_per_iteration); printf("\n-------------------------------\n"); atm.reset(); } } // MINRES if (strcmp(solver,"MINRES")==0) { iSolve.minres_setup(N); for (k=1;k<=nSample;k++) { printf("\n______ ITERATION #: %03d ______\n",k); <> HANDLE_ERROR( cudaMemset(d__x, 0, sizeof(float)*N ) ); tid.tic(); iSolve.minres_vorst(d__x, &aaCov, cog.d__c, nIt, d__x); <> tid.toc("WAVEFRONT ESTIMATION"); HANDLE_ERROR( cudaMemcpy( phase_screen_est, d__phase_est, sizeof(float)*PS_E_N_PX, cudaMemcpyDeviceToHost ) ); HANDLE_ERROR( cudaThreadSynchronize() ); fwrite(phase_screen_est,sizeof(float),PS_E_N_PX,fid); printf("\n Data saved to file!\n"); printf("\nSolver residue norm : %.2E\n",iSolve.rnorm); printf("\nSolver mean time per iteration: %.2E\n",iSolve.mean_time_per_iteration); printf("\n-------------------------------\n"); atm.reset(); } } fclose(fid0); fclose(fid); @ This is the place where we terminate the test elegantly. <>= atm.cleanup(); lenslet_array.cleanup(); cog.cleanup(); aa.cleanup(); pa.cleanup(); aaCov.cleanup(); paCov.cleanup(); iSolve.cleanup(); S.cleanup(); HANDLE_ERROR( cudaFree( d__x ) ); HANDLE_ERROR( cudaFree( d__phase_est ) ); if (d__phase_screen_low_res) HANDLE_ERROR( cudaFree( d__phase_screen_low_res ) ); if (d__phase_screen_est) HANDLE_ERROR( cudaFree( d__phase_screen_est ) ); HANDLE_ERROR( cudaFree( d__mask ) ); if (phase_screen_low_res) free(phase_screen_low_res); if (phase_screen_est) free(phase_screen_est); @ <>= __global__ void set_pa_input(float *pa_c, float *aa_c, int *idx, int N) { int i, j, k; i = blockIdx.x * blockDim.x + threadIdx.x; j = blockIdx.y * blockDim.y + threadIdx.y; if ( (i>= atm.get_phase_screen(d__phase_screen_low_res,delta_e,NP,delta_e,NP,d__src,0); HANDLE_ERROR( cudaMemcpy( phase_screen_low_res, d__phase_screen_low_res, sizeof(float)*PS_E_N_PX, cudaMemcpyDeviceToHost ) ); fwrite(phase_screen_low_res,sizeof(float),PS_E_N_PX,fid0); <> @ The wavefront sensing model is using either a Fourier optics or geometric optics model. <>= <> <> <>= <> <> @ <>= lenslet_array.propagate(d__src); cxy0 = (_N_PX_PUPIL_ - 1)/2.0; cog.get_data(lenslet_array.d__frame, cxy0, cxy0, slopes2Angle); @ <>= atm.get_phase_screen_gradient(&cog,N_SIDE_LENSLET,d,&src,1,0); <>= atm.get_phase_screen_gradient(&cog,N_SIDE_LENSLET,d__mask,d,&src,1,0); @ <>= dev2file("centroids.bin",cog.d__c,N); @ The index of the WFS slopes in the augmented input vector of the MVM operation are computed next: <>= idx = (int *)malloc(sizeof(int)*_N_LENSLET_); k = -1; //printf("\n k idx\n"); for (i=1;i<2*N_SIDE_LENSLET;i+=2) { for (j=1;j<2*N_SIDE_LENSLET;j+=2) { idx[++k] = i*(2*N_SIDE_LENSLET + 1) + j; //printf("(%2d) %2d\n",k,idx[k]); } } HANDLE_ERROR( cudaMalloc( (void**)&d__idx, sizeof(int)*_N_LENSLET_ ) ); HANDLE_ERROR( cudaMemcpy( d__idx, idx, sizeof(int)*_N_LENSLET_, cudaMemcpyHostToDevice ) ); HANDLE_ERROR( cudaMalloc((void**)&d__x, sizeof(float)*N ) ); x = (float*)malloc(sizeof(float)*N); @ CG iterative solver call: <>= iSolve.cg(d__x, &aaCov, d__b, 5, d__x); @ MINRES iterative solver call: <>= iSolve.minres(d__x, &aaCov, d__b, 5, d__x); @ Allocation of the input vector for the MVM: <>= HANDLE_ERROR( cudaMalloc((void**)&d__ce, sizeof(float)*PS_E_N_PX*2 ) ); HANDLE_ERROR( cudaMemset(d__ce, 0, sizeof(float)*PS_E_N_PX*2 ) ); dim3 blockDim(16,16); dim3 gridDim(N_SIDE_LENSLET/16+1,N_SIDE_LENSLET/16+1); @ Writing the input of the iterative solver into the output of the MVM: <>= paCov.MVM(d__phase_est,d__x); @ Saving the MVM wavefront estimate to a file: <>= dev2file(filename,d__phase_est,PS_E_N_PX); @