/* Copyright 2016 The Science and Technology Facilities Council (STFC) * * Authors: Jonathan Hogg (STFC) * * Licence: BSD licence, see LICENCE file for details * */ #include "ldlt_app.hxx" #include #include #include #include #include "AlignedAllocator.hxx" #include "framework.hxx" #include "ssids/cpu/kernels/wrappers.hxx" #include "ssids/cpu/kernels/ldlt_app.cxx" // .cxx as we need internal namespace #include "ssids/cpu/kernels/ldlt_tpp.hxx" #include "ssids/cpu/cpu_iface.hxx" using namespace spral::ssids::cpu; template void print_d(int n, T *d) { bool first = true; for(int i=0; i::infinity() && d[2*i+1]!=0.0) { // Second half of 2x2 pivot: don't print brackets std::cout << " "; } else if(first) { std::cout << "("; } else { std::cout << ") ("; } if(d[2*i] == std::numeric_limits::infinity() && d[2*i+1]!=0.0) std::cout << std::setw(8) << " "; else std::cout << std::setw(8) << d[2*i]; first = false; } std::cout << ")" << std::endl; first = true; for(int i=0; i::infinity() && d[2*i+1]!=0.0) { // Second half of 2x2 pivot: don't print brackets std::cout << " "; } else if(first) { std::cout << "("; } else { std::cout << ") ("; } if(d[2*i+1] == 0.0) std::cout << std::setw(8) << " "; else std::cout << std::setw(8) << d[2*i+1]; first = false; } std::cout << ")" << std::endl; } template void solve(int m, int n, const int *perm, const T *l, int ldl, const T *d, const T *b, T *x) { for(int i=0; i void make_singular(int n, int col1, int col2, T *a, int lda) { T *col = new T[n]; T a11 = a[col1*(lda+1)]; T a21 = (col1 < col2) ? a[col1*lda + col2] : a[col2*lda + col1]; T scal = a21 / a11; // Read col1 and double it for(int i=0; i void make_dblk_singular(int blk, int nblk, T *a, int lda) { int col1 = 0; int col2 = BLOCK_SIZE-1; T *adiag = &a[blk*BLOCK_SIZE*(lda+1)]; make_singular(BLOCK_SIZE, col1, col2, adiag, lda); } // Pick n/8 random rows and multiply by 1000. Then do the same for n/8 random entries. template void cause_delays(int n, T *a, int lda) { int nsing = n/8; if(nsing==0) nsing=1; for(int i=0; iBLOCK_SIZE && idx col) a[col*lda+row] *= 1000; else a[row*lda+col] *= 1000; } } // Reorders a into supplied permuation such that row/col i of the new a // contains row/col perm[i] of the original a template void reorder(int n, const int *perm, T *a, int lda) { T *a2 = new T[n*n]; // Copy to a2 with permutation for(int j=0; j void do_update(int n, int k, T *a22, const T *a21, int lda, const T* d) { // Form A_21 D_11 T *ad21 = new T[n*k]; for(int j=0; j(OP_N, OP_T, n, n, k, -1.0, ad21, n, a21, lda, 1.0, a22, lda); // Free memory delete[] ad21; } /// Permutes rows of a as per perm such that row i becomes row (perm[i]-offset) template void permute_rows(int n, int k, const int *reord, int *perm, T *a, int lda) { // Copy a and perm without permutation T *a2 = new T[n*k]; for(int j=0; j void modify_test_matrix(bool singular, bool delays, bool dblk_singular, int m, int n, T *a, int lda) { int mblk = m / BLOCK_SIZE; int nblk = n / BLOCK_SIZE; if(delays) cause_delays(m, a, lda); if(dblk_singular) { int blk = nblk * ((float) rand())/RAND_MAX; make_dblk_singular(blk, mblk, a, lda); } if(n>1 && singular) { int col1 = n * ((float) rand())/RAND_MAX; int col2 = col1; while(col1 == col2) col2 = n * ((float) rand())/RAND_MAX; make_singular(m, col1, col2, a, lda); } } template int ldlt_test(T u, T small, bool delays, bool singular, bool dblk_singular, int m, int n, int outer_block_size=INNER_BLOCK_SIZE, int test=0, int seed=0) { // Note: We generate an m x m test matrix, then factor it as an // m x n matrix followed by an (m-n) x (m-n) matrix [give or take delays] bool failed = false; // Generate test matrix int lda = align_lda(m); double* a = new T[m*lda]; gen_sym_indef(m, a, lda); modify_test_matrix( singular, delays, dblk_singular, m, n, a, lda ); // Generate a RHS based on x=1, b=Ax T *b = new T[m]; gen_rhs(m, a, lda, b); // Print out matrices if requested if(debug) { std::cout << "A:" << std::endl; print_mat("%10.2e", m, a, lda); } // Setup options struct cpu_factor_options options; options.action = true; options.multiplier = 2.0; options.small = small; options.u = u; options.print_level = 0; options.small_subtree_threshold = 100*100*100; options.cpu_block_size = 256; options.pivot_method = (aggressive) ? PivotMethod::app_aggressive : PivotMethod::app_block; // Factorize using main routine spral::test::AlignedAllocator allocT; T *l = allocT.allocate(m*lda); memcpy(l, a, m*lda*sizeof(T)); // Copy a to l int *perm = new int[m]; for(int i=0; i backup(m, n, outer_block_size); std::vector work; const int SSIDS_PAGE_SIZE = 8*1024*1024; // 8 MB for(int i=0; i, use_tasks, debug> ::factor( m, n, perm, l, lda, d, backup, options, options.pivot_method, outer_block_size, 0.0, nullptr, 0, work ); if(debug) { std::cout << "FIRST FACTOR CALL ELIMINATED " << q1 << " of " << n << " pivots" << std::endl; std::cout << "L after first elim:" << std::endl; print_mat("%10.2e", m, l, lda, perm); std::cout << "D:" << std::endl; print_d(q1, d); } int q2 = 0; if(q1 < n) { // Finish off with simplistic kernel T *ld = new T[2*m]; q1 += ldlt_tpp_factor(m-q1, n-q1, &perm[q1], &l[(q1)*(lda+1)], lda, &d[2*(q1)], ld, m, options.action, u, small, q1, &l[q1], lda); delete[] ld; } if(m > n) { // Apply outer product update do_update(m-n, q1, &l[n*(lda+1)], &l[n], lda, d); // Second (m-n) x (m-n) matrix [but add delays if any] int *perm2 = new int[m-q1]; for(int i=0; i backup(m-q1, m-q1, outer_block_size); q2 = LDLT , use_tasks, debug> ::factor( m-q1, m-q1, perm2, &l[q1*(lda+1)], lda, &d[2*q1], backup, options, options.pivot_method, outer_block_size, 0.0, nullptr, 0, work ); // Permute rows of A_21 as per perm permute_rows(m-q1, q1, perm2, &perm[q1], &l[q1], lda); delete[] perm2; if(q1+q2 < m) { // Finish off with simplistic kernel T *ld = new T[2*m]; q2 += ldlt_tpp_factor(m-q1-q2, m-q1-q2, &perm[q1+q2], &l[(q1+q2)*(lda+1)], lda, &d[2*(q1+q2)], ld, m, options.action, u, small, q1+q2, &l[q1+q2], lda); delete[] ld; } } EXPECT_EQ(m, q1+q2) << "(test " << test << " seed " << seed << ")" << std::endl; // Print out matrices if requested if(debug) { std::cout << "q1=" << q1 << " q2=" << q2 << std::endl; std::cout << "L:" << std::endl; print_mat("%10.2e", m, l, lda, perm); std::cout << "D:" << std::endl; print_d(m, d); } // Perform solve T *soln = new T[m]; solve(m, q1, perm, l, lda, d, b, soln); if(debug) { printf("soln = "); for(int i=0; i void print_mat (int n, int *perm, T *a, int lda) { for(int i=0; i int ldlt_torture_test(T u, T small, int m, int n) { for(int test=0; test(u, small, delays, singular, dblk_singular, m, n, BLOCK_SIZE, test, seed); if(err!=0) return err; } return 0; // Success } int run_ldlt_app_tests() { int nerr = 0; /* Simple tests, single level blocking, rectangular, app_block */ TEST(( ldlt_test (0.01, 1e-20, false, false, false, 2, 1) )); TEST(( ldlt_test (0.01, 1e-20, false, false, false, 4, 2) )); TEST(( ldlt_test (0.01, 1e-20, false, false, false, 5, 3) )); TEST(( ldlt_test (0.01, 1e-20, false, false, false, 8, 2) )); TEST(( ldlt_test (0.01, 1e-20, false, false, false, 64, 24) )); TEST(( ldlt_test (0.01, 1e-20, false, false, false, 23, 9) )); /* Simple tests, single level blocking, rectangular, app_aggressive */ TEST(( ldlt_test (0.01, 1e-20, false, false, false, 2, 1) )); TEST(( ldlt_test (0.01, 1e-20, false, false, false, 4, 2) )); TEST(( ldlt_test (0.01, 1e-20, false, false, false, 5, 3) )); TEST(( ldlt_test (0.01, 1e-20, false, false, false, 8, 2) )); TEST(( ldlt_test (0.01, 1e-20, false, false, false, 64, 24) )); TEST(( ldlt_test (0.01, 1e-20, false, false, false, 23, 9) )); /* Simple tests, two-level blocking, rectangular */ TEST(( ldlt_test (0.01, 1e-20, false, false, false, 2, 1, 4) )); TEST(( ldlt_test (0.01, 1e-20, false, false, false, 4, 2, 4) )); TEST(( ldlt_test (0.01, 1e-20, false, false, false, 5, 3, 4) )); TEST(( ldlt_test (0.01, 1e-20, false, false, false, 8, 2, 4) )); TEST(( ldlt_test (0.01, 1e-20, false, false, false, 23, 9, 32) )); TEST(( ldlt_test (0.01, 1e-20, false, false, false, 32, 12, 8) )); TEST(( ldlt_test (0.01, 1e-20, false, false, false, 64, 24, 32) )); /* Simple tests, single-level blocking, rectangular, non-singular, * with delays */ TEST(( ldlt_test (0.01, 1e-20, true, false, false, 4, 2) )); TEST(( ldlt_test (0.01, 1e-20, true, false, false, 8, 2) )); TEST(( ldlt_test (0.01, 1e-20, true, false, false, 64, 24) )); TEST(( ldlt_test (0.01, 1e-20, true, false, false, 23, 9) )); /* Simple tests, single-level blocking, rectangular, non-singular, * with delays, app-aggressive */ TEST(( ldlt_test (0.01, 1e-20, true, false, false, 4, 2) )); TEST(( ldlt_test (0.01, 1e-20, true, false, false, 8, 2) )); TEST(( ldlt_test (0.01, 1e-20, true, false, false, 64, 24) )); TEST(( ldlt_test (0.01, 1e-20, true, false, false, 23, 9) )); /* Simple tests, single-level blocking, square, non-singular, no delays */ TEST(( ldlt_test (0.01, 1e-20, false, false, false, 1*2, 1*2) )); TEST(( ldlt_test (0.01, 1e-20, false, false, false, 2*2, 2*2) )); TEST(( ldlt_test (0.01, 1e-20, false, false, false, 8*2, 8*2) )); TEST(( ldlt_test (0.01, 1e-20, false, false, false, 8*8, 8*8) )); TEST(( ldlt_test (0.01, 1e-20, false, false, false, 27, 27) )); /* Simple tests, single-level blocking, square, non-singular, with delays */ TEST(( ldlt_test (0.01, 1e-20, true, false, false, 1*2, 1*2) )); TEST(( ldlt_test (0.01, 1e-20, true, false, false, 2*2, 2*2) )); TEST(( ldlt_test (0.01, 1e-20, true, false, false, 4*2, 4*2) )); TEST(( ldlt_test (0.01, 1e-20, true, false, false, 8*8, 8*8) )); TEST(( ldlt_test (0.01, 1e-20, true, false, false, 29, 29) )); /* Test edge case of singular diagonal blocks, square non-singular matrix */ TEST(( ldlt_test (0.01, 1e-20, false, false, true, 8*8, 8*8) )); TEST(( ldlt_test (0.01, 1e-20, false, false, true, 33, 33) )); /* Torture tests */ TEST(( ldlt_torture_test (0.01, 1e-20, 8*16, 8*16) )); TEST(( ldlt_torture_test (0.01, 1e-20, 8*16, 3*16) )); return nerr; }