// =============================================================================
// === GPUQREngine/Demo/dense_demo.cpp =========================================
// =============================================================================

// GPUQREngine, Copyright (c) 2013, Timothy A Davis, Sencer Nuri Yeralan,
// and Sanjay Ranka. All Rights Reserved.
// SPDX-License-Identifier: GPL-2.0+

//------------------------------------------------------------------------------

#if 1

// Update for GPUQREngine v2.0.0: See SPQR for a demo instead.

#include <stdio.h>

// Stub entry point.  The dense demo is disabled; this program only prints a
// pointer to the SPQR demo and exits successfully.  Both arguments are unused.
int main(int argn, char **argv)
{
    (void) argn ;
    (void) argv ;
    printf ("Update for GPUQREngine v2.x: See SPQR for a demo instead.\n") ;
    return (0) ;
}

#else

// NOTE(review): this legacy branch is incomplete.  When the file was flattened,
// every span of text from a '<' up to the next '>' was dropped, which removed
// system header names, several loop headers and some whole function bodies.
// The gaps are marked below with NOTE(review); restore them from the upstream
// file before enabling this branch.
#error "dense_demo: legacy branch is incomplete (source text lost); restore it from upstream before enabling"

// GPUQREngine can be used to factorize a set of dense matrices of
// various sizes. This is the demo for this 'dense' usage case.
// The 'sparse' case is exercised by SuiteSparseQR.

#ifndef SPQR_HAS_CUDA
#define SPQR_HAS_CUDA
#endif

#include "GPUQREngine_SuiteSparse.hpp"
// NOTE(review): three system #include lines lost their header names (one
// before "cholmod.h", two after it).  <stdio.h> and <stdlib.h> are required by
// the calls visible in this file (printf/fopen, rand/srand/exit); the third
// header and the original order could not be recovered.
#include <stdio.h>
#include "cholmod.h"
#include <stdlib.h>
#include "GPUQREngine_Timing.hpp"

//------------------------------------------------------------------------------

// NOTE(review): the body of randfill(), the whole of the matrix-printing
// helper (presumably printMatrix(), which driver() calls but which is not
// defined anywhere in the surviving text) and the opening of the function
// below were lost.  Surviving fragments, verbatim:
//
//   void randfill(double *x, Int m, Int n) { for(int i=0; i
//   xb) continue; int imax = MIN(m, TILESIZE*(yb+1)); for(int i=TILESIZE*yb; i
//   1e-12, error ('failure'); end\n");

// Appends MATLAB statements to `troll` that compare the matrix R against
// MATLAB's own qr(A): it emits MR = qr(A), takes triu(abs(.)) of both R and MR,
// and prints the norm of their difference.
// NOTE(review): the function header is reconstructed from the call
// `printQRScript (troll)` in driver(); the original first statements of the
// body are lost (see the fragments above) -- confirm against upstream.
void printQRScript (FILE *troll)
{
    // fprintf (troll, "[%d %d] = size (A) ;\n") ;
    // int rank = MIN(m, n);
    // fprintf (troll, "[Davis_R V1 T] = qrsigma_concise(A);\n");
    // fprintf (troll, "norm(triu(Davis_R) - triu(R))\n");
    // fprintf (troll, "norm(Davis_R(%d:end,:)-R(%d:end,:))\n", rank+1, rank+1);
    fprintf (troll, "MR = qr(A);\n");
    fprintf (troll, "tR = triu(abs(R));");
    fprintf (troll, "tMR = triu(abs(MR));");
    fprintf (troll, "norm(tMR - tR)\n");
}

//------------------------------------------------------------------------------

// Fills every front with random data, factorizes all fronts with
// GPUQREngine(), and optionally appends a MATLAB check script to `filename`.
//
//   cc         CHOLMOD common; only cc->gpuMemorySize is read here
//   filename   script file opened in append mode; NULL (or an open failure)
//              disables all script output
//   fronts     array of numFronts fronts; each front's F and cpuR are set to
//              a buffer allocated here, and F is freed and reset to NULL
//              before returning
//   numFronts  number of entries in fronts
//   stats      passed through to GPUQREngine()
//   which      experiment number used in the MATLAB variable names A_<which>
//              and R_<which>
//
// The process exits with a failure status if an allocation fails or if
// GPUQREngine() does not return QRENGINE_SUCCESS.
void driver
(
    cholmod_common *cc,
    const char *filename,
    Front *fronts,
    int numFronts,
    QREngineStats *stats,
    int which
)
{
    FILE *troll = (filename != NULL) ? fopen(filename, "a") : NULL ;

    if (troll != NULL)
    {
        fprintf (troll, "clear ;\n") ;
        fprintf (troll, "A_%d = cell (1,%d) ;\n", which, numFronts) ;
        fprintf (troll, "R_%d = cell (1,%d) ;\n", which, numFronts) ;
    }

    // NOTE(review): this loop header and the declaration of `front` are
    // reconstructed (the original text between "f<" and "->fm" was lost);
    // confirm against upstream.
    for (int f = 0 ; f < numFronts ; f++)
    {
        Front *front = &fronts[f] ;
        int m = front->fm;
        int n = front->fn;

        /* Attach the front data.  The size is computed in size_t so that a
           large m*n cannot overflow int. */
        double *F = front->cpuR = front->F =
            (double*) SuiteSparse_calloc((size_t) m * (size_t) n,
                sizeof(double));
        if (F == NULL)
        {
            /* randfill would otherwise write through a NULL pointer. */
            printf ("out of memory\n") ;
            exit (EXIT_FAILURE) ;
        }
        randfill(F, m, n);

        /* Print the A matrix. */
        if (troll != NULL)
        {
            printMatrix (troll, "A_", which, f, F, m, n);
        }
    }

    /* Run the QREngine code */
    QREngineResultCode result =
        GPUQREngine (cc->gpuMemorySize, fronts, numFronts, stats);
    if (result != QRENGINE_SUCCESS)
    {
        /* Report failure with a non-zero status (the original exited with 0
           and printed a stray quote character after the newline). */
        printf ("test failure!\n") ;
        exit (EXIT_FAILURE) ;
    }

    /* Do something with R factors. */
    // NOTE(review): loop header and `front` declaration reconstructed, as in
    // the first loop above; confirm against upstream.
    for (int f = 0 ; f < numFronts ; f++)
    {
        Front *front = &fronts[f] ;
        int m = front->fm;
        int n = front->fn;
        double *R = front->cpuR;

        /* Print the R matrix. */
        if (troll != NULL)
        {
            printMatrix(troll, "R_", which, f, R, m, n);
            fprintf (troll, "R = R_%d {%d} ; A = A_%d {%d} ;\n",
                which, 1+f, which, 1+f) ;
            printQRScript (troll) ;
        }

        /* Detach the front data. */
        SuiteSparse_free(front->F);
        front->F = NULL;
    }

    if (troll != NULL)
    {
        fprintf (troll, "disp ('all tests passed') ;\n") ;
        fclose (troll) ;
    }
}

//------------------------------------------------------------------------------

// Prints one line of timing statistics for a batch of numFronts m-by-n
// factorizations.  stats.kernelTime is treated as milliseconds (it is divided
// by 1e3 and printed with an "ms" suffix).  Two rates are reported: one from
// the formula 2*n*n*(m - n/3) (or with m and n swapped when m < n) scaled by
// numFronts, and one from stats.flopsActual.
void printStats(QREngineStats stats, int numFronts, double m, double n)
{
    float kernelTime = stats.kernelTime;
    Int numLaunches = stats.numLaunches;
    Int gpuFlops = stats.flopsActual;

    /* Compute & Print FLOPS */
    double time = (double) kernelTime;
    double flops;
    if(m >= n)
    {
        flops = 2.0 * n*n * (m - (n/3));
    }
    else
    {
        flops = 2.0 * m*m * (n - (m/3));
    }
    flops *= (double) numFronts;
    flops /= (time / 1e3);
    double gflops = flops / 1e9;
    double gpugflops = (gpuFlops / (time / 1e3)) / 1e9;

    /* Int is a project typedef whose width is not visible here; cast so the
       arguments always match the %ld conversions. */
    printf("m: %.0f, n: %.0f, nf: %d, nl: %ld, gpuFlops: %ld, t: %fms, gflops: %f, gpugflops: %f\n",
        m, n, numFronts, (long) numLaunches, (long) gpuFlops, time, gflops,
        gpugflops);
}

//------------------------------------------------------------------------------

// Runs one experiment on numFronts fronts of size m-by-n; `which` is the
// experiment number.
void experiment1(cholmod_common *cc, int which, int numFronts, int m, int n)
{
    /* Configure problem set. */
    Front *fronts = (Front*) SuiteSparse_calloc(numFronts, sizeof(Front));

    // NOTE(review): the rest of this body, up to the final destructor call,
    // was lost.  Presumably it constructs each Front, calls driver() and
    // destroys each Front again -- restore from upstream.
    // Surviving fragments, verbatim:
    //   for(int f=0; f
    //   ~Front();

    SuiteSparse_free(fronts);
}

//------------------------------------------------------------------------------

// Runs one experiment on numFronts fronts of size m-by-n, unless the
// estimated memory requirement exceeds 3.5 GB, in which case it returns
// without doing anything.
void experiment2(cholmod_common *cc, int m, int n, int numFronts)
{
    /* See if this experiment would blow out the memory.  The estimate is
       computed in size_t: in int arithmetic numFronts * m * n overflows for
       the larger problem sizes before the cast is applied. */
    size_t threshold = 3.50 * 1024 * 1024 * 1024;
    size_t memoryReq = (size_t) numFronts *
        ((size_t) CEIL(m, 32) * 32 * 33 + (size_t) m * (size_t) n) ;
    if(memoryReq * sizeof(double) > threshold) return;

    /* Configure problem set. */
    Front *fronts = (Front*) SuiteSparse_calloc(numFronts, sizeof(Front));

    // NOTE(review): the rest of this body, up to the final destructor call,
    // was lost.  Presumably it constructs each Front, calls driver() and
    // destroys each Front again -- restore from upstream.
    // Surviving fragments, verbatim:
    //   for(int f=0; f
    //   ~Front();

    SuiteSparse_free(fronts);
}

//------------------------------------------------------------------------------

// Legacy demo entry point: truncates troll.m, starts CHOLMOD, queries the
// available GPU memory, and runs four small experiments whose results are
// written to troll.m for checking in MATLAB.
int main(int argn, char **argv)
{
    double t ;
    size_t total_mem, available_mem ;

    /* Clear the troll file.  fopen can fail; never pass NULL to fclose. */
    FILE *troll = fopen("troll.m", "w");
    if (troll != NULL)
    {
        fclose(troll);
    }

    srand(1);

    // start CHOLMOD
    cholmod_common *cc, Common ;
    cc = &Common ;
    cholmod_l_start (cc) ;

    // warmup the GPU. This can take some time, but only needs
    // to be done once
    cc->useGPU = true ;
    t = SUITESPARSE_TIME ;
    cholmod_l_gpu_memorysize (&total_mem, &available_mem, cc) ;
    cc->gpuMemorySize = available_mem ;
    t = SUITESPARSE_TIME - t ;
    if (cc->gpuMemorySize <= 1)
    {
        printf ("no GPU available\n") ;
        return (0) ;
    }
    printf ("available GPU memory: %g MB, warmup time: %g\n",
        (double) (cc->gpuMemorySize) / (1024 * 1024), t) ;

    experiment1(cc, 1, 2, 8, 8);
    experiment1(cc, 2, 2, 12, 8);
    experiment1(cc, 3, 2, 64, 32);
    experiment1(cc, 4, 1, 100, 200);

    printf ("to check results, run 'troll.m' in MATLAB\n") ;

#if 0
    for(int numFronts=1; numFronts<=128; numFronts*=2)
    {
        for(int dim=128; dim<=6144; dim+=128)
        {
            experiment2(cc, dim, dim, numFronts);
        }
    }
#endif

#if 0
    for(int numFronts=1; numFronts<=128; numFronts*=2)
    {
        for(int smdim=128; smdim<=6144/4; smdim+=128)
        {
            experiment2(cc, smdim, 4*smdim, numFronts);
            experiment2(cc, 4*smdim, smdim, numFronts);
        }
        for(int smdim=128; smdim<=6144/16; smdim+=128)
        {
            experiment2(cc, smdim, 16*smdim, numFronts);
            experiment2(cc, 16*smdim, smdim, numFronts);
        }
    }
#endif
}

#endif