//------------------------------------------------------------------------------ // GB_transpose_method: select method for GB_transpose //------------------------------------------------------------------------------ // SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 //------------------------------------------------------------------------------ #include "GB_transpose.h" // GB_transpose can use choose between a merge-sort-based method that takes // O(anz*log(anz)) time, or a bucket-sort method that takes O(anz+m+n) time. // The bucket sort has 3 methods: sequential, atomic, and non-atomic. bool GB_transpose_method // if true: use GB_builder, false: use bucket ( const GrB_Matrix A, // matrix to transpose int *nworkspaces_bucket, // # of slices of A for the bucket method int *nthreads_bucket // # of threads to use for the bucket method ) { //-------------------------------------------------------------------------- // get inputs //-------------------------------------------------------------------------- // if available, A->nvec_nonempty is used to select the method int64_t anvec = (A->nvec_nonempty < 0) ? A->nvec : A->nvec_nonempty ; int64_t anz = GB_nnz (A) ; int64_t avlen = A->vlen ; // int64_t avdim = A->vdim ; int anzlog = (anz == 0) ? 1 : (int) GB_CEIL_LOG2 (anz) ; int mlog = (avlen == 0) ? 1 : (int) GB_CEIL_LOG2 (avlen) ; double bucket_factor ; // determine # of threads for bucket method int nthreads_max = GB_Context_nthreads_max ( ) ; double chunk = GB_Context_chunk ( ) ; int nthreads = GB_nthreads (anz + avlen, chunk, nthreads_max) ; //-------------------------------------------------------------------------- // select between the atomic and non-atomic bucket method //-------------------------------------------------------------------------- bool atomics ; if (nthreads <= 2) { // sequential bucket method: no atomics needed // 2 threads: always use non-atomic method atomics = false ; } else if ((double) nthreads * (double) avlen > (double) anz) { // non-atomic workspace is too high; use atomic method atomics = true ; } else { // select between atomic and non-atomic methods. This rule is based on // performance on a 4-core system with 4 threads with gcc 7.5. The icc // compiler has much slower atomics than gcc and so atol should likely // be smaller when using icc. int atol ; if (anzlog < 14) { atol = -4 ; // fewer than 16K entries in A } else { switch (anzlog) { case 14: atol = -4 ; break ; // 16K entries in A case 15: atol = -3 ; break ; // 32K case 16: atol = -2 ; break ; // 64K case 17: atol = -1 ; break ; // 128K case 18: atol = 0 ; break ; // 256K case 19: atol = 1 ; break ; // 512K case 20: atol = 2 ; break ; // 1M case 21: atol = 3 ; break ; // 2M case 22: atol = 4 ; break ; // 4M case 23: atol = 5 ; break ; // 8M case 24: atol = 6 ; break ; // 16M case 25: atol = 8 ; break ; // 32M case 26: atol = 9 ; break ; // 64M case 27: atol = 9 ; break ; // 128M case 28: atol = 10 ; break ; // 256M default: atol = 10 ; break ; // > 256M } } if (anzlog - mlog <= atol) { // use atomic method // anzlog - mlog is the log2 of the average row degree, rounded. // If the average row degree is <= 2^atol, use the atomic method. // That is, the atomic method works better for sparser matrices, // and the non-atomic works better or denser matrices. However, // the threshold changes as the problem gets larger, in terms of # // of entries in A, when the atomic method becomes more attractive // relative to the non-atomic method. The atomic has the // advantange of needing much less workspace, which becomes more // important for larger problems. atomics = true ; } else { // use non-atomic method atomics = false ; } } (*nworkspaces_bucket) = (atomics) ? 1 : nthreads ; (*nthreads_bucket) = nthreads ; //-------------------------------------------------------------------------- // select between GB_builder method and bucket method //-------------------------------------------------------------------------- // As the problem gets larger, the GB_builder method gets faster relative // to the bucket method, in terms of the "constants" in the O(a log a) work // for GB_builder, or O (a+m+n) for the bucket method. Clearly, O (a log // a) and O (a+m+n) do not fully model the performance of these two // methods. Perhaps this is because of cache effects. The bucket method // has more irregular memory accesses. The GB_builder method uses // mergesort, which has good memory locality. if (anzlog < 14) { bucket_factor = 0.5 ; // fewer than 2^14 = 16K entries } else { switch (anzlog) { case 14: bucket_factor = 0.6 ; break ; // 16K entries in A case 15: bucket_factor = 0.7 ; break ; // 32K case 16: bucket_factor = 1.0 ; break ; // 64K case 17: bucket_factor = 1.7 ; break ; // 128K case 18: bucket_factor = 3.0 ; break ; // 256K case 19: bucket_factor = 4.0 ; break ; // 512K case 20: bucket_factor = 6.0 ; break ; // 1M case 21: bucket_factor = 7.0 ; break ; // 2M case 22: bucket_factor = 8.0 ; break ; // 4M case 23: bucket_factor = 5.0 ; break ; // 8M case 24: bucket_factor = 5.0 ; break ; // 16M case 25: bucket_factor = 5.0 ; break ; // 32M case 26: bucket_factor = 5.0 ; break ; // 64M case 27: bucket_factor = 5.0 ; break ; // 128M case 28: bucket_factor = 5.0 ; break ; // 256M default: bucket_factor = 5.0 ; break ; // > 256M } } double bucket_work = (double) (anz + avlen + anvec) * bucket_factor ; double builder_work = (log2 ((double) anz + 1) * (anz)) ; //-------------------------------------------------------------------------- // select the method with the least amount of work //-------------------------------------------------------------------------- bool use_builder = (builder_work < bucket_work) ; return (use_builder) ; }