/* * This code is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This code is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this code; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ /** \file ducc0/infra/bucket_sort.h * * \copyright Copyright (C) 2020-2021 Max-Planck-Society * \author Martin Reinecke */ #ifndef DUCC0_BUCKET_SORT_H #define DUCC0_BUCKET_SORT_H #include #include #include #include #include #include #include "ducc0/infra/error_handling.h" #include "ducc0/infra/threading.h" #include "ducc0/infra/aligned_array.h" #include "ducc0/math/math_utils.h" namespace ducc0 { namespace detail_bucket_sort { using namespace std; template void subsort (RAidx idx, quick_array &keys, size_t keybits, size_t lo, size_t hi, vector &numbers, quick_array &idxbak, quick_array &keybak) { auto nval = hi-lo; if (nval<=1) return; size_t keyshift = (keybits<=8) ? 0 : keybits-8; size_t nkeys = min(size_t(1)<>keyshift)&keymask]; } Tidx ofs=0; for (auto &x: numbers) { auto tmp = x; x = ofs; ofs += tmp; } for (size_t i=0; i>keyshift)&keymask; keys[lo+numbers[loc]] = keybak[i]; idx[lo+numbers[loc]] = idxbak[i]; ++numbers[loc]; } if (keyshift==0) return; keybits -= 8; vector newnumbers; for (size_t i=0; i void bucket_sort (RAkey keys, RAidx res, size_t nval, size_t max_key, size_t nthreads) { nthreads = min(nthreads, max_threads()); using Tidx = typename remove_reference::type; using Tkey = typename remove_reference::type; // align members with cache lines struct alignas(64) vbuf { vector v; }; vector numbers(nthreads); auto keybits = ilog2(max_key)+1; size_t keyshift = (keybits<=8) ? 0 : keybits-8; size_t nkeys = min(size_t(1)<>keyshift)]; } }); size_t ofs=0; for (size_t i=0; i keys2(nval); execParallel(nval, nthreads, [&](size_t tid, size_t lo, size_t hi) { auto &mybuf(numbers[tid].v); for (size_t i=lo; i>keyshift); res[mybuf[loc]] = i; keys2[mybuf[loc]] = keys[i]; ++mybuf[loc]; } }); if (keyshift==0) return; keybits -= 8; execDynamic(nkeys, nthreads, 1, [&](Scheduler &sched) { vector newnumbers; quick_array keybak; quick_array idxbak; while (auto rng=sched.getNext()) for(auto i=rng.lo; i void bucket_sort2 (quick_array &keys, quick_array &idx, size_t max_key, size_t nthreads) { auto nval = keys.size(); idx.resize(nval); nthreads = min(nthreads, max_threads()); auto sizelimit = max(1, nval/nthreads); // align members with cache lines struct alignas(64) vbuf { vector v; }; vector numbers(nthreads); auto keybits = ilog2(max_key)+1; bool last_level = keybits<=8; size_t keyshift = last_level ? 0 : keybits-8; size_t nkeys = min(size_t(1)<>keyshift)]; } }); size_t ofs=0; for (size_t i=0; i keys2(nval); quick_array idx2(nval); execParallel(nval, nthreads, [&](size_t tid, size_t lo, size_t hi) { auto &mybuf(numbers[tid].v); for (size_t i=lo; i>keyshift); idx2[mybuf[loc]] = i; keys2[mybuf[loc]] = keys[i]; ++mybuf[loc]; } }); keybits -= 8; struct Workitem { size_t lo, hi; size_t keybits; bool src_in_2; }; vector items; items.reserve(nkeys); for (size_t i=0; i1) items.emplace_back(Workitem{lo, hi, keybits, true}); else if (hi-lo==1) idx[lo] = idx2[lo]; } function&)> do_work = [&keys,&keys2,&idx,&idx2](const Workitem &item, function &ifunc) { const auto &keys_in(item.src_in_2 ? keys2 : keys); const auto &idx_in(item.src_in_2 ? idx2 : idx); auto &keys_out(item.src_in_2 ? keys : keys2); auto &idx_out(item.src_in_2 ? idx : idx2); auto lo = item.lo; auto hi = item.hi; auto keybits = item.keybits; auto src_in_2 = item.src_in_2; auto nval = hi-lo; if (nval<=1) { if (src_in_2 && (nval==1)) idx[lo] = idx2[lo]; return; } bool last_level = keybits<=8; size_t keyshift = last_level ? 0 : keybits-8; size_t nkeys = min(size_t(1)< numbers; for (size_t i=0; i>keyshift)==(keys_in[lo]>>keyshift)); ++numbers[(keys_in[i+lo]>>keyshift)&keymask]; } if (all_keys_equal) // this subrange is completely sorted { if (item.src_in_2) for (size_t i=lo; i>keyshift)&keymask; idx_out[lo+numbers[loc]] = idx_in[lo+i]; ++numbers[loc]; } if (!item.src_in_2) for (size_t i=lo; i>keyshift)&keymask; keys_out[lo+numbers[loc]] = keys_in[lo+i]; idx_out[lo+numbers[loc]] = idx_in[lo+i]; ++numbers[loc]; } for (size_t i=0; i subprocess = [&subprocess, &do_work](const Workitem &item) { do_work(item, subprocess); }; execWorklist(nthreads, items, [sizelimit, &subprocess, &do_work](const Workitem &item, auto insert) { auto ifunc = (item.hi-item.lo>sizelimit) ? insert : subprocess; do_work(item, ifunc); }); } } using detail_bucket_sort::bucket_sort; using detail_bucket_sort::bucket_sort2; } #endif