/// /// @file fast_div.hpp /// @brief Integer division of small types is much faster than integer /// division of large types on most CPUs. The fast_div(x, y) /// function tries to take advantage of this by casting x and y /// to smaller types (if possible) before doing the division. /// /// If ENABLE_DIV32 is defined we check at runtime if the /// dividend and divisor are < 2^32 and if so we use 32-bit /// integer division instead of 64-bit integer division. On /// most CPUs before 2020 this significantly improves /// performance. /// /// On some new CPUs (such as Intel Cannonlake) 64-bit integer /// division has been improved significantly and runs as fast /// as 32-bit integer division. For such CPUs it is best to /// disable ENABLE_DIV32 (using cmake -DWITH_DIV32=OFF) as this /// avoids runtime checks for (64-bit / 32-bit) divisions. /// /// Copyright (C) 2020 Kim Walisch, /// /// This file is distributed under the BSD License. See the COPYING /// file in the top level directory. /// #ifndef FAST_DIV_HPP #define FAST_DIV_HPP #include #include #include #include namespace { /// If ENABLE_DIV32 is defined: /// /// 1) We use 32-bit integer division for (64-bit / 32-bit) /// if the dividend is < 2^32. /// 2) We use 64-bit integer division for (64-bit / 64-bit). /// 3) We use 64-bit integer division for (128-bit / 64-bit) /// if the dividend is < 2^64. /// #if defined(ENABLE_DIV32) /// Get the next smaller integer type /// and convert it to unsigned. /// make_smaller< uint64_t>::type -> uint32_t. /// make_smaller::type -> uint64_t. /// template struct make_smaller { using type = typename std::conditional::type>::type; }; /// Used for (64-bit / 64-bit) = 64-bit. template typename std::enable_if<(sizeof(X) == sizeof(Y)), X>::type fast_div(X x, Y y) { // Unsigned integer division is usually // faster than signed integer division. using UX = typename std::make_unsigned::type; return (UX) x / (UX) y; } /// Used for (64-bit / 32-bit) = 64-bit. /// Used for (128-bit / 64-bit) = 128-bit. template typename std::enable_if<(sizeof(X) > sizeof(Y)), X>::type fast_div(X x, Y y) { using smaller_t = typename make_smaller::type; if (x <= std::numeric_limits::max()) return (smaller_t) x / (smaller_t) y; else { // Unsigned integer division is usually // faster than signed integer division. using UX = typename std::make_unsigned::type; using UY = typename std::make_unsigned::type; return (UX) x / (UY) y; } } #else /// If ENABLE_DIV32 is not defined: /// /// 1) We use 64-bit integer division for (64-bit / 32-bit). /// 2) We use 64-bit integer division for (64-bit / 64-bit). /// 3) We use 64-bit integer division for (128-bit / 64-bit) /// if the dividend is < 2^64. /// /// Get the next smaller integer type /// and convert it to unsigned. /// make_smaller::type -> uint64_t. /// template struct make_smaller { using type = typename std::make_unsigned< typename std::conditional< sizeof(T) == sizeof(uint64_t) * 2, uint64_t, T>::type>::type; }; /// Used for (64-bit / 32-bit) = 64-bit. /// Used for (64-bit / 64-bit) = 64-bit. template typename std::enable_if<(sizeof(X) >= sizeof(Y) && sizeof(X) <= sizeof(uint64_t)), X>::type fast_div(X x, Y y) { // Unsigned integer division is usually // faster than signed integer division. using UX = typename std::make_unsigned::type; return (UX) x / (UX) y; } /// Used for (128-bit / 64-bit) = 128-bit. template typename std::enable_if<(sizeof(X) > sizeof(Y) && sizeof(X) > sizeof(uint64_t)), X>::type fast_div(X x, Y y) { using smaller_t = typename make_smaller::type; if (x <= std::numeric_limits::max()) return (smaller_t) x / (smaller_t) y; else { // Unsigned integer division is usually // faster than signed integer division. using UX = typename std::make_unsigned::type; using UY = typename std::make_unsigned::type; return (UX) x / (UY) y; } } #endif /// Used for (128-bit / 64-bit) = 64-bit. /// Use this function only when you know for sure /// that the result is < 2^64. /// template typename std::enable_if<(sizeof(X) == sizeof(uint64_t) * 2 && sizeof(Y) <= sizeof(uint64_t)), uint64_t>::type fast_div64(X x, Y y) { #if defined(__x86_64__) && \ (defined(__GNUC__) || defined(__clang__)) // primecount does not need signed division so // we use the unsigned division instruction further // down as DIV is usually faster than IDIV. assert(x >= 0 && y > 0); uint64_t x0 = (uint64_t) x; uint64_t x1 = ((uint64_t*) &x)[1]; uint64_t d = y; // (128-bit / 64-bit) = 64-bit. // When we know the result fits into 64-bit (even // though the numerator is 128-bit) we can use the divq // instruction instead of doing a full 128-bit division. __asm__("divq %[divider]" : "+a"(x0), "+d"(x1) : [divider] "r"(d)); return x0; #else return (uint64_t) fast_div(x, y); #endif } /// Used for (64-bit / 32-bit) = 64-bit. /// Used for (64-bit / 64-bit) = 64-bit. template typename std::enable_if::type fast_div64(X x, Y y) { return (uint64_t) fast_div(x, y); } } // namespace #endif