/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #pragma once #include #include #include #include #include #include #include #include CUB_NAMESPACE_BEGIN template void __global__ DeviceAdjacentDifferenceInitKernel(InputIteratorT first, InputT *result, OffsetT num_tiles, int items_per_tile) { const int tile_idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); AgentDifferenceInitT::Process(tile_idx, first, result, num_tiles, items_per_tile); } template void __global__ DeviceAdjacentDifferenceDifferenceKernel(InputIteratorT input, InputT *first_tile_previous, OutputIteratorT result, DifferenceOpT difference_op, OffsetT num_items) { using ActivePolicyT = typename ChainedPolicyT::ActivePolicy::AdjacentDifferencePolicy; using Agent = AgentDifference; extern __shared__ char shmem[]; typename Agent::TempStorage &storage = *reinterpret_cast(shmem); Agent agent(storage, input, first_tile_previous, result, difference_op, num_items); int tile_idx = static_cast(blockIdx.x); OffsetT tile_base = static_cast(tile_idx) * ActivePolicyT::ITEMS_PER_TILE; agent.Process(tile_idx, tile_base); } template struct DeviceAdjacentDifferencePolicy { using ValueT = typename std::iterator_traits::value_type; //------------------------------------------------------------------------------ // Architecture-specific tuning policies //------------------------------------------------------------------------------ struct Policy300 : ChainedPolicy<300, Policy300, Policy300> { using AdjacentDifferencePolicy = AgentAdjacentDifferencePolicy<128, Nominal8BItemsToItems(7), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE>; }; struct Policy350 : ChainedPolicy<350, Policy350, Policy300> { using AdjacentDifferencePolicy = AgentAdjacentDifferencePolicy<128, Nominal8BItemsToItems(7), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_LDG, BLOCK_STORE_WARP_TRANSPOSE>; }; using MaxPolicy = Policy350; }; template > struct DispatchAdjacentDifference : public SelectedPolicy { using InputT = typename std::iterator_traits::value_type; using OutputT = detail::invoke_result_t; void *d_temp_storage; std::size_t &temp_storage_bytes; InputIteratorT d_input; OutputIteratorT d_output; OffsetT num_items; DifferenceOpT difference_op; cudaStream_t stream; bool debug_synchronous; CUB_RUNTIME_FUNCTION __forceinline__ DispatchAdjacentDifference(void *d_temp_storage, std::size_t &temp_storage_bytes, InputIteratorT d_input, OutputIteratorT d_output, OffsetT num_items, DifferenceOpT difference_op, cudaStream_t stream, bool debug_synchronous) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_input(d_input) , d_output(d_output) , num_items(num_items) , difference_op(difference_op) , stream(stream) , debug_synchronous(debug_synchronous) {} /// Invocation template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Invoke() { using AdjacentDifferencePolicyT = typename ActivePolicyT::AdjacentDifferencePolicy; using MaxPolicyT = typename DispatchAdjacentDifference::MaxPolicy; using AgentDifferenceT = AgentDifference; cudaError error = cudaSuccess; do { const int tile_size = AdjacentDifferencePolicyT::ITEMS_PER_TILE; const int num_tiles = static_cast(DivideAndRoundUp(num_items, tile_size)); int shmem_size = AgentDifferenceT::SHARED_MEMORY_SIZE; std::size_t first_tile_previous_size = InPlace * num_tiles * sizeof(InputT); void *allocations[1] = {nullptr}; std::size_t allocation_sizes[1] = {first_tile_previous_size}; if (InPlace) { if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) { break; } if (d_temp_storage == nullptr) { // Return if the caller is simply requesting the size of the storage // allocation if (temp_storage_bytes == 0) { temp_storage_bytes = 1; } break; } } if (num_items == OffsetT{}) { break; } auto first_tile_previous = reinterpret_cast(allocations[0]); if (InPlace) { using AgentDifferenceInitT = AgentDifferenceInit; const int init_block_size = AgentDifferenceInitT::BLOCK_THREADS; const int init_grid_size = DivideAndRoundUp(num_tiles, init_block_size); if (debug_synchronous) { _CubLog("Invoking DeviceAdjacentDifferenceInitKernel" "<<<%d, %d, 0, %lld>>>()\n", init_grid_size, init_block_size, reinterpret_cast(stream)); } THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, init_block_size, 0, stream) .doit(DeviceAdjacentDifferenceInitKernel, d_input, first_tile_previous, num_tiles, tile_size); if (debug_synchronous) { if (CubDebug(error = SyncStream(stream))) { break; } } // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) { break; } } if (debug_synchronous) { _CubLog("Invoking DeviceAdjacentDifferenceDifferenceKernel" "<<<%d, %d, 0, %lld>>>()\n", num_tiles, AdjacentDifferencePolicyT::BLOCK_THREADS, reinterpret_cast(stream)); } THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( num_tiles, AdjacentDifferencePolicyT::BLOCK_THREADS, shmem_size, stream) .doit(DeviceAdjacentDifferenceDifferenceKernel, d_input, first_tile_previous, d_output, difference_op, num_items); if (debug_synchronous) { if (CubDebug(error = SyncStream(stream))) { break; } } // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) { break; } } while (0); return error; } CUB_RUNTIME_FUNCTION static cudaError_t Dispatch(void *d_temp_storage, std::size_t &temp_storage_bytes, InputIteratorT d_input, OutputIteratorT d_output, OffsetT num_items, DifferenceOpT difference_op, cudaStream_t stream, bool debug_synchronous) { using MaxPolicyT = typename DispatchAdjacentDifference::MaxPolicy; cudaError error = cudaSuccess; do { // Get PTX version int ptx_version = 0; if (CubDebug(error = PtxVersion(ptx_version))) { break; } // Create dispatch functor DispatchAdjacentDifference dispatch(d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream, debug_synchronous); // Dispatch to chained policy if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) { break; } } while (0); return error; } }; CUB_NAMESPACE_END