/******************************************************************************
 * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#pragma once

#include "../config.cuh"
#include "../util_type.cuh"
#include "../util_namespace.cuh"
#include "../block/block_load.cuh"
#include "../block/block_store.cuh"
#include "../block/block_adjacent_difference.cuh"

#include <thrust/system/cuda/detail/core/util.h>

CUB_NAMESPACE_BEGIN

/// Parameterizable tuning policy type for AgentDifference
template <int _BLOCK_THREADS,
          int _ITEMS_PER_THREAD = 1,
          cub::BlockLoadAlgorithm _LOAD_ALGORITHM = cub::BLOCK_LOAD_DIRECT,
          cub::CacheLoadModifier _LOAD_MODIFIER = cub::LOAD_LDG,
          cub::BlockStoreAlgorithm _STORE_ALGORITHM = cub::BLOCK_STORE_DIRECT>
struct AgentAdjacentDifferencePolicy
{
  static constexpr int BLOCK_THREADS    = _BLOCK_THREADS;
  static constexpr int ITEMS_PER_THREAD = _ITEMS_PER_THREAD;
  static constexpr int ITEMS_PER_TILE   = BLOCK_THREADS * ITEMS_PER_THREAD;

  static constexpr cub::BlockLoadAlgorithm LOAD_ALGORITHM   = _LOAD_ALGORITHM;
  static constexpr cub::CacheLoadModifier LOAD_MODIFIER     = _LOAD_MODIFIER;
  static constexpr cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
};

/// AgentDifference computes one tile of the adjacent-difference result per
/// thread block: load a tile, apply the block-wide subtract collective with
/// the appropriate neighbor item, and store the result.
template <typename Policy,
          typename InputIteratorT,
          typename OutputIteratorT,
          typename DifferenceOpT,
          typename OffsetT,
          typename InputT,
          typename OutputT,
          bool InPlace,
          bool ReadLeft>
struct AgentDifference
{
  using LoadIt =
    typename THRUST_NS_QUALIFIER::cuda_cub::core::LoadIterator<Policy, InputIteratorT>::type;

  using BlockLoad  = typename cub::BlockLoadType<Policy, LoadIt>::type;
  using BlockStore = typename cub::BlockStoreType<Policy, OutputIteratorT, OutputT>::type;

  using BlockAdjacentDifferenceT =
    cub::BlockAdjacentDifference<InputT, Policy::BLOCK_THREADS>;

  union _TempStorage
  {
    typename BlockLoad::TempStorage load;
    typename BlockStore::TempStorage store;
    typename BlockAdjacentDifferenceT::TempStorage adjacent_difference;
  };

  /// Alias wrapper allowing storage to be unioned
  struct TempStorage : Uninitialized<_TempStorage> {};

  static constexpr int BLOCK_THREADS      = Policy::BLOCK_THREADS;
  static constexpr int ITEMS_PER_THREAD   = Policy::ITEMS_PER_THREAD;
  static constexpr int ITEMS_PER_TILE     = Policy::ITEMS_PER_TILE;
  static constexpr int SHARED_MEMORY_SIZE = static_cast<int>(sizeof(TempStorage));

  _TempStorage &temp_storage;
  InputIteratorT input_it;
  LoadIt load_it;

  // Boundary items staged by AgentDifferenceInit; read instead of the input
  // when operating in place, since a neighboring tile's data may already
  // have been overwritten.
  InputT *first_tile_previous;
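  // Output iterator, user-provided difference operator, and total number of
  // input items, captured at construction for use by the tile methods below.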
  OutputIteratorT result;
  DifferenceOpT difference_op;
  OffsetT num_items;

  __device__ __forceinline__ AgentDifference(TempStorage &temp_storage,
                                             InputIteratorT input_it,
                                             InputT *first_tile_previous,
                                             OutputIteratorT result,
                                             DifferenceOpT difference_op,
                                             OffsetT num_items)
      : temp_storage(temp_storage.Alias())
      , input_it(input_it)
      , load_it(THRUST_NS_QUALIFIER::cuda_cub::core::make_load_iterator(Policy(), input_it))
      , first_tile_previous(first_tile_previous)
      , result(result)
      , difference_op(difference_op)
      , num_items(num_items)
  {}

  template <bool IS_LAST_TILE,
            bool IS_FIRST_TILE>
  __device__ __forceinline__ void consume_tile_impl(int num_remaining,
                                                    int tile_idx,
                                                    OffsetT tile_base)
  {
    InputT input[ITEMS_PER_THREAD];
    OutputT output[ITEMS_PER_THREAD];

    if (IS_LAST_TILE)
    {
      // Fill last elements with the first element
      // because collectives are not suffix guarded
      BlockLoad(temp_storage.load)
        .Load(load_it + tile_base, input, num_remaining, *(load_it + tile_base));
    }
    else
    {
      BlockLoad(temp_storage.load).Load(load_it + tile_base, input);
    }

    CTA_SYNC();

    if (ReadLeft)
    {
      if (IS_FIRST_TILE)
      {
        BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
          .SubtractLeft(input, output, difference_op);
      }
      else
      {
        // In-place operation must read the staged copy of the predecessor,
        // which the preceding tile may already have overwritten.
        InputT tile_prev_input = InPlace ? first_tile_previous[tile_idx]
                                         : *(input_it + tile_base - 1);

        BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
          .SubtractLeft(input, output, difference_op, tile_prev_input);
      }
    }
    else
    {
      if (IS_LAST_TILE)
      {
        BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
          .SubtractRightPartialTile(input, output, difference_op, num_remaining);
      }
      else
      {
        InputT tile_next_input = InPlace ? first_tile_previous[tile_idx]
                                         : *(input_it + tile_base + ITEMS_PER_TILE);

        BlockAdjacentDifferenceT(temp_storage.adjacent_difference)
          .SubtractRight(input, output, difference_op, tile_next_input);
      }
    }

    CTA_SYNC();

    if (IS_LAST_TILE)
    {
      BlockStore(temp_storage.store)
        .Store(result + tile_base, output, num_remaining);
    }
    else
    {
      BlockStore(temp_storage.store).Store(result + tile_base, output);
    }
  }

  template <bool IS_LAST_TILE>
  __device__ __forceinline__ void consume_tile(int num_remaining,
                                               int tile_idx,
                                               OffsetT tile_base)
  {
    if (tile_idx == 0)
    {
      consume_tile_impl<IS_LAST_TILE, true>(num_remaining, tile_idx, tile_base);
    }
    else
    {
      consume_tile_impl<IS_LAST_TILE, false>(num_remaining, tile_idx, tile_base);
    }
  }

  __device__ __forceinline__ void Process(int tile_idx,
                                          OffsetT tile_base)
  {
    OffsetT num_remaining = num_items - tile_base;

    if (num_remaining > ITEMS_PER_TILE) // not a last tile
    {
      consume_tile<false>(num_remaining, tile_idx, tile_base);
    }
    else
    {
      consume_tile<true>(num_remaining, tile_idx, tile_base);
    }
  }
};

/// AgentDifferenceInit stages each tile's boundary item into `result` so that
/// an in-place adjacent-difference pass can still read neighbor values that
/// other tiles will overwrite.
template <typename InputIteratorT,
          typename InputT,
          typename OffsetT,
          bool ReadLeft>
struct AgentDifferenceInit
{
  static constexpr int BLOCK_THREADS = 128;

  static __device__ __forceinline__ void Process(int tile_idx,
                                                 InputIteratorT first,
                                                 InputT *result,
                                                 OffsetT num_tiles,
                                                 int items_per_tile)
  {
    OffsetT tile_base = static_cast<OffsetT>(tile_idx) * items_per_tile;

    if (tile_base > 0 && tile_idx < num_tiles)
    {
      if (ReadLeft)
      {
        // Tile i needs the item just before its base (its left neighbor).
        result[tile_idx] = first[tile_base - 1];
      }
      else
      {
        // Tile i - 1 needs the first item of tile i (its right neighbor).
        result[tile_idx - 1] = first[tile_base];
      }
    }
  }
};

CUB_NAMESPACE_END
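// ---------------------------------------------------------------------------
// Usage sketch (illustrative only; not part of this header). A dispatch-layer
// kernel typically runs one AgentDifference per thread block, roughly as
// below. The kernel name and parameter spelling here are assumptions; see
// CUB's dispatch layer (dispatch_adjacent_difference.cuh) for the real driver.
//
//   template <typename PolicyT, typename InputIteratorT, typename OutputIteratorT,
//             typename DifferenceOpT, typename OffsetT, typename InputT,
//             typename OutputT, bool InPlace, bool ReadLeft>
//   __global__ void AdjacentDifferenceKernel(InputIteratorT input,
//                                            InputT *first_tile_previous,
//                                            OutputIteratorT result,
//                                            DifferenceOpT difference_op,
//                                            OffsetT num_items)
//   {
//     using AgentT = AgentDifference<PolicyT, InputIteratorT, OutputIteratorT,
//                                    DifferenceOpT, OffsetT, InputT, OutputT,
//                                    InPlace, ReadLeft>;
//
//     // One shared-memory TempStorage per block, aliased across the agent's
//     // load/store/adjacent-difference collectives.
//     __shared__ typename AgentT::TempStorage temp_storage;
//
//     AgentT agent(temp_storage, input, first_tile_previous, result,
//                  difference_op, num_items);
//
//     int tile_idx = static_cast<int>(blockIdx.x);
//     OffsetT tile_base = static_cast<OffsetT>(tile_idx) * AgentT::ITEMS_PER_TILE;
//     agent.Process(tile_idx, tile_base);
//   }
// ---------------------------------------------------------------------------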