/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* @file
* The cub::BlockAdjacentDifference class provides
* [collective](index.html#sec0) methods for computing the differences
* of adjacent elements partitioned across a CUDA thread block.
*/
#pragma once
#include "../config.cuh"
#include "../util_type.cuh"
#include "../util_ptx.cuh"
CUB_NAMESPACE_BEGIN
/**
* @brief BlockAdjacentDifference provides
* [collective](index.html#sec0) methods for computing the
* differences of adjacent elements partitioned across a CUDA thread
* block.
*
* @ingroup BlockModule
*
* @par Overview
* - BlockAdjacentDifference calculates the differences between adjacent
* elements partitioned across a CUDA thread block. Because the binary
* operation may be noncommutative, there are two sets of methods.
* Methods named SubtractLeft subtract the left element `i - 1` of the
* input sequence from the current element `i`. Methods named SubtractRight
* subtract the current element `i` from the right one `i + 1`:
* @par
* @code
* int values[4]; // [1, 2, 3, 4]
* //...
* int subtract_left_result[4]; <-- [ 1, 1, 1, 1 ]
* int subtract_right_result[4]; <-- [ -1, -1, -1, 4 ]
* @endcode
* - For SubtractLeft, if the left element is out of bounds, the
* output value is assigned to `input[0]` without modification.
* - For SubtractRight, if the right element is out of bounds, the output value
* is assigned to the current input value without modification.
* - The following example under the examples/block folder illustrates usage of
* dynamically shared memory with BlockReduce and how to re-purpose
* the same memory region:
* example_block_reduce_dyn_smem.cu
* This example can be easily adapted to the storage required by
* BlockAdjacentDifference.
*
* @par Snippet
* The code snippet below illustrates how to use @p BlockAdjacentDifference to
* compute the left difference between adjacent elements.
*
* @par
* @code
* #include <cub/cub.cuh>   // or equivalently <cub/block/block_adjacent_difference.cuh>
*
* struct CustomDifference
* {
* template <typename DataType>
* __device__ DataType operator()(DataType &lhs, DataType &rhs)
* {
* return lhs - rhs;
* }
* };
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockAdjacentDifference for a 1D block of
* // 128 threads of type int
* using BlockAdjacentDifferenceT =
* cub::BlockAdjacentDifference<int, 128>;
*
* // Allocate shared memory for BlockAdjacentDifference
* __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
*
* // Collectively compute adjacent_difference
* int result[4];
*
* BlockAdjacentDifferenceT(temp_storage).SubtractLeft(
* thread_data,
* result,
* CustomDifference());
*
* @endcode
* @par
* Suppose the set of input `thread_data` across the block of threads is
* `{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }`.
* The corresponding output `result` in those threads will be
* `{ [4,-2,-1,0], [0,0,0,0], [1,1,0,0], [0,1,-3,3], ... }`.
*
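* @par
* A minimal host-side launch of the kernel above might look like the
* following sketch (the grid size and the kernel arguments are
* illustrative assumptions, not part of the CUB API):
* @code
* // One block of 128 threads; each thread owns 4 consecutive items
* ExampleKernel<<<1, 128>>>(...);
* cudaDeviceSynchronize();
* @endcode
*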
*/
template <typename T,
          int BLOCK_DIM_X,
          int BLOCK_DIM_Y = 1,
          int BLOCK_DIM_Z = 1,
          int PTX_ARCH    = CUB_PTX_ARCH>
class BlockAdjacentDifference
{
private:
/***************************************************************************
* Constants and type definitions
**************************************************************************/
/// Constants
/// The thread block size in threads
static constexpr int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;
/// Shared memory storage layout type (last element from each thread's input)
struct _TempStorage
{
T first_items[BLOCK_THREADS];
T last_items[BLOCK_THREADS];
};
/***************************************************************************
* Utility methods
**************************************************************************/
/// Internal storage allocator
__device__ __forceinline__ _TempStorage& PrivateStorage()
{
__shared__ _TempStorage private_storage;
return private_storage;
}
/// Specialization for when FlagOp has third index param
template <typename FlagOp, bool HAS_PARAM = BinaryOpHasIdxParam<T, FlagOp>::HAS_PARAM>
struct ApplyOp
{
// Apply flag operator
static __device__ __forceinline__ T FlagT(FlagOp flag_op,
const T &a,
const T &b,
int idx)
{
return flag_op(b, a, idx);
}
};
/// Specialization for when FlagOp does not have a third index param
template <typename FlagOp>
struct ApplyOp<FlagOp, false>
{
// Apply flag operator
static __device__ __forceinline__ T FlagT(FlagOp flag_op,
const T &a,
const T &b,
int /*idx*/)
{
return flag_op(b, a);
}
};
/// Templated unrolling of item comparison (inductive case)
template <int ITERATION, int MAX_ITERATIONS>
struct Iterate
{
/**
* Head flags
*
* @param[out] flags Calling thread's discontinuity head_flags
* @param[in] input Calling thread's input items
* @param[out] preds Calling thread's predecessor items
* @param[in] flag_op Binary boolean flag predicate
*/
template <int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
static __device__ __forceinline__ void
FlagHeads(int linear_tid,
FlagT (&flags)[ITEMS_PER_THREAD],
T (&input)[ITEMS_PER_THREAD],
T (&preds)[ITEMS_PER_THREAD],
FlagOp flag_op)
{
preds[ITERATION] = input[ITERATION - 1];
flags[ITERATION] = ApplyOp<FlagOp>::FlagT(
flag_op,
preds[ITERATION],
input[ITERATION],
(linear_tid * ITEMS_PER_THREAD) + ITERATION);
Iterate<ITERATION + 1, MAX_ITERATIONS>::FlagHeads(linear_tid,
flags,
input,
preds,
flag_op);
}
/**
* Tail flags
*
* @param[out] flags Calling thread's discontinuity head_flags
* @param[in] input Calling thread's input items
* @param[in] flag_op Binary boolean flag predicate
*/
template <int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
static __device__ __forceinline__ void
FlagTails(int linear_tid,
FlagT (&flags)[ITEMS_PER_THREAD],
T (&input)[ITEMS_PER_THREAD],
FlagOp flag_op)
{
flags[ITERATION] = ApplyOp<FlagOp>::FlagT(
flag_op,
input[ITERATION],
input[ITERATION + 1],
(linear_tid * ITEMS_PER_THREAD) + ITERATION + 1);
Iterate<ITERATION + 1, MAX_ITERATIONS>::FlagTails(linear_tid,
flags,
input,
flag_op);
}
};
/// Templated unrolling of item comparison (termination case)
template <int MAX_ITERATIONS>
struct Iterate<MAX_ITERATIONS, MAX_ITERATIONS>
{
// Head flags
template <int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
static __device__ __forceinline__ void
FlagHeads(int /*linear_tid*/,
FlagT (&/*flags*/)[ITEMS_PER_THREAD],
T (&/*input*/)[ITEMS_PER_THREAD],
T (&/*preds*/)[ITEMS_PER_THREAD],
FlagOp /*flag_op*/)
{}
// Tail flags
template <int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
static __device__ __forceinline__ void
FlagTails(int /*linear_tid*/,
FlagT (&/*flags*/)[ITEMS_PER_THREAD],
T (&/*input*/)[ITEMS_PER_THREAD],
FlagOp /*flag_op*/)
{}
};
/***************************************************************************
* Thread fields
**************************************************************************/
/// Shared storage reference
_TempStorage &temp_storage;
/// Linear thread-id
unsigned int linear_tid;
public:
/// \smemstorage{BlockAdjacentDifference}
struct TempStorage : Uninitialized<_TempStorage> {};
/***********************************************************************//**
* @name Collective constructors
**************************************************************************/
//@{
/**
* @brief Collective constructor using a private static allocation of shared
* memory as temporary storage.
*/
__device__ __forceinline__ BlockAdjacentDifference()
: temp_storage(PrivateStorage())
, linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{}
/**
* @brief Collective constructor using the specified memory allocation as
* temporary storage.
*
* @param[in] temp_storage Reference to memory allocation having layout type TempStorage
*/
__device__ __forceinline__ BlockAdjacentDifference(TempStorage &temp_storage)
: temp_storage(temp_storage.Alias())
, linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{}
//@} end member group
/***********************************************************************//**
* @name Read left operations
**************************************************************************/
//@{
/**
* @brief Subtracts the left element of each adjacent pair of elements
* partitioned across a CUDA thread block.
*
* @par
* - \rowmajor
* - \smemreuse
*
* @par Snippet
* The code snippet below illustrates how to use @p BlockAdjacentDifference
* to compute the left difference between adjacent elements.
*
* @par
* @code
* #include <cub/cub.cuh>   // or equivalently <cub/block/block_adjacent_difference.cuh>
*
* struct CustomDifference
* {
* template <typename DataType>
* __device__ DataType operator()(DataType &lhs, DataType &rhs)
* {
* return lhs - rhs;
* }
* };
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockAdjacentDifference for a 1D block
* // of 128 threads of type int
* using BlockAdjacentDifferenceT =
* cub::BlockAdjacentDifference<int, 128>;
*
* // Allocate shared memory for BlockAdjacentDifference
* __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
*
* // Collectively compute adjacent_difference
* BlockAdjacentDifferenceT(temp_storage).SubtractLeft(
* thread_data,
* thread_data,
* CustomDifference());
*
* @endcode
* @par
* Suppose the set of input `thread_data` across the block of threads is
* `{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }`.
* The corresponding output `result` in those threads will be
* `{ [4,-2,-1,0], [0,0,0,0], [1,1,0,0], [0,1,-3,3], ... }`.
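*
* @par
* Equivalently, for item `i` held by thread `t` (a restatement of the
* behavior above; `left_neighbor` stands for the element immediately to
* the left of `input[i]` across the whole block):
* @code
* output[i] = (t == 0 && i == 0)
*           ? input[0]
*           : difference_op(input[i], left_neighbor);
* @endcode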
*
* @param[out] output
* Calling thread's adjacent difference result
*
* @param[in] input
* Calling thread's input items (may be aliased to @p output)
*
* @param[in] difference_op
* Binary difference operator
*/
template <int ITEMS_PER_THREAD, typename OutputType, typename DifferenceOpT>
__device__ __forceinline__ void
SubtractLeft(T (&input)[ITEMS_PER_THREAD],
OutputType (&output)[ITEMS_PER_THREAD],
DifferenceOpT difference_op)
{
// Share last item
temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
CTA_SYNC();
#pragma unroll
for (int item = ITEMS_PER_THREAD - 1; item > 0; item--)
{
output[item] = difference_op(input[item], input[item - 1]);
}
if (linear_tid == 0)
{
output[0] = input[0];
}
else
{
output[0] = difference_op(input[0],
temp_storage.last_items[linear_tid - 1]);
}
}
/**
* @brief Subtracts the left element of each adjacent pair of elements
* partitioned across a CUDA thread block.
*
* @par
* - \rowmajor
* - \smemreuse
*
* @par Snippet
* The code snippet below illustrates how to use @p BlockAdjacentDifference
* to compute the left difference between adjacent elements.
*
* @par
* @code
* #include <cub/cub.cuh>   // or equivalently <cub/block/block_adjacent_difference.cuh>
*
* struct CustomDifference
* {
* template <typename DataType>
* __device__ DataType operator()(DataType &lhs, DataType &rhs)
* {
* return lhs - rhs;
* }
* };
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockAdjacentDifference for a 1D block of
* // 128 threads of type int
* using BlockAdjacentDifferenceT =
* cub::BlockAdjacentDifference<int, 128>;
*
* // Allocate shared memory for BlockAdjacentDifference
* __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
*
* // The last item in the previous tile:
* int tile_predecessor_item = ...;
*
* // Collectively compute adjacent_difference
* BlockAdjacentDifferenceT(temp_storage).SubtractLeft(
* thread_data,
* thread_data,
* CustomDifference(),
* tile_predecessor_item);
*
* @endcode
* @par
* Suppose the set of input `thread_data` across the block of threads is
* `{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }`
* and that `tile_predecessor_item` is `3`. The corresponding output
* `result` in those threads will be
* `{ [1,-2,-1,0], [0,0,0,0], [1,1,0,0], [0,1,-3,3], ... }`.
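*
* @par
* When a grid iterates over a larger array tile by tile,
* `tile_predecessor_item` is typically the element that immediately
* precedes the current tile in global memory. A sketch under assumed
* names (`d_in` and the 128 x 4 tile shape are illustrative):
* @code
* int tile_base = blockIdx.x * 128 * 4;
* int tile_predecessor_item = (tile_base > 0) ? d_in[tile_base - 1] : 0;
* @endcode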
*
* @param[out] output
* Calling thread's adjacent difference result
*
* @param[in] input
* Calling thread's input items (may be aliased to \p output)
*
* @param[in] difference_op
* Binary difference operator
*
* @param[in] tile_predecessor_item
* [thread 0 only] item which is going to be
* subtracted from the first tile item (input[0] from
* thread 0).
*/
template <int ITEMS_PER_THREAD, typename OutputT, typename DifferenceOpT>
__device__ __forceinline__ void
SubtractLeft(T (&input)[ITEMS_PER_THREAD],
OutputT (&output)[ITEMS_PER_THREAD],
DifferenceOpT difference_op,
T tile_predecessor_item)
{
// Share last item
temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
CTA_SYNC();
#pragma unroll
for (int item = ITEMS_PER_THREAD - 1; item > 0; item--)
{
output[item] = difference_op(input[item], input[item - 1]);
}
// Set flag for first thread-item
if (linear_tid == 0)
{
output[0] = difference_op(input[0], tile_predecessor_item);
}
else
{
output[0] = difference_op(input[0],
temp_storage.last_items[linear_tid - 1]);
}
}
/**
* @brief Subtracts the left element of each adjacent pair within a range
* of elements partitioned across a CUDA thread block.
*
* @par
* - \rowmajor
* - \smemreuse
*
* @par Snippet
* The code snippet below illustrates how to use @p BlockAdjacentDifference to
* compute the left difference between adjacent elements.
*
* @par
* @code
* #include <cub/cub.cuh>   // or equivalently <cub/block/block_adjacent_difference.cuh>
*
* struct CustomDifference
* {
* template <typename DataType>
* __device__ DataType operator()(DataType &lhs, DataType &rhs)
* {
* return lhs - rhs;
* }
* };
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockAdjacentDifference for a 1D block of
* // 128 threads of type int
* using BlockAdjacentDifferenceT =
* cub::BlockAdjacentDifference<int, 128>;
*
* // Allocate shared memory for BlockAdjacentDifference
* __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
*
* // The number of valid items in the thread block
* int valid_items = ...;
*
* // Collectively compute adjacent_difference
* BlockAdjacentDifferenceT(temp_storage).SubtractLeftPartialTile(
*     thread_data,
*     thread_data,
*     CustomDifference(),
*     valid_items);
*
* @endcode
* @par
* Suppose the set of input `thread_data` across the block of threads is
* `{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }` and that
* `valid_items` covers at least the items shown.
* The corresponding output `result` in those threads will be
* `{ [4,-2,-1,0], [0,0,0,0], [1,1,0,0], [0,1,-3,3], ... }`.
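*
* @par
* `valid_items` is usually the number of elements remaining in the last,
* partially full tile. A sketch under assumed names (`num_items` and the
* 128 x 4 tile shape are illustrative):
* @code
* int tile_base   = blockIdx.x * 128 * 4;
* int valid_items = min(num_items - tile_base, 128 * 4);
* @endcode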
*
* @param[out] output
* Calling thread's adjacent difference result
*
* @param[in] input
* Calling thread's input items (may be aliased to \p output)
*
* @param[in] difference_op
* Binary difference operator
*
* @param[in] valid_items
* Number of valid items in thread block
*/
template <int ITEMS_PER_THREAD, typename OutputType, typename DifferenceOpT>
__device__ __forceinline__ void
SubtractLeftPartialTile(T (&input)[ITEMS_PER_THREAD],
OutputType (&output)[ITEMS_PER_THREAD],
DifferenceOpT difference_op,
int valid_items)
{
// Share last item
temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
CTA_SYNC();
if ((linear_tid + 1) * ITEMS_PER_THREAD <= valid_items)
{
#pragma unroll
for (int item = ITEMS_PER_THREAD - 1; item > 0; item--)
{
output[item] = difference_op(input[item], input[item - 1]);
}
}
else
{
#pragma unroll
for (int item = ITEMS_PER_THREAD - 1; item > 0; item--)
{
const int idx = linear_tid * ITEMS_PER_THREAD + item;
if (idx < valid_items)
{
output[item] = difference_op(input[item], input[item - 1]);
}
else
{
output[item] = input[item];
}
}
}
if (linear_tid == 0 || valid_items <= linear_tid * ITEMS_PER_THREAD)
{
output[0] = input[0];
}
else
{
output[0] = difference_op(input[0],
temp_storage.last_items[linear_tid - 1]);
}
}
//@} end member group
/******************************************************************//**
* @name Read right operations
*********************************************************************/
//@{
/**
* @brief Subtracts the right element of each adjacent pair of elements
* partitioned across a CUDA thread block.
*
* @par
* - \rowmajor
* - \smemreuse
*
* @par Snippet
* The code snippet below illustrates how to use @p BlockAdjacentDifference
* to compute the right difference between adjacent elements.
*
* @par
* @code
* #include <cub/cub.cuh>   // or equivalently <cub/block/block_adjacent_difference.cuh>
*
* struct CustomDifference
* {
* template <typename DataType>
* __device__ DataType operator()(DataType &lhs, DataType &rhs)
* {
* return lhs - rhs;
* }
* };
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockAdjacentDifference for a 1D block of
* // 128 threads of type int
* using BlockAdjacentDifferenceT =
* cub::BlockAdjacentDifference<int, 128>;
*
* // Allocate shared memory for BlockAdjacentDifference
* __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
*
* // Collectively compute adjacent_difference
* BlockAdjacentDifferenceT(temp_storage).SubtractRight(
* thread_data,
* thread_data,
* CustomDifference());
*
* @endcode
* @par
* Suppose the set of input `thread_data` across the block of threads is
* `{ ...3], [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4] }`.
* The corresponding output `result` in those threads will be
* `{ ..., [-1,2,1,0], [0,0,0,-1], [-1,0,0,0], [-1,3,-3,4] }`.
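*
* @par
* Equivalently, for item `i` held by thread `t` (a restatement of the
* behavior above; `right_neighbor` stands for the element immediately to
* the right of `input[i]` across the whole block):
* @code
* output[i] = (t == BLOCK_THREADS - 1 && i == ITEMS_PER_THREAD - 1)
*           ? input[i]
*           : difference_op(input[i], right_neighbor);
* @endcode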
*
* @param[out] output
* Calling thread's adjacent difference result
*
* @param[in] input
* Calling thread's input items (may be aliased to \p output)
*
* @param[in] difference_op
* Binary difference operator
*/
template <int ITEMS_PER_THREAD, typename OutputT, typename DifferenceOpT>
__device__ __forceinline__ void
SubtractRight(T (&input)[ITEMS_PER_THREAD],
OutputT (&output)[ITEMS_PER_THREAD],
DifferenceOpT difference_op)
{
// Share first item
temp_storage.first_items[linear_tid] = input[0];
CTA_SYNC();
#pragma unroll
for (int item = 0; item < ITEMS_PER_THREAD - 1; item++)
{
output[item] = difference_op(input[item], input[item + 1]);
}
if (linear_tid == BLOCK_THREADS - 1)
{
output[ITEMS_PER_THREAD - 1] = input[ITEMS_PER_THREAD - 1];
}
else
{
output[ITEMS_PER_THREAD - 1] =
difference_op(input[ITEMS_PER_THREAD - 1],
temp_storage.first_items[linear_tid + 1]);
}
}
/**
* @brief Subtracts the right element of each adjacent pair of elements
* partitioned across a CUDA thread block.
*
* @par
* - \rowmajor
* - \smemreuse
*
* @par Snippet
* The code snippet below illustrates how to use @p BlockAdjacentDifference
* to compute the right difference between adjacent elements.
*
* @par
* @code
* #include <cub/cub.cuh>   // or equivalently <cub/block/block_adjacent_difference.cuh>
*
* struct CustomDifference
* {
* template <typename DataType>
* __device__ DataType operator()(DataType &lhs, DataType &rhs)
* {
* return lhs - rhs;
* }
* };
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockAdjacentDifference for a 1D block of
* // 128 threads of type int
* using BlockAdjacentDifferenceT =
* cub::BlockAdjacentDifference<int, 128>;
*
* // Allocate shared memory for BlockAdjacentDifference
* __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
*
* // The first item in the next tile:
* int tile_successor_item = ...;
*
* // Collectively compute adjacent_difference
* BlockAdjacentDifferenceT(temp_storage).SubtractRight(
* thread_data,
* thread_data,
* CustomDifference(),
* tile_successor_item);
*
* @endcode
* @par
* Suppose the set of input `thread_data` across the block of threads is
* `{ ...3], [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4] }`,
* and that `tile_successor_item` is `3`. The corresponding output `result`
* in those threads will be
* `{ ..., [-1,2,1,0], [0,0,0,-1], [-1,0,0,0], [-1,3,-3,1] }`.
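*
* @par
* When a grid iterates over a larger array tile by tile,
* `tile_successor_item` is typically the element that immediately follows
* the current tile in global memory. A sketch under assumed names
* (`d_in`, `num_items`, and the 128 x 4 tile shape are illustrative):
* @code
* int next_tile_base = (blockIdx.x + 1) * 128 * 4;
* int tile_successor_item = (next_tile_base < num_items)
*                         ? d_in[next_tile_base] : 0;
* @endcode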
*
* @param[out] output
* Calling thread's adjacent difference result
*
* @param[in] input
* Calling thread's input items (may be aliased to @p output)
*
* @param[in] difference_op
* Binary difference operator
*
* @param[in] tile_successor_item
* [thread BLOCK_THREADS - 1 only] item
* which is going to be subtracted from the last tile item
* (input[ITEMS_PER_THREAD - 1] from
* thread BLOCK_THREADS - 1).
*/
template <int ITEMS_PER_THREAD, typename OutputT, typename DifferenceOpT>
__device__ __forceinline__ void
SubtractRight(T (&input)[ITEMS_PER_THREAD],
OutputT (&output)[ITEMS_PER_THREAD],
DifferenceOpT difference_op,
T tile_successor_item)
{
// Share first item
temp_storage.first_items[linear_tid] = input[0];
CTA_SYNC();
// Set flag for last thread-item
T successor_item = (linear_tid == BLOCK_THREADS - 1)
? tile_successor_item // Last thread
: temp_storage.first_items[linear_tid + 1];
#pragma unroll
for (int item = 0; item < ITEMS_PER_THREAD - 1; item++)
{
output[item] = difference_op(input[item], input[item + 1]);
}
output[ITEMS_PER_THREAD - 1] =
difference_op(input[ITEMS_PER_THREAD - 1], successor_item);
}
/**
* @brief Subtracts the right element of each adjacent pair within a range
* of elements partitioned across a CUDA thread block.
*
* @par
* - \rowmajor
* - \smemreuse
*
* @par Snippet
* The code snippet below illustrates how to use @p BlockAdjacentDifference to
* compute the right difference between adjacent elements.
*
* @par
* @code
* #include <cub/cub.cuh>   // or equivalently <cub/block/block_adjacent_difference.cuh>
*
* struct CustomDifference
* {
* template <typename DataType>
* __device__ DataType operator()(DataType &lhs, DataType &rhs)
* {
* return lhs - rhs;
* }
* };
*
* __global__ void ExampleKernel(...)
* {
* // Specialize BlockAdjacentDifference for a 1D block of
* // 128 threads of type int
* using BlockAdjacentDifferenceT =
* cub::BlockAdjacentDifference<int, 128>;
*
* // Allocate shared memory for BlockAdjacentDifference
* __shared__ typename BlockAdjacentDifferenceT::TempStorage temp_storage;
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[4];
* ...
*
* // The number of valid items in the thread block
* int valid_items = ...;
*
* // Collectively compute adjacent_difference
* BlockAdjacentDifferenceT(temp_storage).SubtractRightPartialTile(
* thread_data,
* thread_data,
* CustomDifference(),
* valid_items);
*
* @endcode
* @par
* Suppose the set of input `thread_data` across the block of threads is
* `{ ...3], [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4] }`
* and that `valid_items` is `507`. The corresponding output `result` in
* those threads will be
* `{ ..., [-1,2,1,0], [0,0,0,-1], [-1,0,3,3], [3,4,1,4] }`.
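*
* @par
* Equivalently, for the linear in-tile index `idx` of each item (a
* restatement of the behavior above; `right_neighbor` stands for the
* element at index `idx + 1` across the whole block):
* @code
* output[item] = (idx < valid_items - 1)
*              ? difference_op(input[item], right_neighbor)
*              : input[item];
* @endcode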
*
* @param[out] output
* Calling thread's adjacent difference result
*
* @param[in] input
* Calling thread's input items (may be aliased to @p output)
*
* @param[in] difference_op
* Binary difference operator
*
* @param[in] valid_items
* Number of valid items in thread block
*/
template <int ITEMS_PER_THREAD, typename OutputT, typename DifferenceOpT>
__device__ __forceinline__ void
SubtractRightPartialTile(T (&input)[ITEMS_PER_THREAD],
OutputT (&output)[ITEMS_PER_THREAD],
DifferenceOpT difference_op,
int valid_items)
{
// Share first item
temp_storage.first_items[linear_tid] = input[0];
CTA_SYNC();
if ((linear_tid + 1) * ITEMS_PER_THREAD < valid_items)
{
#pragma unroll
for (int item = 0; item < ITEMS_PER_THREAD - 1; item++)
{
output[item] = difference_op(input[item], input[item + 1]);
}
output[ITEMS_PER_THREAD - 1] =
difference_op(input[ITEMS_PER_THREAD - 1],
temp_storage.first_items[linear_tid + 1]);
}
else
{
#pragma unroll
for (int item = 0; item < ITEMS_PER_THREAD; item++)
{
const int idx = linear_tid * ITEMS_PER_THREAD + item;
// Right element of input[valid_items - 1] is out of bounds.
// According to the API it's copied into output array
// without modification.
if (idx < valid_items - 1)
{
output[item] = difference_op(input[item], input[item + 1]);
}
else
{
output[item] = input[item];
}
}
}
}
//@} end member group
/******************************************************************//**
* @name Head flag operations (deprecated)
*********************************************************************/
//@{
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/**
* \deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeads
* APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft instead.
*/
template <
int ITEMS_PER_THREAD,
typename FlagT,
typename FlagOp>
CUB_DEPRECATED __device__ __forceinline__ void FlagHeads(
FlagT (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags
T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items
FlagOp flag_op) ///< [in] Binary boolean flag predicate
{
// Share last item
temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
CTA_SYNC();
if (linear_tid == 0)
{
// Set flag for first thread-item (preds[0] is undefined)
output[0] = 1;
}
else
{
preds[0] = temp_storage.last_items[linear_tid - 1];
output[0] = ApplyOp<FlagOp>::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
}
// Set output for remaining items
Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, output, input, preds, flag_op);
}
/**
* \deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeads
* APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft instead.
*/
template <int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
CUB_DEPRECATED __device__ __forceinline__ void FlagHeads(
FlagT (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity result
T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items
FlagOp flag_op, ///< [in] Binary boolean flag predicate
T tile_predecessor_item) ///< [in] [thread 0 only] Item with which to compare the first tile item (input[0] from thread 0).
{
// Share last item
temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
CTA_SYNC();
// Set flag for first thread-item
preds[0] = (linear_tid == 0) ?
tile_predecessor_item : // First thread
temp_storage.last_items[linear_tid - 1];
output[0] = ApplyOp<FlagOp>::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
// Set output for remaining items
Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, output, input, preds, flag_op);
}
#endif // DOXYGEN_SHOULD_SKIP_THIS
/**
* \deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeads
* APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft instead.
*/
template <int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
CUB_DEPRECATED __device__ __forceinline__ void
FlagHeads(FlagT (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity result
T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
FlagOp flag_op) ///< [in] Binary boolean flag predicate
{
T preds[ITEMS_PER_THREAD];
FlagHeads(output, input, preds, flag_op);
}
/**
* \deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeads
* APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft instead.
*/
template <int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
CUB_DEPRECATED __device__ __forceinline__ void
FlagHeads(FlagT (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity result
T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
FlagOp flag_op, ///< [in] Binary boolean flag predicate
T tile_predecessor_item) ///< [in] [thread 0 only] Item with which to compare the first tile item (input[0] from thread 0).
{
T preds[ITEMS_PER_THREAD];
FlagHeads(output, input, preds, flag_op, tile_predecessor_item);
}
/**
* \deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagTails
* APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractRight instead.
*/
template <
int ITEMS_PER_THREAD,
typename FlagT,
typename FlagOp>
CUB_DEPRECATED __device__ __forceinline__ void FlagTails(
FlagT (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity result
T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
FlagOp flag_op) ///< [in] Binary boolean flag predicate
{
// Share first item
temp_storage.first_items[linear_tid] = input[0];
CTA_SYNC();
// Set flag for last thread-item
output[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
1 : // Last thread
ApplyOp<FlagOp>::FlagT(
flag_op,
input[ITEMS_PER_THREAD - 1],
temp_storage.first_items[linear_tid + 1],
(linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
// Set output for remaining items
Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, output, input, flag_op);
}
/**
* \deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagTails
* APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractRight instead.
*/
template <
int ITEMS_PER_THREAD,
typename FlagT,
typename FlagOp>
CUB_DEPRECATED __device__ __forceinline__ void FlagTails(
FlagT (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity result
T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
FlagOp flag_op, ///< [in] Binary boolean flag predicate
T tile_successor_item) ///< [in] [thread BLOCK_THREADS - 1 only] Item with which to compare the last tile item (input[ITEMS_PER_THREAD - 1] from thread BLOCK_THREADS - 1).
{
// Share first item
temp_storage.first_items[linear_tid] = input[0];
CTA_SYNC();
// Set flag for last thread-item
T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
tile_successor_item : // Last thread
temp_storage.first_items[linear_tid + 1];
output[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
flag_op,
input[ITEMS_PER_THREAD - 1],
successor_item,
(linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
// Set output for remaining items
Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, output, input, flag_op);
}
/**
* \deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeadsAndTails
* APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft or
* cub::BlockAdjacentDifference::SubtractRight instead.
*/
template <
int ITEMS_PER_THREAD,
typename FlagT,
typename FlagOp>
CUB_DEPRECATED __device__ __forceinline__ void FlagHeadsAndTails(
FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags
FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags
T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
FlagOp flag_op) ///< [in] Binary boolean flag predicate
{
// Share first and last items
temp_storage.first_items[linear_tid] = input[0];
temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
CTA_SYNC();
T preds[ITEMS_PER_THREAD];
// Set flag for first thread-item
preds[0] = temp_storage.last_items[linear_tid - 1];
if (linear_tid == 0)
{
head_flags[0] = 1;
}
else
{
head_flags[0] = ApplyOp<FlagOp>::FlagT(
flag_op,
preds[0],
input[0],
linear_tid * ITEMS_PER_THREAD);
}
// Set flag for last thread-item
tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
1 : // Last thread
ApplyOp<FlagOp>::FlagT(
flag_op,
input[ITEMS_PER_THREAD - 1],
temp_storage.first_items[linear_tid + 1],
(linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
// Set head_flags for remaining items
Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
// Set tail_flags for remaining items
Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
}
/**
* \deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeadsAndTails
* APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft or
* cub::BlockAdjacentDifference::SubtractRight instead.
*/
template <
int ITEMS_PER_THREAD,
typename FlagT,
typename FlagOp>
CUB_DEPRECATED __device__ __forceinline__ void FlagHeadsAndTails(
FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags
FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags
T tile_successor_item, ///< [in] [thread BLOCK_THREADS - 1 only] Item with which to compare the last tile item (input[ITEMS_PER_THREAD - 1] from thread BLOCK_THREADS - 1).
T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
FlagOp flag_op) ///< [in] Binary boolean flag predicate
{
// Share first and last items
temp_storage.first_items[linear_tid] = input[0];
temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
CTA_SYNC();
T preds[ITEMS_PER_THREAD];
// Set flag for first thread-item
if (linear_tid == 0)
{
head_flags[0] = 1;
}
else
{
preds[0] = temp_storage.last_items[linear_tid - 1];
head_flags[0] = ApplyOp<FlagOp>::FlagT(
flag_op,
preds[0],
input[0],
linear_tid * ITEMS_PER_THREAD);
}
// Set flag for last thread-item
T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
tile_successor_item : // Last thread
temp_storage.first_items[linear_tid + 1];
tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
flag_op,
input[ITEMS_PER_THREAD - 1],
successor_item,
(linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
// Set head_flags for remaining items
Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
// Set tail_flags for remaining items
Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
}
/**
* \deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeadsAndTails
* APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft or
* cub::BlockAdjacentDifference::SubtractRight instead.
*/
template <
int ITEMS_PER_THREAD,
typename FlagT,
typename FlagOp>
CUB_DEPRECATED __device__ __forceinline__ void FlagHeadsAndTails(
FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags
T tile_predecessor_item, ///< [in] [thread 0 only] Item with which to compare the first tile item (input[0] from thread 0).
FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags
T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
FlagOp flag_op) ///< [in] Binary boolean flag predicate
{
// Share first and last items
temp_storage.first_items[linear_tid] = input[0];
temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
CTA_SYNC();
T preds[ITEMS_PER_THREAD];
// Set flag for first thread-item
preds[0] = (linear_tid == 0) ?
tile_predecessor_item : // First thread
temp_storage.last_items[linear_tid - 1];
head_flags[0] = ApplyOp<FlagOp>::FlagT(
flag_op,
preds[0],
input[0],
linear_tid * ITEMS_PER_THREAD);
// Set flag for last thread-item
tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
1 : // Last thread
ApplyOp<FlagOp>::FlagT(
flag_op,
input[ITEMS_PER_THREAD - 1],
temp_storage.first_items[linear_tid + 1],
(linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
// Set head_flags for remaining items
Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
// Set tail_flags for remaining items
Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
}
/**
* \deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeadsAndTails
* APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft or
* cub::BlockAdjacentDifference::SubtractRight instead.
*/
template <
int ITEMS_PER_THREAD,
typename FlagT,
typename FlagOp>
CUB_DEPRECATED __device__ __forceinline__ void FlagHeadsAndTails(
FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags
T tile_predecessor_item, ///< [in] [thread 0 only] Item with which to compare the first tile item (input[0] from thread 0).
FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags
T tile_successor_item, ///< [in] [thread BLOCK_THREADS - 1 only] Item with which to compare the last tile item (input[ITEMS_PER_THREAD - 1] from thread BLOCK_THREADS - 1).
T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
FlagOp flag_op) ///< [in] Binary boolean flag predicate
{
// Share first and last items
temp_storage.first_items[linear_tid] = input[0];
temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
CTA_SYNC();
T preds[ITEMS_PER_THREAD];
// Set flag for first thread-item
preds[0] = (linear_tid == 0) ?
tile_predecessor_item : // First thread
temp_storage.last_items[linear_tid - 1];
head_flags[0] = ApplyOp<FlagOp>::FlagT(
flag_op,
preds[0],
input[0],
linear_tid * ITEMS_PER_THREAD);
// Set flag for last thread-item
T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
tile_successor_item : // Last thread
temp_storage.first_items[linear_tid + 1];
tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
flag_op,
input[ITEMS_PER_THREAD - 1],
successor_item,
(linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
// Set head_flags for remaining items
Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
// Set tail_flags for remaining items
Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
}
};
CUB_NAMESPACE_END