/******************************************************************************
 * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * @file
 * Operations for writing linear segments of data from the CUDA warp
 */

#pragma once

#include <iterator>
#include <type_traits>

#include <cub/block/block_store.cuh>
#include <cub/config.cuh>
#include <cub/util_ptx.cuh>
#include <cub/util_type.cuh>
#include <cub/warp/warp_exchange.cuh>

CUB_NAMESPACE_BEGIN


/**
 * @brief cub::WarpStoreAlgorithm enumerates alternative algorithms for
 *        cub::WarpStore to write a blocked arrangement of items across a CUDA
 *        warp to a linear segment of memory.
 */
enum WarpStoreAlgorithm
{
  /**
   * @par Overview
   * A [blocked arrangement](index.html#sec5sec3) of data is written
   * directly to memory.
   *
   * @par Performance Considerations
   * The utilization of memory transactions (coalescing) decreases as the
   * access stride between threads increases (i.e., the number of items per
   * thread).
   */
  WARP_STORE_DIRECT,

  /**
   * @par Overview
   * A [striped arrangement](index.html#sec5sec3) of data is written
   * directly to memory.
   *
   * @par Performance Considerations
   * The utilization of memory transactions (coalescing) remains high
   * regardless of the number of items written per thread.
   */
  WARP_STORE_STRIPED,

  /**
   * @par Overview
   * A [blocked arrangement](index.html#sec5sec3) of data is written
   * directly to memory using CUDA's built-in vectorized stores as a
   * coalescing optimization. For example, st.global.v4.s32 instructions
   * will be generated when @p T = @p int and @p ITEMS_PER_THREAD % 4 == 0.
   *
   * @par Performance Considerations
   * - The utilization of memory transactions (coalescing) remains high until
   *   the access stride between threads (i.e., the number of items per
   *   thread) exceeds the maximum vector store width (typically 4 items or
   *   64B, whichever is lower).
   * - The following conditions will prevent vectorization and cause writes
   *   to fall back to cub::WARP_STORE_DIRECT:
   *   - @p ITEMS_PER_THREAD is odd
   *   - The @p OutputIteratorT is not a simple pointer type
   *   - The block output offset is not quadword-aligned
   *   - The data type @p T is not a built-in primitive or CUDA vector type
   *     (e.g., @p short, @p int2, @p double, @p float2, etc.)
   */
  WARP_STORE_VECTORIZE,

  /**
   * @par Overview
   * A [blocked arrangement](index.html#sec5sec3) is locally
   * transposed and then efficiently written to memory as a
   * [striped arrangement](index.html#sec5sec3).
   *
   * @par Performance Considerations
   * - The utilization of memory transactions (coalescing) remains high
   *   regardless of the number of items written per thread.
   * - The local reordering incurs slightly longer latency and lower
   *   throughput than the direct cub::WARP_STORE_DIRECT and
   *   cub::WARP_STORE_VECTORIZE alternatives.
   */
  WARP_STORE_TRANSPOSE
};


/**
 * @brief The WarpStore class provides [collective](index.html#sec0)
 *        data movement methods for writing a
 *        [blocked arrangement](index.html#sec5sec3) of items partitioned
 *        across a CUDA warp to a linear segment of memory.
 *
 * @ingroup WarpModule
 * @ingroup UtilIo
 *
 * @tparam T
 *   The type of data to be written.
 *
 * @tparam ITEMS_PER_THREAD
 *   The number of consecutive items partitioned onto each thread.
 *
 * @tparam ALGORITHM
 *   [optional] cub::WarpStoreAlgorithm tuning policy enumeration.
 *   Default: cub::WARP_STORE_DIRECT.
 *
 * @tparam LOGICAL_WARP_THREADS
 *   [optional] The number of threads per "logical" warp (may be less than
 *   the number of hardware warp threads). Default is the warp size of the
 *   targeted CUDA compute-capability (e.g., 32 threads for SM86). Must be a
 *   power of two.
 *
 * @tparam PTX_ARCH
 *   [optional] \ptxversion
 *
 * @par Overview
 * - The WarpStore class provides a single data movement abstraction that can
 *   be specialized to implement different cub::WarpStoreAlgorithm strategies.
 *   This facilitates different performance policies for different
 *   architectures, data types, granularity sizes, etc.
 * - WarpStore can be optionally specialized by different data movement
 *   strategies:
 *   -# cub::WARP_STORE_DIRECT. A [blocked arrangement](index.html#sec5sec3)
 *      of data is written directly to memory.
 *      [More...](@ref cub::WarpStoreAlgorithm)
 *   -# cub::WARP_STORE_STRIPED. A [striped arrangement](index.html#sec5sec3)
 *      of data is written directly to memory.
 *      [More...](@ref cub::WarpStoreAlgorithm)
 *   -# cub::WARP_STORE_VECTORIZE. A [blocked arrangement](index.html#sec5sec3)
 *      of data is written directly to memory using CUDA's built-in vectorized
 *      stores as a coalescing optimization.
 *      [More...](@ref cub::WarpStoreAlgorithm)
 *   -# cub::WARP_STORE_TRANSPOSE. A [blocked arrangement](index.html#sec5sec3)
 *      is locally transposed into a [striped arrangement](index.html#sec5sec3)
 *      which is then written to memory.
 *      [More...](@ref cub::WarpStoreAlgorithm)
 * - \rowmajor
 *
 * @par A Simple Example
 * @par
 * The code snippet below illustrates the storing of a "blocked" arrangement
 * of 64 integers across 16 threads (where each thread owns 4 consecutive
 * items) into a linear segment of memory. The store is specialized for
 * @p WARP_STORE_TRANSPOSE, meaning items are locally reordered among threads
 * so that memory references will be efficiently coalesced using a
 * warp-striped access pattern.
 * @par
 * @code
 * #include <cub/warp/warp_store.cuh>   // or equivalently <warp_store.cuh>
 *
 * __global__ void ExampleKernel(int *d_data, ...)
 * {
 *     constexpr int warp_threads = 16;
 *     constexpr int block_threads = 256;
 *     constexpr int items_per_thread = 4;
 *
 *     // Specialize WarpStore for a virtual warp of 16 threads owning 4 integer items each
 *     using WarpStoreT = WarpStore<int,
 *                                  items_per_thread,
 *                                  WARP_STORE_TRANSPOSE,
 *                                  warp_threads>;
 *
 *     constexpr int warps_in_block = block_threads / warp_threads;
 *     constexpr int tile_size = items_per_thread * warp_threads;
 *     const int warp_id = static_cast<int>(threadIdx.x) / warp_threads;
 *
 *     // Allocate shared memory for WarpStore
 *     __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block];
 *
 *     // Obtain a segment of consecutive items that are blocked across threads
 *     int thread_data[4];
 *     ...
 *
 *     // Store items to linear memory
 *     WarpStoreT(temp_storage[warp_id]).Store(d_data + warp_id * tile_size, thread_data);
 * @endcode
 * @par
 * Suppose the set of @p thread_data across the warp threads is
 * { [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }.
 * The output @p d_data will be 0, 1, 2, 3, 4, 5, ....
 */
template <typename           T,
          int                ITEMS_PER_THREAD,
          WarpStoreAlgorithm ALGORITHM            = WARP_STORE_DIRECT,
          int                LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS,
          int                PTX_ARCH             = CUB_PTX_ARCH>
class WarpStore
{
  static_assert(PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE,
                "LOGICAL_WARP_THREADS must be a power of two");

  constexpr static bool IS_ARCH_WARP = LOGICAL_WARP_THREADS ==
                                       CUB_WARP_THREADS(PTX_ARCH);

private:

  /// Store helper
  template <WarpStoreAlgorithm _POLICY, int DUMMY>
  struct StoreInternal;

  template <int DUMMY>
  struct StoreInternal<WARP_STORE_DIRECT, DUMMY>
  {
    typedef NullType TempStorage;

    int linear_tid;

    __device__ __forceinline__ StoreInternal(TempStorage & /*temp_storage*/,
                                             int linear_tid)
        : linear_tid(linear_tid)
    {}

    template <typename OutputIteratorT>
    __device__ __forceinline__ void Store(OutputIteratorT block_itr,
                                          T (&items)[ITEMS_PER_THREAD])
    {
      StoreDirectBlocked(linear_tid, block_itr, items);
    }

    template <typename OutputIteratorT>
    __device__ __forceinline__ void Store(OutputIteratorT block_itr,
                                          T (&items)[ITEMS_PER_THREAD],
                                          int valid_items)
    {
      StoreDirectBlocked(linear_tid, block_itr, items, valid_items);
    }
  };

  template <int DUMMY>
  struct StoreInternal<WARP_STORE_STRIPED, DUMMY>
  {
    typedef NullType TempStorage;

    int linear_tid;

    __device__ __forceinline__ StoreInternal(TempStorage & /*temp_storage*/,
                                             int linear_tid)
        : linear_tid(linear_tid)
    {}

    template <typename OutputIteratorT>
    __device__ __forceinline__ void Store(OutputIteratorT block_itr,
                                          T (&items)[ITEMS_PER_THREAD])
    {
      StoreDirectStriped<LOGICAL_WARP_THREADS>(linear_tid, block_itr, items);
    }

    template <typename OutputIteratorT>
    __device__ __forceinline__ void Store(OutputIteratorT block_itr,
                                          T (&items)[ITEMS_PER_THREAD],
                                          int valid_items)
    {
      StoreDirectStriped<LOGICAL_WARP_THREADS>(linear_tid,
                                               block_itr,
                                               items,
                                               valid_items);
    }
  };

  template <int DUMMY>
  struct StoreInternal<WARP_STORE_VECTORIZE, DUMMY>
  {
    typedef NullType TempStorage;

    int linear_tid;

    __device__ __forceinline__ StoreInternal(TempStorage & /*temp_storage*/,
                                             int linear_tid)
        : linear_tid(linear_tid)
    {}

    __device__ __forceinline__ void Store(T *block_ptr,
                                          T (&items)[ITEMS_PER_THREAD])
    {
      StoreDirectBlockedVectorized(linear_tid, block_ptr, items);
    }

    template <typename OutputIteratorT>
    __device__ __forceinline__ void Store(OutputIteratorT block_itr,
                                          T (&items)[ITEMS_PER_THREAD])
    {
      StoreDirectBlocked(linear_tid, block_itr, items);
    }

    template <typename OutputIteratorT>
    __device__ __forceinline__ void Store(OutputIteratorT block_itr,
                                          T (&items)[ITEMS_PER_THREAD],
                                          int valid_items)
    {
      StoreDirectBlocked(linear_tid, block_itr, items, valid_items);
    }
  };

  template <int DUMMY>
  struct StoreInternal<WARP_STORE_TRANSPOSE, DUMMY>
  {
    using WarpExchangeT =
      WarpExchange<T, ITEMS_PER_THREAD, LOGICAL_WARP_THREADS, PTX_ARCH>;

    struct _TempStorage : WarpExchangeT::TempStorage
    {};

    struct TempStorage : Uninitialized<_TempStorage> {};

    _TempStorage &temp_storage;

    int linear_tid;

    __device__ __forceinline__ StoreInternal(TempStorage &temp_storage,
                                             int linear_tid)
        : temp_storage(temp_storage.Alias())
        , linear_tid(linear_tid)
    {}

    template <typename OutputIteratorT>
    __device__ __forceinline__ void Store(OutputIteratorT block_itr,
                                          T (&items)[ITEMS_PER_THREAD])
    {
      WarpExchangeT(temp_storage).BlockedToStriped(items, items);
      StoreDirectStriped<LOGICAL_WARP_THREADS>(linear_tid,
                                               block_itr,
                                               items);
    }

    template <typename OutputIteratorT>
    __device__ __forceinline__ void Store(OutputIteratorT block_itr,
                                          T (&items)[ITEMS_PER_THREAD],
                                          int valid_items)
    {
      WarpExchangeT(temp_storage).BlockedToStriped(items, items);
      StoreDirectStriped<LOGICAL_WARP_THREADS>(linear_tid,
                                               block_itr,
                                               items,
                                               valid_items);
    }
  };

  /// Internal store implementation to use
  using InternalStore = StoreInternal<ALGORITHM, 0>;

  /// Shared memory storage layout type
  using _TempStorage = typename InternalStore::TempStorage;

  __device__ __forceinline__ _TempStorage& PrivateStorage()
  {
    __shared__ _TempStorage private_storage;
    return private_storage;
  }

  _TempStorage &temp_storage;

  int linear_tid;

public:

  struct TempStorage : Uninitialized<_TempStorage> {};

  /*************************************************************************//**
   * @name Collective constructors
   ****************************************************************************/
  //@{

  /**
   * @brief Collective constructor using a private static allocation of shared
   *        memory as temporary storage.
   */
  __device__ __forceinline__ WarpStore()
      : temp_storage(PrivateStorage())
      , linear_tid(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS))
  {}

  /**
   * @brief Collective constructor using the specified memory allocation as
   *        temporary storage.
   */
  __device__ __forceinline__ WarpStore(TempStorage &temp_storage)
      : temp_storage(temp_storage.Alias())
      , linear_tid(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS))
  {}

  //@}  end member group
  /*************************************************************************//**
   * @name Data movement
   ****************************************************************************/
  //@{

  /**
   * @brief Store items into a linear segment of memory.
   *
   * @par
   * \smemreuse
   *
   * @par Snippet
   * @par
   * The code snippet below illustrates the storing of a "blocked" arrangement
   * of 64 integers across 16 threads (where each thread owns 4 consecutive
   * items) into a linear segment of memory. The store is specialized for
   * @p WARP_STORE_TRANSPOSE, meaning items are locally reordered among
   * threads so that memory references will be efficiently coalesced using a
   * warp-striped access pattern.
   * @code
   * #include <cub/warp/warp_store.cuh>   // or equivalently <warp_store.cuh>
   *
   * __global__ void ExampleKernel(int *d_data, ...)
   * {
   *     constexpr int warp_threads = 16;
   *     constexpr int block_threads = 256;
   *     constexpr int items_per_thread = 4;
   *
   *     // Specialize WarpStore for a virtual warp of 16 threads owning 4 integer items each
   *     using WarpStoreT = WarpStore<int,
   *                                  items_per_thread,
   *                                  WARP_STORE_TRANSPOSE,
   *                                  warp_threads>;
   *
   *     constexpr int warps_in_block = block_threads / warp_threads;
   *     constexpr int tile_size = items_per_thread * warp_threads;
   *     const int warp_id = static_cast<int>(threadIdx.x) / warp_threads;
   *
   *     // Allocate shared memory for WarpStore
   *     __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block];
   *
   *     // Obtain a segment of consecutive items that are blocked across threads
   *     int thread_data[4];
   *     ...
   *
   *     // Store items to linear memory
   *     WarpStoreT(temp_storage[warp_id]).Store(d_data + warp_id * tile_size, thread_data);
   * @endcode
   * @par
   * Suppose the set of @p thread_data across the warp threads is
   * { [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }.
   * The output @p d_data will be 0, 1, 2, 3, 4, 5, ....
   *
   * @param[out] block_itr The thread block's base output iterator for storing to
   * @param[in] items Data to store
   */
  template <typename OutputIteratorT>
  __device__ __forceinline__ void Store(OutputIteratorT block_itr,
                                        T (&items)[ITEMS_PER_THREAD])
  {
    InternalStore(temp_storage, linear_tid).Store(block_itr, items);
  }

  /**
   * @brief Store items into a linear segment of memory, guarded by range.
   *
   * @par
   * \smemreuse
   *
   * @par Snippet
   * @par
   * The code snippet below illustrates the storing of a "blocked" arrangement
   * of 64 integers across 16 threads (where each thread owns 4 consecutive
   * items) into a linear segment of memory. The store is specialized for
   * @p WARP_STORE_TRANSPOSE, meaning items are locally reordered among
   * threads so that memory references will be efficiently coalesced using a
   * warp-striped access pattern.
   * @code
   * #include <cub/warp/warp_store.cuh>   // or equivalently <warp_store.cuh>
   *
   * __global__ void ExampleKernel(int *d_data, int valid_items, ...)
   * {
   *     constexpr int warp_threads = 16;
   *     constexpr int block_threads = 256;
   *     constexpr int items_per_thread = 4;
   *
   *     // Specialize WarpStore for a virtual warp of 16 threads owning 4 integer items each
   *     using WarpStoreT = WarpStore<int,
   *                                  items_per_thread,
   *                                  WARP_STORE_TRANSPOSE,
   *                                  warp_threads>;
   *
   *     constexpr int warps_in_block = block_threads / warp_threads;
   *     constexpr int tile_size = items_per_thread * warp_threads;
   *     const int warp_id = static_cast<int>(threadIdx.x) / warp_threads;
   *
   *     // Allocate shared memory for WarpStore
   *     __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block];
   *
   *     // Obtain a segment of consecutive items that are blocked across threads
   *     int thread_data[4];
   *     ...
   *
   *     // Store items to linear memory
   *     WarpStoreT(temp_storage[warp_id]).Store(
   *       d_data + warp_id * tile_size, thread_data, valid_items);
   * @endcode
   * @par
   * Suppose the set of @p thread_data across the warp threads is
   * { [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] } and @p valid_items
   * is @p 5. The output @p d_data will be 0, 1, 2, 3, 4, ?, ?, ...,
   * with only the first two threads being unmasked to store portions of
   * valid data.
   *
   * @param[out] block_itr The thread block's base output iterator for storing to
   * @param[in] items Data to store
   * @param[in] valid_items Number of valid items to write
   */
  template <typename OutputIteratorT>
  __device__ __forceinline__ void Store(OutputIteratorT block_itr,
                                        T (&items)[ITEMS_PER_THREAD],
                                        int valid_items)
  {
    InternalStore(temp_storage, linear_tid).Store(block_itr, items, valid_items);
  }

  //@}  end member group
};

CUB_NAMESPACE_END
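
/*
 * Illustrative usage sketch. The Doxygen examples above leave the production
 * of thread_data and the kernel launch elided; the snippet below fills in
 * those pieces under stated assumptions: the ExampleStoreKernel name, the
 * lane-indexed fill of thread_data, and the <<<1, block_threads>>> launch are
 * hypothetical choices consistent with the 16-lane / 4-items-per-thread
 * configuration documented above, not an API defined by this file.
 *
 * #include <cub/warp/warp_store.cuh>
 *
 * __global__ void ExampleStoreKernel(int *d_data)
 * {
 *     constexpr int warp_threads     = 16;
 *     constexpr int block_threads    = 256;
 *     constexpr int items_per_thread = 4;
 *
 *     // One logical 16-thread warp per 64-item tile, using WARP_STORE_TRANSPOSE
 *     using WarpStoreT = cub::WarpStore<int,
 *                                       items_per_thread,
 *                                       cub::WARP_STORE_TRANSPOSE,
 *                                       warp_threads>;
 *
 *     constexpr int warps_in_block = block_threads / warp_threads;
 *     constexpr int tile_size      = items_per_thread * warp_threads;
 *     const int warp_id = static_cast<int>(threadIdx.x) / warp_threads;
 *     const int lane_id = static_cast<int>(threadIdx.x) % warp_threads;
 *
 *     // Allocate one TempStorage per logical warp in the block
 *     __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block];
 *
 *     // Each lane owns items_per_thread consecutive values of its warp's tile
 *     int thread_data[items_per_thread];
 *     for (int i = 0; i < items_per_thread; ++i)
 *     {
 *         thread_data[i] = lane_id * items_per_thread + i;
 *     }
 *
 *     // Cooperatively transpose and write the tile as a coalesced striped store
 *     WarpStoreT(temp_storage[warp_id])
 *         .Store(d_data + warp_id * tile_size, thread_data);
 * }
 *
 * // Host side: one 256-thread block writes warps_in_block tiles of 64 ints,
 * // so d_data must hold at least block_threads * items_per_thread integers.
 * // ExampleStoreKernel<<<1, block_threads>>>(d_data);
 */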