/*
 * Copyright 1993-2022 NVIDIA Corporation. All rights reserved.
 *
 * NOTICE TO LICENSEE:
 *
 * This source code and/or documentation ("Licensed Deliverables") are
 * subject to NVIDIA intellectual property rights under U.S. and
 * international Copyright laws.
 *
 * These Licensed Deliverables contained herein is PROPRIETARY and
 * CONFIDENTIAL to NVIDIA and is being provided under the terms and
 * conditions of a form of NVIDIA software license agreement by and
 * between NVIDIA and Licensee ("License Agreement") or electronically
 * accepted by Licensee.  Notwithstanding any terms or conditions to
 * the contrary in the License Agreement, reproduction or disclosure
 * of the Licensed Deliverables to any third party without the express
 * written consent of NVIDIA is prohibited.
 *
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
 * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
 * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
 * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
 * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
 * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 * OF THESE LICENSED DELIVERABLES.
 *
 * U.S. Government End Users.  These Licensed Deliverables are a
 * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 * 1995), consisting of "commercial computer software" and "commercial
 * computer software documentation" as such terms are used in 48
 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
 * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
 * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 * U.S. Government End Users acquire the Licensed Deliverables with
 * only those rights set forth herein.
 *
 * Any use of the Licensed Deliverables in individual and commercial
 * software must include, in the user documentation and internal
 * comments to the code, the above Disclaimer and U.S. Government End
 * Users Notice.
 */
#pragma once

#ifndef CUBLASAPI
#ifdef __CUDACC__
#define CUBLASAPI __host__ __device__
#else
#define CUBLASAPI
#endif
#endif

#include <cublas_api.h>

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */

/** Opaque structure holding CUBLASLT context
 */
typedef struct cublasLtContext* cublasLtHandle_t;

cublasStatus_t CUBLASWINAPI cublasLtCreate(cublasLtHandle_t* lightHandle);

cublasStatus_t CUBLASWINAPI cublasLtDestroy(cublasLtHandle_t lightHandle);

const char* CUBLASWINAPI cublasLtGetStatusName(cublasStatus_t status);

const char* CUBLASWINAPI cublasLtGetStatusString(cublasStatus_t status);

size_t CUBLASWINAPI cublasLtGetVersion(void);

size_t CUBLASWINAPI cublasLtGetCudartVersion(void);

cublasStatus_t CUBLASWINAPI cublasLtGetProperty(libraryPropertyType type, int* value);

cublasStatus_t CUBLASWINAPI cublasLtHeuristicsCacheGetCapacity(size_t* capacity);
cublasStatus_t CUBLASWINAPI cublasLtHeuristicsCacheSetCapacity(size_t capacity);

/** Restricts usage of CPU instructions (ISA) specified by the flags in the mask.
 *
 * Flags can be combined with bitwise OR(|) operator. Supported flags:
 * - 0x1 -- x86-64 AVX512 ISA
 *
 * Default mask: 0 (any applicable ISA is allowed).
 *
 * The function returns the previous value of the mask.
 * The function takes precedence over the environment variable CUBLASLT_DISABLE_CPU_INSTRUCTIONS_MASK.
 */
unsigned CUBLASWINAPI cublasLtDisableCpuInstructionsSetMask(unsigned mask);

/** Semi-opaque descriptor for matrix memory layout
 */
typedef struct {
  uint64_t data[8];
} cublasLtMatrixLayoutOpaque_t;

/** Opaque descriptor for matrix memory layout
 */
typedef cublasLtMatrixLayoutOpaque_t* cublasLtMatrixLayout_t;

/** Semi-opaque algorithm descriptor (to avoid complicated alloc/free schemes)
 *
 * This structure can be trivially serialized and later restored for use with the same version of cuBLAS library to save
 * on selecting the right configuration again.
 */
typedef struct {
  uint64_t data[8];
} cublasLtMatmulAlgo_t;

/** Semi-opaque descriptor for cublasLtMatmul() operation details
 */
typedef struct {
  uint64_t data[32];
} cublasLtMatmulDescOpaque_t;

/** Opaque descriptor for cublasLtMatmul() operation details
 */
typedef cublasLtMatmulDescOpaque_t* cublasLtMatmulDesc_t;

/** Semi-opaque descriptor for cublasLtMatrixTransform() operation details
 */
typedef struct {
  uint64_t data[8];
} cublasLtMatrixTransformDescOpaque_t;

/** Opaque descriptor for cublasLtMatrixTransform() operation details
 */
typedef cublasLtMatrixTransformDescOpaque_t* cublasLtMatrixTransformDesc_t;

/** Semi-opaque descriptor for cublasLtMatmulPreference() operation details
 */
typedef struct {
  uint64_t data[8];
} cublasLtMatmulPreferenceOpaque_t;

/** Opaque descriptor for cublasLtMatmulAlgoGetHeuristic() configuration
 */
typedef cublasLtMatmulPreferenceOpaque_t* cublasLtMatmulPreference_t;

/** Tile size (in C/D matrix Rows x Cols)
 *
 * General order of tile IDs is sorted by size first and by first dimension second.
 */
typedef enum {
  CUBLASLT_MATMUL_TILE_UNDEFINED = 0,
  CUBLASLT_MATMUL_TILE_8x8 = 1,
  CUBLASLT_MATMUL_TILE_8x16 = 2,
  CUBLASLT_MATMUL_TILE_16x8 = 3,
  CUBLASLT_MATMUL_TILE_8x32 = 4,
  CUBLASLT_MATMUL_TILE_16x16 = 5,
  CUBLASLT_MATMUL_TILE_32x8 = 6,
  CUBLASLT_MATMUL_TILE_8x64 = 7,
  CUBLASLT_MATMUL_TILE_16x32 = 8,
  CUBLASLT_MATMUL_TILE_32x16 = 9,
  CUBLASLT_MATMUL_TILE_64x8 = 10,
  CUBLASLT_MATMUL_TILE_32x32 = 11,
  CUBLASLT_MATMUL_TILE_32x64 = 12,
  CUBLASLT_MATMUL_TILE_64x32 = 13,
  CUBLASLT_MATMUL_TILE_32x128 = 14,
  CUBLASLT_MATMUL_TILE_64x64 = 15,
  CUBLASLT_MATMUL_TILE_128x32 = 16,
  CUBLASLT_MATMUL_TILE_64x128 = 17,
  CUBLASLT_MATMUL_TILE_128x64 = 18,
  CUBLASLT_MATMUL_TILE_64x256 = 19,
  CUBLASLT_MATMUL_TILE_128x128 = 20,
  CUBLASLT_MATMUL_TILE_256x64 = 21,
  CUBLASLT_MATMUL_TILE_64x512 = 22,
  CUBLASLT_MATMUL_TILE_128x256 = 23,
  CUBLASLT_MATMUL_TILE_256x128 = 24,
  CUBLASLT_MATMUL_TILE_512x64 = 25,
  CUBLASLT_MATMUL_TILE_64x96 = 26,
  CUBLASLT_MATMUL_TILE_96x64 = 27,
  CUBLASLT_MATMUL_TILE_96x128 = 28,
  CUBLASLT_MATMUL_TILE_128x160 = 29,
  CUBLASLT_MATMUL_TILE_160x128 = 30,
  CUBLASLT_MATMUL_TILE_192x128 = 31,
  CUBLASLT_MATMUL_TILE_128x192 = 32,
  CUBLASLT_MATMUL_TILE_128x96 = 33,
  CUBLASLT_MATMUL_TILE_32x256 = 34,
  CUBLASLT_MATMUL_TILE_256x32 = 35,
  CUBLASLT_MATMUL_TILE_END
} cublasLtMatmulTile_t;

/** Size and number of stages in which elements are read into shared memory
 *
 * General order of stages IDs is sorted by stage size first and by number of stages second.
 */
typedef enum {
  CUBLASLT_MATMUL_STAGES_UNDEFINED = 0,
  CUBLASLT_MATMUL_STAGES_16x1 = 1,
  CUBLASLT_MATMUL_STAGES_16x2 = 2,
  CUBLASLT_MATMUL_STAGES_16x3 = 3,
  CUBLASLT_MATMUL_STAGES_16x4 = 4,
  CUBLASLT_MATMUL_STAGES_16x5 = 5,
  CUBLASLT_MATMUL_STAGES_16x6 = 6,
  CUBLASLT_MATMUL_STAGES_32x1 = 7,
  CUBLASLT_MATMUL_STAGES_32x2 = 8,
  CUBLASLT_MATMUL_STAGES_32x3 = 9,
  CUBLASLT_MATMUL_STAGES_32x4 = 10,
  CUBLASLT_MATMUL_STAGES_32x5 = 11,
  CUBLASLT_MATMUL_STAGES_32x6 = 12,
  CUBLASLT_MATMUL_STAGES_64x1 = 13,
  CUBLASLT_MATMUL_STAGES_64x2 = 14,
  CUBLASLT_MATMUL_STAGES_64x3 = 15,
  CUBLASLT_MATMUL_STAGES_64x4 = 16,
  CUBLASLT_MATMUL_STAGES_64x5 = 17,
  CUBLASLT_MATMUL_STAGES_64x6 = 18,
  CUBLASLT_MATMUL_STAGES_128x1 = 19,
  CUBLASLT_MATMUL_STAGES_128x2 = 20,
  CUBLASLT_MATMUL_STAGES_128x3 = 21,
  CUBLASLT_MATMUL_STAGES_128x4 = 22,
  CUBLASLT_MATMUL_STAGES_128x5 = 23,
  CUBLASLT_MATMUL_STAGES_128x6 = 24,
  CUBLASLT_MATMUL_STAGES_32x10 = 25,
  CUBLASLT_MATMUL_STAGES_8x4 = 26,
  CUBLASLT_MATMUL_STAGES_16x10 = 27,
  CUBLASLT_MATMUL_STAGES_8x5 = 28,
  CUBLASLT_MATMUL_STAGES_8x3 = 31,
  CUBLASLT_MATMUL_STAGES_8xAUTO = 32,
  CUBLASLT_MATMUL_STAGES_16xAUTO = 33,
  CUBLASLT_MATMUL_STAGES_32xAUTO = 34,
  CUBLASLT_MATMUL_STAGES_64xAUTO = 35,
  CUBLASLT_MATMUL_STAGES_128xAUTO = 36,
  CUBLASLT_MATMUL_STAGES_END
} cublasLtMatmulStages_t;

/** Thread Block Cluster size
 *
 * Typically dimensioned similar to cublasLtMatmulTile_t, with the third coordinate unused at this time.
 */
typedef enum {
  /** Let library pick cluster shape automatically */
  CUBLASLT_CLUSTER_SHAPE_AUTO = 0,
  CUBLASLT_CLUSTER_SHAPE_1x1x1 = 2,
  CUBLASLT_CLUSTER_SHAPE_2x1x1 = 3,
  CUBLASLT_CLUSTER_SHAPE_4x1x1 = 4,
  CUBLASLT_CLUSTER_SHAPE_1x2x1 = 5,
  CUBLASLT_CLUSTER_SHAPE_2x2x1 = 6,
  CUBLASLT_CLUSTER_SHAPE_4x2x1 = 7,
  CUBLASLT_CLUSTER_SHAPE_1x4x1 = 8,
  CUBLASLT_CLUSTER_SHAPE_2x4x1 = 9,
  CUBLASLT_CLUSTER_SHAPE_4x4x1 = 10,
  CUBLASLT_CLUSTER_SHAPE_8x1x1 = 11,
  CUBLASLT_CLUSTER_SHAPE_1x8x1 = 12,
  CUBLASLT_CLUSTER_SHAPE_8x2x1 = 13,
  CUBLASLT_CLUSTER_SHAPE_2x8x1 = 14,
  CUBLASLT_CLUSTER_SHAPE_16x1x1 = 15,
  CUBLASLT_CLUSTER_SHAPE_1x16x1 = 16,
  CUBLASLT_CLUSTER_SHAPE_3x1x1 = 17,
  CUBLASLT_CLUSTER_SHAPE_5x1x1 = 18,
  CUBLASLT_CLUSTER_SHAPE_6x1x1 = 19,
  CUBLASLT_CLUSTER_SHAPE_7x1x1 = 20,
  CUBLASLT_CLUSTER_SHAPE_9x1x1 = 21,
  CUBLASLT_CLUSTER_SHAPE_10x1x1 = 22,
  CUBLASLT_CLUSTER_SHAPE_11x1x1 = 23,
  CUBLASLT_CLUSTER_SHAPE_12x1x1 = 24,
  CUBLASLT_CLUSTER_SHAPE_13x1x1 = 25,
  CUBLASLT_CLUSTER_SHAPE_14x1x1 = 26,
  CUBLASLT_CLUSTER_SHAPE_15x1x1 = 27,
  CUBLASLT_CLUSTER_SHAPE_3x2x1 = 28,
  CUBLASLT_CLUSTER_SHAPE_5x2x1 = 29,
  CUBLASLT_CLUSTER_SHAPE_6x2x1 = 30,
  CUBLASLT_CLUSTER_SHAPE_7x2x1 = 31,
  CUBLASLT_CLUSTER_SHAPE_1x3x1 = 32,
  CUBLASLT_CLUSTER_SHAPE_2x3x1 = 33,
  CUBLASLT_CLUSTER_SHAPE_3x3x1 = 34,
  CUBLASLT_CLUSTER_SHAPE_4x3x1 = 35,
  CUBLASLT_CLUSTER_SHAPE_5x3x1 = 36,
  CUBLASLT_CLUSTER_SHAPE_3x4x1 = 37,
  CUBLASLT_CLUSTER_SHAPE_1x5x1 = 38,
  CUBLASLT_CLUSTER_SHAPE_2x5x1 = 39,
  CUBLASLT_CLUSTER_SHAPE_3x5x1 = 40,
  CUBLASLT_CLUSTER_SHAPE_1x6x1 = 41,
  CUBLASLT_CLUSTER_SHAPE_2x6x1 = 42,
  CUBLASLT_CLUSTER_SHAPE_1x7x1 = 43,
  CUBLASLT_CLUSTER_SHAPE_2x7x1 = 44,
  CUBLASLT_CLUSTER_SHAPE_1x9x1 = 45,
  CUBLASLT_CLUSTER_SHAPE_1x10x1 = 46,
  CUBLASLT_CLUSTER_SHAPE_1x11x1 = 47,
  CUBLASLT_CLUSTER_SHAPE_1x12x1 = 48,
  CUBLASLT_CLUSTER_SHAPE_1x13x1 = 49,
  CUBLASLT_CLUSTER_SHAPE_1x14x1 = 50,
  CUBLASLT_CLUSTER_SHAPE_1x15x1 = 51,
  CUBLASLT_CLUSTER_SHAPE_END
} cublasLtClusterShape_t;

/** Inner size of the kernel
 *
 * Represents various aspects of internal kernel design, that don't impact CUDA grid size but may have other more subtle
 * effects.
 *
 */
typedef enum {
  CUBLASLT_MATMUL_INNER_SHAPE_UNDEFINED = 0,
  CUBLASLT_MATMUL_INNER_SHAPE_MMA884 = 1,
  CUBLASLT_MATMUL_INNER_SHAPE_MMA1684 = 2,
  CUBLASLT_MATMUL_INNER_SHAPE_MMA1688 = 3,
  CUBLASLT_MATMUL_INNER_SHAPE_MMA16816 = 4,
  CUBLASLT_MATMUL_INNER_SHAPE_END
} cublasLtMatmulInnerShape_t;

/** Pointer mode to use for alpha/beta */
typedef enum {
  /** matches CUBLAS_POINTER_MODE_HOST, pointer targets a single value host memory */
  CUBLASLT_POINTER_MODE_HOST = CUBLAS_POINTER_MODE_HOST,
  /** matches CUBLAS_POINTER_MODE_DEVICE, pointer targets a single value device memory */
  CUBLASLT_POINTER_MODE_DEVICE = CUBLAS_POINTER_MODE_DEVICE,
  /** pointer targets an array in device memory */
  CUBLASLT_POINTER_MODE_DEVICE_VECTOR = 2,
  /** alpha pointer targets an array in device memory, beta is zero. Note:
     CUBLASLT_MATMUL_DESC_ALPHA_VECTOR_BATCH_STRIDE is not supported, must be 0. */
  CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO = 3,
  /** alpha pointer targets an array in device memory, beta is a single value in host memory. */
  CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_HOST = 4,
} cublasLtPointerMode_t;

/** Mask to define pointer mode capability */
typedef enum {
  /** see CUBLASLT_POINTER_MODE_HOST */
  CUBLASLT_POINTER_MODE_MASK_HOST = 1,
  /** see CUBLASLT_POINTER_MODE_DEVICE */
  CUBLASLT_POINTER_MODE_MASK_DEVICE = 2,
  /** see CUBLASLT_POINTER_MODE_DEVICE_VECTOR */
  CUBLASLT_POINTER_MODE_MASK_DEVICE_VECTOR = 4,
  /** see CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO */
  CUBLASLT_POINTER_MODE_MASK_ALPHA_DEVICE_VECTOR_BETA_ZERO = 8,
  /** see CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_HOST */
  CUBLASLT_POINTER_MODE_MASK_ALPHA_DEVICE_VECTOR_BETA_HOST = 16,
} cublasLtPointerModeMask_t;

/** Implementation details that may affect numerical behavior of algorithms. */
#define CUBLASLT_NUMERICAL_IMPL_FLAGS_FMA (0x01ull << 0)
#define CUBLASLT_NUMERICAL_IMPL_FLAGS_HMMA (0x02ull << 0)
#define CUBLASLT_NUMERICAL_IMPL_FLAGS_IMMA (0x04ull << 0)
#define CUBLASLT_NUMERICAL_IMPL_FLAGS_DMMA (0x08ull << 0)
#define CUBLASLT_NUMERICAL_IMPL_FLAGS_TENSOR_OP_MASK (0xfeull << 0)
#define CUBLASLT_NUMERICAL_IMPL_FLAGS_OP_TYPE_MASK (0xffull << 0)

#define CUBLASLT_NUMERICAL_IMPL_FLAGS_ACCUMULATOR_16F (0x01ull << 8)
#define CUBLASLT_NUMERICAL_IMPL_FLAGS_ACCUMULATOR_32F (0x02ull << 8)
#define CUBLASLT_NUMERICAL_IMPL_FLAGS_ACCUMULATOR_64F (0x04ull << 8)
#define CUBLASLT_NUMERICAL_IMPL_FLAGS_ACCUMULATOR_32I (0x08ull << 8)
#define CUBLASLT_NUMERICAL_IMPL_FLAGS_ACCUMULATOR_TYPE_MASK (0xffull << 8)

#define CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_16F (0x01ull << 16)
#define CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_16BF (0x02ull << 16)
#define CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_TF32 (0x04ull << 16)
#define CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_32F (0x08ull << 16)
#define CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_64F (0x10ull << 16)
#define CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_8I (0x20ull << 16)
#define CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_8F_E4M3 (0x40ull << 16)
#define CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_8F_E5M2 (0x80ull << 16)
#define CUBLASLT_NUMERICAL_IMPL_FLAGS_OP_INPUT_TYPE_MASK (0xffull << 16)

#define CUBLASLT_NUMERICAL_IMPL_FLAGS_GAUSSIAN (0x01ull << 32)
typedef uint64_t cublasLtNumericalImplFlags_t;

/** Execute matrix multiplication (D = alpha * op(A) * op(B) + beta * C).
 *
 * \retval     CUBLAS_STATUS_NOT_INITIALIZED   if cuBLASLt handle has not been initialized
 * \retval     CUBLAS_STATUS_INVALID_VALUE     if parameters are in conflict or in an impossible configuration; e.g.
 *                                             when workspaceSizeInBytes is less than workspace required by configured
 *                                             algo
 * \retval     CUBLAS_STATUS_NOT_SUPPORTED     if current implementation on selected device doesn't support configured
 *                                             operation
 * \retval     CUBLAS_STATUS_ARCH_MISMATCH     if configured operation cannot be run using selected device
 * \retval     CUBLAS_STATUS_EXECUTION_FAILED  if cuda reported execution error from the device
 * \retval     CUBLAS_STATUS_SUCCESS           if the operation completed successfully
 */
cublasStatus_t CUBLASWINAPI cublasLtMatmul(cublasLtHandle_t lightHandle,
                                           cublasLtMatmulDesc_t computeDesc,
                                           const void* alpha, /* host or device pointer */
                                           const void* A,
                                           cublasLtMatrixLayout_t Adesc,
                                           const void* B,
                                           cublasLtMatrixLayout_t Bdesc,
                                           const void* beta, /* host or device pointer */
                                           const void* C,
                                           cublasLtMatrixLayout_t Cdesc,
                                           void* D,
                                           cublasLtMatrixLayout_t Ddesc,
                                           const cublasLtMatmulAlgo_t* algo,
                                           void* workspace,
                                           size_t workspaceSizeInBytes,
                                           cudaStream_t stream);

/** Matrix layout conversion helper (C = alpha * op(A) + beta * op(B))
 *
 * Can be used to change memory order of data or to scale and shift the values.
 *
 * \retval     CUBLAS_STATUS_NOT_INITIALIZED   if cuBLASLt handle has not been initialized
 * \retval     CUBLAS_STATUS_INVALID_VALUE     if parameters are in conflict or in an impossible configuration; e.g.
 *                                             when A is not NULL, but Adesc is NULL
 * \retval     CUBLAS_STATUS_NOT_SUPPORTED     if current implementation on selected device doesn't support configured
 *                                             operation
 * \retval     CUBLAS_STATUS_ARCH_MISMATCH     if configured operation cannot be run using selected device
 * \retval     CUBLAS_STATUS_EXECUTION_FAILED  if cuda reported execution error from the device
 * \retval     CUBLAS_STATUS_SUCCESS           if the operation completed successfully
 */
cublasStatus_t CUBLASWINAPI cublasLtMatrixTransform(cublasLtHandle_t lightHandle,
                                                    cublasLtMatrixTransformDesc_t transformDesc,
                                                    const void* alpha, /* host or device pointer */
                                                    const void* A,
                                                    cublasLtMatrixLayout_t Adesc,
                                                    const void* beta, /* host or device pointer */
                                                    const void* B,
                                                    cublasLtMatrixLayout_t Bdesc,
                                                    void* C,
                                                    cublasLtMatrixLayout_t Cdesc,
                                                    cudaStream_t stream);

/* ---------------------------------------------------------------------------------------*/
/* Helper functions for cublasLtMatrixLayout_t */
/* ---------------------------------------------------------------------------------------*/

/** Enum for data ordering */
typedef enum {
  /** Column-major
   *
   * Leading dimension is the stride (in elements) to the beginning of next column in memory.
   */
  CUBLASLT_ORDER_COL = 0,
  /** Row major
   *
   * Leading dimension is the stride (in elements) to the beginning of next row in memory.
   */
  CUBLASLT_ORDER_ROW = 1,
  /** Column-major ordered tiles of 32 columns.
   *
   * Leading dimension is the stride (in elements) to the beginning of next group of 32-columns. E.g. if matrix has 33
   * columns and 2 rows, ld must be at least (32) * 2 = 64.
   */
  CUBLASLT_ORDER_COL32 = 2,
  /** Column-major ordered tiles of composite tiles with total 32 columns and 8 rows, tile composed of interleaved
   * inner tiles of 4 columns within 4 even or odd rows in an alternating pattern.
   *
   * Leading dimension is the stride (in elements) to the beginning of the first 32 column x 8 row tile for the next
   * 32-wide group of columns. E.g. if matrix has 33 columns and 1 row, ld must be at least (32 * 8) * 1 = 256.
   */
  CUBLASLT_ORDER_COL4_4R2_8C = 3,
  /** Column-major ordered tiles of composite tiles with total 32 columns ands 32 rows.
   * Element offset within the tile is calculated as (((row%8)/2*4+row/8)*2+row%2)*32+col.
   *
   * Leading dimension is the stride (in elements) to the beginning of the first 32 column x 32 row tile for the next
   * 32-wide group of columns. E.g. if matrix has 33 columns and 1 row, ld must be at least (32*32)*1 = 1024.
   */
  CUBLASLT_ORDER_COL32_2R_4R4 = 4,

} cublasLtOrder_t;

/** Attributes of memory layout */
typedef enum {
  /** Data type, see cudaDataType.
   *
   * uint32_t
   */
  CUBLASLT_MATRIX_LAYOUT_TYPE = 0,

  /** Memory order of the data, see cublasLtOrder_t.
   *
   * int32_t, default: CUBLASLT_ORDER_COL
   */
  CUBLASLT_MATRIX_LAYOUT_ORDER = 1,

  /** Number of rows.
   *
   * Usually only values that can be expressed as int32_t are supported.
   *
   * uint64_t
   */
  CUBLASLT_MATRIX_LAYOUT_ROWS = 2,

  /** Number of columns.
   *
   * Usually only values that can be expressed as int32_t are supported.
   *
   * uint64_t
   */
  CUBLASLT_MATRIX_LAYOUT_COLS = 3,

  /** Matrix leading dimension.
   *
   * For CUBLASLT_ORDER_COL this is stride (in elements) of matrix column, for more details and documentation for
   * other memory orders see documentation for cublasLtOrder_t values.
   *
   * Currently only non-negative values are supported, must be large enough so that matrix memory locations are not
   * overlapping (e.g. greater or equal to CUBLASLT_MATRIX_LAYOUT_ROWS in case of CUBLASLT_ORDER_COL).
   *
   * int64_t;
   */
  CUBLASLT_MATRIX_LAYOUT_LD = 4,

  /** Number of matmul operations to perform in the batch.
   *
   * See also CUBLASLT_ALGO_CAP_STRIDED_BATCH_SUPPORT
   *
   * int32_t, default: 1
   */
  CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT = 5,

  /** Stride (in elements) to the next matrix for strided batch operation.
   *
   * When matrix type is planar-complex (CUBLASLT_MATRIX_LAYOUT_PLANE_OFFSET != 0), batch stride
   * is interpreted by cublasLtMatmul() in number of real valued sub-elements. E.g. for data of type CUDA_C_16F,
   * offset of 1024B is encoded as a stride of value 512 (since each element of the real and imaginary matrices
   * is a 2B (16bit) floating point type).
   *
   * NOTE: A bug in cublasLtMatrixTransform() causes it to interpret the batch stride for a planar-complex matrix
   * as if it was specified in number of complex elements. Therefore an offset of 1024B must be encoded as stride
   * value 256 when calling cublasLtMatrixTransform() (each complex element is 4B with real and imaginary values 2B
   * each). This behavior is expected to be corrected in the next major cuBLAS version.
   *
   * int64_t, default: 0
   */
  CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET = 6,

  /** Stride (in bytes) to the imaginary plane for planar complex layout.
   *
   * int64_t, default: 0 - 0 means that layout is regular (real and imaginary parts of complex numbers are interleaved
   * in memory in each element)
   */
  CUBLASLT_MATRIX_LAYOUT_PLANE_OFFSET = 7,
} cublasLtMatrixLayoutAttribute_t;

/** Internal. Do not use directly.
 */
cublasStatus_t CUBLASWINAPI cublasLtMatrixLayoutInit_internal(  //
    cublasLtMatrixLayout_t matLayout,
    size_t size,
    cudaDataType type,
    uint64_t rows,
    uint64_t cols,
    int64_t ld);

/** Initialize matrix layout descriptor in pre-allocated space.
 *
 * \retval     CUBLAS_STATUS_ALLOC_FAILED  if size of the pre-allocated space is insufficient
 * \retval     CUBLAS_STATUS_SUCCESS       if desciptor was created successfully
 */
static inline cublasStatus_t cublasLtMatrixLayoutInit(
    cublasLtMatrixLayout_t matLayout, cudaDataType type, uint64_t rows, uint64_t cols, int64_t ld) {
  return cublasLtMatrixLayoutInit_internal(matLayout, sizeof(*matLayout), type, rows, cols, ld);
}

/** Create new matrix layout descriptor.
 *
 * \retval     CUBLAS_STATUS_ALLOC_FAILED  if memory could not be allocated
 * \retval     CUBLAS_STATUS_SUCCESS       if desciptor was created successfully
 */
cublasStatus_t CUBLASWINAPI cublasLtMatrixLayoutCreate(  //
    cublasLtMatrixLayout_t* matLayout,
    cudaDataType type,
    uint64_t rows,
    uint64_t cols,
    int64_t ld);

/** Destroy matrix layout descriptor.
 *
 * \retval     CUBLAS_STATUS_SUCCESS  if operation was successful
 */
cublasStatus_t CUBLASWINAPI cublasLtMatrixLayoutDestroy(cublasLtMatrixLayout_t matLayout);

/** Set matrix layout descriptor attribute.
 *
 * \param[in]  matLayout    The descriptor
 * \param[in]  attr         The attribute
 * \param[in]  buf          memory address containing the new value
 * \param[in]  sizeInBytes  size of buf buffer for verification (in bytes)
 *
 * \retval     CUBLAS_STATUS_INVALID_VALUE  if buf is NULL or sizeInBytes doesn't match size of internal storage for
 *                                          selected attribute
 * \retval     CUBLAS_STATUS_SUCCESS        if attribute was set successfully
 */
cublasStatus_t CUBLASWINAPI cublasLtMatrixLayoutSetAttribute(  //
    cublasLtMatrixLayout_t matLayout,
    cublasLtMatrixLayoutAttribute_t attr,
    const void* buf,
    size_t sizeInBytes);

/** Get matrix layout descriptor attribute.
 *
 * \param[in]  matLayout    The descriptor
 * \param[in]  attr         The attribute
 * \param[out] buf          memory address containing the new value
 * \param[in]  sizeInBytes  size of buf buffer for verification (in bytes)
 * \param[out] sizeWritten  only valid when return value is CUBLAS_STATUS_SUCCESS. If sizeInBytes is non-zero: number of
 *                          bytes actually written, if sizeInBytes is 0: number of bytes needed to write full contents
 *
 * \retval     CUBLAS_STATUS_INVALID_VALUE  if sizeInBytes is 0 and sizeWritten is NULL, or if  sizeInBytes is non-zero
 *                                          and buf is NULL or sizeInBytes doesn't match size of internal storage for
 *                                          selected attribute
 * \retval     CUBLAS_STATUS_SUCCESS        if attribute's value was successfully written to user memory
 */
cublasStatus_t CUBLASWINAPI cublasLtMatrixLayoutGetAttribute(  //
    cublasLtMatrixLayout_t matLayout,
    cublasLtMatrixLayoutAttribute_t attr,
    void* buf,
    size_t sizeInBytes,
    size_t* sizeWritten);

/* ---------------------------------------------------------------------------------------*/
/* Helper functions for cublasLtMatmulDesc_t */
/* ---------------------------------------------------------------------------------------*/

/** Matmul descriptor attributes to define details of the operation. */
typedef enum {
  /** Compute type, see cudaDataType. Defines data type used for multiply and accumulate operations and the
   * accumulator during matrix multiplication.
   *
   * int32_t
   */
  CUBLASLT_MATMUL_DESC_COMPUTE_TYPE = 0,

  /** Scale type, see cudaDataType. Defines data type of alpha and beta. Accumulator and value from matrix C are
   * typically converted to scale type before final scaling. Value is then converted from scale type to type of matrix
   * D before being stored in memory.
   *
   * int32_t, default: same as CUBLASLT_MATMUL_DESC_COMPUTE_TYPE
   */
  CUBLASLT_MATMUL_DESC_SCALE_TYPE = 1,

  /** Pointer mode of alpha and beta, see cublasLtPointerMode_t. When CUBLASLT_POINTER_MODE_DEVICE_VECTOR is in use,
   * alpha/beta vector lenghts must match number of output matrix rows.
   *
   * int32_t, default: CUBLASLT_POINTER_MODE_HOST
   */
  CUBLASLT_MATMUL_DESC_POINTER_MODE = 2,

  /** Transform of matrix A, see cublasOperation_t.
   *
   * int32_t, default: CUBLAS_OP_N
   */
  CUBLASLT_MATMUL_DESC_TRANSA = 3,

  /** Transform of matrix B, see cublasOperation_t.
   *
   * int32_t, default: CUBLAS_OP_N
   */
  CUBLASLT_MATMUL_DESC_TRANSB = 4,

  /** Transform of matrix C, see cublasOperation_t.
   *
   * Currently only CUBLAS_OP_N is supported.
   *
   * int32_t, default: CUBLAS_OP_N
   */
  CUBLASLT_MATMUL_DESC_TRANSC = 5,

  /** Matrix fill mode, see cublasFillMode_t.
   *
   * int32_t, default: CUBLAS_FILL_MODE_FULL
   */
  CUBLASLT_MATMUL_DESC_FILL_MODE = 6,

  /** Epilogue function, see cublasLtEpilogue_t.
   *
   * uint32_t, default: CUBLASLT_EPILOGUE_DEFAULT
   */
  CUBLASLT_MATMUL_DESC_EPILOGUE = 7,

  /** Bias or bias gradient vector pointer in the device memory.
   *
   * Bias case. See CUBLASLT_EPILOGUE_BIAS.
   * For bias data type see CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE.
   *
   * Bias vector length must match matrix D rows count.
   *
   * Bias gradient case. See CUBLASLT_EPILOGUE_DRELU_BGRAD and CUBLASLT_EPILOGUE_DGELU_BGRAD.
   * Bias gradient vector elements are the same type as the output elements
   * (Ctype) with the exception of IMMA kernels (see above).
   *
   * Routines that don't dereference this pointer, like cublasLtMatmulAlgoGetHeuristic()
   * depend on its value to determine expected pointer alignment.
   *
   * Bias case: const void *, default: NULL
   * Bias gradient case: void *, default: NULL
   */
  CUBLASLT_MATMUL_DESC_BIAS_POINTER = 8,

  /** Batch stride for bias or bias gradient vector.
   *
   * Used together with CUBLASLT_MATMUL_DESC_BIAS_POINTER when matrix D's CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT > 1.
   *
   * int64_t, default: 0
   */
  CUBLASLT_MATMUL_DESC_BIAS_BATCH_STRIDE = 10,

  /** Pointer for epilogue auxiliary buffer.
   *
   * - Output vector for ReLu bit-mask in forward pass when CUBLASLT_EPILOGUE_RELU_AUX
   *   or CUBLASLT_EPILOGUE_RELU_AUX_BIAS epilogue is used.
   * - Input vector for ReLu bit-mask in backward pass when
   *   CUBLASLT_EPILOGUE_DRELU_BGRAD epilogue is used.
   *
   * - Output of GELU input matrix in forward pass when
   *   CUBLASLT_EPILOGUE_GELU_AUX_BIAS epilogue is used.
   * - Input of GELU input matrix for backward pass when
   *   CUBLASLT_EPILOGUE_DGELU_BGRAD epilogue is used.
   *
   * For aux data type see CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_DATA_TYPE.
   *
   * Routines that don't dereference this pointer, like cublasLtMatmulAlgoGetHeuristic()
   * depend on its value to determine expected pointer alignment.
   *
   * Requires setting CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD attribute.
   *
   * Forward pass: void *, default: NULL
   * Backward pass: const void *, default: NULL
   */
  CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER = 11,

  /** Leading dimension for epilogue auxiliary buffer.
   *
   * - ReLu bit-mask matrix leading dimension in elements (i.e. bits)
   *   when CUBLASLT_EPILOGUE_RELU_AUX, CUBLASLT_EPILOGUE_RELU_AUX_BIAS or CUBLASLT_EPILOGUE_DRELU_BGRAD epilogue is
   * used. Must be divisible by 128 and be no less than the number of rows in the output matrix.
   *
   * - GELU input matrix leading dimension in elements
   *   when CUBLASLT_EPILOGUE_GELU_AUX_BIAS or CUBLASLT_EPILOGUE_DGELU_BGRAD epilogue used.
   *   Must be divisible by 8 and be no less than the number of rows in the output matrix.
   *
   * int64_t, default: 0
   */
  CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD = 12,

  /** Batch stride for epilogue auxiliary buffer.
   *
   * - ReLu bit-mask matrix batch stride in elements (i.e. bits)
   *   when CUBLASLT_EPILOGUE_RELU_AUX, CUBLASLT_EPILOGUE_RELU_AUX_BIAS or CUBLASLT_EPILOGUE_DRELU_BGRAD epilogue is
   * used. Must be divisible by 128.
   *
   * - GELU input matrix batch stride in elements
   *   when CUBLASLT_EPILOGUE_GELU_AUX_BIAS or CUBLASLT_EPILOGUE_DGELU_BGRAD epilogue used.
   *   Must be divisible by 8.
   *
   * int64_t, default: 0
   */
  CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_BATCH_STRIDE = 13,

  /** Batch stride for alpha vector.
   *
   * Used together with CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_HOST when matrix D's
   * CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT > 1. If CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO is set then
   * CUBLASLT_MATMUL_DESC_ALPHA_VECTOR_BATCH_STRIDE must be set to 0 as this mode doesnt supported batched alpha vector.
   *
   * int64_t, default: 0
   */
  CUBLASLT_MATMUL_DESC_ALPHA_VECTOR_BATCH_STRIDE = 14,

  /** Number of SMs to target for parallel execution. Optimizes heuristics for execution on a different number of SMs
   *  when user expects a concurrent stream to be using some of the device resources.
   *
   *  int32_t, default: 0 - use the number reported by the device.
   */
  CUBLASLT_MATMUL_DESC_SM_COUNT_TARGET = 15,

  /** Device pointer to the scale factor value that converts data in matrix A to the compute data type range.
   *
   *  The scaling factor value must have the same type as the compute type.
   *
   *  If not specified, or set to NULL, the scaling factor is assumed to be 1.
   *
   *  If set for an unsupported matrix data, scale, and compute type combination, calling cublasLtMatmul()
   *  will return CUBLAS_INVALID_VALUE.
   *
   *  const void *, default: NULL
   */
  CUBLASLT_MATMUL_DESC_A_SCALE_POINTER = 17,

  /** Device pointer to the scale factor value to convert data in matrix B to compute data type range.
   *
   *  The scaling factor value must have the same type as the compute type.
   *
   *  If not specified, or set to NULL, the scaling factor is assumed to be 1.
   *
   *  If set for an unsupported matrix data, scale, and compute type combination, calling cublasLtMatmul()
   *  will return CUBLAS_INVALID_VALUE.
   *
   *  const void *, default: NULL
   */
  CUBLASLT_MATMUL_DESC_B_SCALE_POINTER = 18,

  /** Device pointer to the scale factor value to convert data in matrix C to compute data type range.
   *
   *  The scaling factor value must have the same type as the compute type.
   *
   *  If not specified, or set to NULL, the scaling factor is assumed to be 1.
   *
   *  If set for an unsupported matrix data, scale, and compute type combination, calling cublasLtMatmul()
   *  will return CUBLAS_INVALID_VALUE.
   *
   *  const void *, default: NULL
   */
  CUBLASLT_MATMUL_DESC_C_SCALE_POINTER = 19,

  /** Device pointer to the scale factor value to convert data in matrix D to compute data type range.
   *
   *  The scaling factor value must have the same type as the compute type.
   *
   *  If not specified, or set to NULL, the scaling factor is assumed to be 1.
   *
   *  If set for an unsupported matrix data, scale, and compute type combination, calling cublasLtMatmul()
   *  will return CUBLAS_INVALID_VALUE.
   *
   *  const void *, default: NULL
   */
  CUBLASLT_MATMUL_DESC_D_SCALE_POINTER = 20,

  /** Device pointer to the memory location that on completion will be set to the maximum of absolute values in the
   *  output matrix.
   *
   *  The computed value has the same type as the compute type.
   *
   *  If not specified or set to NULL, the maximum absolute value is not computed. If set for an unsupported matrix
   *  data, scale, and compute type combination, calling cublasLtMatmul() will return CUBLAS_INVALID_VALUE.
   *
   *  void *, default: NULL
   */
  CUBLASLT_MATMUL_DESC_AMAX_D_POINTER = 21,

  /** Type of the data to be stored to the memory pointed to by CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
   *
   *  If unset, the data type defaults to the type of elements of the output matrix with some exceptions, see details
   * below.
   *
   *  ReLu uses a bit-mask.
   *
   *  GELU input matrix elements type is the same as the type of elements of
   *  the output matrix with some exceptions, see details below.
   *
   *  For fp8 kernels with output type CUDA_R_8F_E4M3 the aux data type can be CUDA_R_8F_E4M3 or CUDA_R_16F with some
   *  restrictions.  See https://docs.nvidia.com/cuda/cublas/index.html#cublasLtMatmulDescAttributes_t for more details.
   *
   *  If set for an unsupported matrix data, scale, and compute type combination, calling cublasLtMatmul()
   *  will return CUBLAS_INVALID_VALUE.
   *
   *  int32_t based on cudaDataType, default: -1
   */
  CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_DATA_TYPE = 22,

  /** Device pointer to the scaling factor value to convert results from compute type data range to storage
   *  data range in the auxiliary matrix that is set via CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
   *
   *  The scaling factor value must have the same type as the compute type.
   *
   *  If not specified, or set to NULL, the scaling factor is assumed to be 1. If set for an unsupported matrix data,
   *  scale, and compute type combination, calling cublasLtMatmul() will return CUBLAS_INVALID_VALUE.
   *
   *  void *, default: NULL
   */
  CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_SCALE_POINTER = 23,

  /** Device pointer to the memory location that on completion will be set to the maximum of absolute values in the
   *  buffer that is set via CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
   *
   *  The computed value has the same type as the compute type.
   *
   *  If not specified or set to NULL, the maximum absolute value is not computed. If set for an unsupported matrix
   *  data, scale, and compute type combination, calling cublasLtMatmul() will return CUBLAS_INVALID_VALUE.
   *
   *  void *, default: NULL
   */
  CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_AMAX_POINTER = 24,

  /** Flag for managing fp8 fast accumulation mode.
   *  When enabled, problem execution might be faster but at the cost of lower accuracy because intermediate results
   *  will not periodically be promoted to a higher precision.
   *
   *  int8_t, default: 0 - fast accumulation mode is disabled.
   */
  CUBLASLT_MATMUL_DESC_FAST_ACCUM = 25,

  /** Type of bias or bias gradient vector in the device memory.
   *
   * Bias case: see CUBLASLT_EPILOGUE_BIAS.
   *
   * Bias vector elements are the same type as the elements of output matrix (Dtype) with the following exceptions:
   * - IMMA kernels with computeType=CUDA_R_32I and Ctype=CUDA_R_8I where the bias vector elements
   *   are the same type as alpha, beta (CUBLASLT_MATMUL_DESC_SCALE_TYPE=CUDA_R_32F)
   * - fp8 kernels with an output type of CUDA_R_32F, CUDA_R_8F_E4M3 or CUDA_R_8F_E5M2, See
   *   https://docs.nvidia.com/cuda/cublas/index.html#cublasLtMatmul for details.
   *
   * int32_t based on cudaDataType, default: -1
   */
  CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE = 26,

  /** EXPERIMENTAL: Number of atomic synchronization chunks in the row dimension of the output matrix D.
   *
   * int32_t, default 0 (atomic synchronization disabled)
   */
  CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_NUM_CHUNKS_D_ROWS = 27,

  /** EXPERIMENTAL: Number of atomic synchronization chunks in the column dimension of the output matrix D.
   *
   * int32_t, default 0 (atomic synchronization disabled)
   */
  CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_NUM_CHUNKS_D_COLS = 28,

  /** EXPERIMENTAL: Pointer to a device array of input atomic counters consumed by a matmul.
   *
   * int32_t *, default: NULL
   * */
  CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_IN_COUNTERS_POINTER = 29,

  /** EXPERIMENTAL: Pointer to a device array of output atomic counters produced by a matmul.
   *
   * int32_t *, default: NULL
   * */
  CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_OUT_COUNTERS_POINTER = 30,
} cublasLtMatmulDescAttributes_t;

/** Internal. Do not use directly.
 */
cublasStatus_t CUBLASWINAPI cublasLtMatmulDescInit_internal(  //
    cublasLtMatmulDesc_t matmulDesc,
    size_t size,
    cublasComputeType_t computeType,
    cudaDataType_t scaleType);

/** Initialize matmul operation descriptor in pre-allocated space.
 *
 * \retval     CUBLAS_STATUS_ALLOC_FAILED  if size of the pre-allocated space is insufficient
 * \retval     CUBLAS_STATUS_SUCCESS       if desciptor was initialized successfully
 */
static inline cublasStatus_t cublasLtMatmulDescInit(  //
    cublasLtMatmulDesc_t matmulDesc,
    cublasComputeType_t computeType,
    cudaDataType_t scaleType) {
  return cublasLtMatmulDescInit_internal(matmulDesc, sizeof(*matmulDesc), computeType, scaleType);
}

/** Create new matmul operation descriptor.
 *
 * \retval     CUBLAS_STATUS_ALLOC_FAILED  if memory could not be allocated
 * \retval     CUBLAS_STATUS_SUCCESS       if desciptor was created successfully
 */
cublasStatus_t CUBLASWINAPI cublasLtMatmulDescCreate(cublasLtMatmulDesc_t* matmulDesc,
                                                     cublasComputeType_t computeType,
                                                     cudaDataType_t scaleType);

/** Destroy matmul operation descriptor.
 *
 * \retval     CUBLAS_STATUS_SUCCESS  if operation was successful
 */
cublasStatus_t CUBLASWINAPI cublasLtMatmulDescDestroy(cublasLtMatmulDesc_t matmulDesc);

/** Set matmul operation descriptor attribute.
 *
 * \param[in]  matmulDesc   The descriptor
 * \param[in]  attr         The attribute
 * \param[in]  buf          memory address containing the new value
 * \param[in]  sizeInBytes  size of buf buffer for verification (in bytes)
 *
 * \retval     CUBLAS_STATUS_INVALID_VALUE  if buf is NULL or sizeInBytes doesn't match size of internal storage for
 *                                          selected attribute
 * \retval     CUBLAS_STATUS_SUCCESS        if attribute was set successfully
 */
cublasStatus_t CUBLASWINAPI cublasLtMatmulDescSetAttribute(  //
    cublasLtMatmulDesc_t matmulDesc,
    cublasLtMatmulDescAttributes_t attr,
    const void* buf,
    size_t sizeInBytes);

/** Get matmul operation descriptor attribute.
 *
 * \param[in]  matmulDesc   The descriptor
 * \param[in]  attr         The attribute
 * \param[out] buf          memory address containing the new value
 * \param[in]  sizeInBytes  size of buf buffer for verification (in bytes)
 * \param[out] sizeWritten  only valid when return value is CUBLAS_STATUS_SUCCESS. If sizeInBytes is non-zero: number of
 *                          bytes actually written, if sizeInBytes is 0: number of bytes needed to write full contents
 *
 * \retval     CUBLAS_STATUS_INVALID_VALUE  if sizeInBytes is 0 and sizeWritten is NULL, or if  sizeInBytes is non-zero
 *                                          and buf is NULL or sizeInBytes doesn't match size of internal storage for
 *                                          selected attribute
 * \retval     CUBLAS_STATUS_SUCCESS        if attribute's value was successfully written to user memory
 */
cublasStatus_t CUBLASWINAPI cublasLtMatmulDescGetAttribute(  //
    cublasLtMatmulDesc_t matmulDesc,
    cublasLtMatmulDescAttributes_t attr,
    void* buf,
    size_t sizeInBytes,
    size_t* sizeWritten);

/* ---------------------------------------------------------------------------------------*/
/* Helper functions for cublasLtMatrixTransformDesc_t */
/* ---------------------------------------------------------------------------------------*/

/** Matrix transform descriptor attributes to define details of the operation.
 */
typedef enum {
  /** Scale type, see cudaDataType. Inputs are converted to scale type for scaling and summation and results are then
   * converted to output type to store in memory.
   *
   * int32_t
   */
  CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE,

  /** Pointer mode of alpha and beta, see cublasLtPointerMode_t.
   *
   * int32_t, default: CUBLASLT_POINTER_MODE_HOST
   */
  CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE,

  /** Transform of matrix A, see cublasOperation_t.
   *
   * int32_t, default: CUBLAS_OP_N
   */
  CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSA,

  /** Transform of matrix B, see cublasOperation_t.
   *
   * int32_t, default: CUBLAS_OP_N
   */
  CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSB,
} cublasLtMatrixTransformDescAttributes_t;

/** Internal. Do not use directly.
 */
cublasStatus_t CUBLASWINAPI cublasLtMatrixTransformDescInit_internal(cublasLtMatrixTransformDesc_t transformDesc,
                                                                     size_t size,
                                                                     cudaDataType scaleType);

/** Initialize matrix transform operation descriptor in pre-allocated space.
 *
 * \retval     CUBLAS_STATUS_ALLOC_FAILED  if size of the pre-allocated space is insufficient
 * \retval     CUBLAS_STATUS_SUCCESS       if desciptor was created successfully
 */
static inline cublasStatus_t cublasLtMatrixTransformDescInit(cublasLtMatrixTransformDesc_t transformDesc,
                                                             cudaDataType scaleType) {
  return cublasLtMatrixTransformDescInit_internal(transformDesc, sizeof(*transformDesc), scaleType);
}

/** Create new matrix transform operation descriptor.
 *
 * \retval     CUBLAS_STATUS_ALLOC_FAILED  if memory could not be allocated
 * \retval     CUBLAS_STATUS_SUCCESS       if desciptor was created successfully
 */
cublasStatus_t CUBLASWINAPI cublasLtMatrixTransformDescCreate(cublasLtMatrixTransformDesc_t* transformDesc,
                                                              cudaDataType scaleType);

/** Destroy matrix transform operation descriptor.
 *
 * \retval     CUBLAS_STATUS_SUCCESS  if operation was successful
 */
cublasStatus_t CUBLASWINAPI cublasLtMatrixTransformDescDestroy(cublasLtMatrixTransformDesc_t transformDesc);

/** Set matrix transform operation descriptor attribute.
 *
 * \param[in]  transformDesc  The descriptor
 * \param[in]  attr           The attribute
 * \param[in]  buf            memory address containing the new value
 * \param[in]  sizeInBytes    size of buf buffer for verification (in bytes)
 *
 * \retval     CUBLAS_STATUS_INVALID_VALUE  if buf is NULL or sizeInBytes doesn't match size of internal storage for
 *                                          selected attribute
 * \retval     CUBLAS_STATUS_SUCCESS        if attribute was set successfully
 */
cublasStatus_t CUBLASWINAPI cublasLtMatrixTransformDescSetAttribute(  //
    cublasLtMatrixTransformDesc_t transformDesc,
    cublasLtMatrixTransformDescAttributes_t attr,
    const void* buf,
    size_t sizeInBytes);

/** Get matrix transform operation descriptor attribute.
 *
 * \param[in]  transformDesc  The descriptor
 * \param[in]  attr           The attribute
 * \param[out] buf            memory address containing the new value
 * \param[in]  sizeInBytes    size of buf buffer for verification (in bytes)
 * \param[out] sizeWritten    only valid when return value is CUBLAS_STATUS_SUCCESS. If sizeInBytes is non-zero: number
 * of bytes actually written, if sizeInBytes is 0: number of bytes needed to write full contents
 *
 * \retval     CUBLAS_STATUS_INVALID_VALUE  if sizeInBytes is 0 and sizeWritten is NULL, or if  sizeInBytes is non-zero
 *                                          and buf is NULL or sizeInBytes doesn't match size of internal storage for
 *                                          selected attribute
 * \retval     CUBLAS_STATUS_SUCCESS        if attribute's value was successfully written to user memory
 */
cublasStatus_t CUBLASWINAPI cublasLtMatrixTransformDescGetAttribute(  //
    cublasLtMatrixTransformDesc_t transformDesc,
    cublasLtMatrixTransformDescAttributes_t attr,
    void* buf,
    size_t sizeInBytes,
    size_t* sizeWritten);

/** Reduction scheme for portions of the dot-product calculated in parallel (a. k. a. "split - K").
 */
typedef enum {
  /** No reduction scheme, dot-product shall be performed in one sequence.
   */
  CUBLASLT_REDUCTION_SCHEME_NONE = 0,

  /** Reduction is performed "in place" - using the output buffer (and output data type) and counters (in workspace) to
   * guarantee the sequentiality.
   */
  CUBLASLT_REDUCTION_SCHEME_INPLACE = 1,

  /** Intermediate results are stored in compute type in the workspace and reduced in a separate step.
   */
  CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE = 2,

  /** Intermediate results are stored in output type in the workspace and reduced in a separate step.
   */
  CUBLASLT_REDUCTION_SCHEME_OUTPUT_TYPE = 4,

  CUBLASLT_REDUCTION_SCHEME_MASK = 0x7,
} cublasLtReductionScheme_t;

/** Postprocessing options for the epilogue
 */
typedef enum {
  /** No special postprocessing, just scale and quantize results if necessary.
   */
  CUBLASLT_EPILOGUE_DEFAULT = 1,

  /** ReLu, apply ReLu point-wise transform to the results (x:=max(x, 0)).
   */
  CUBLASLT_EPILOGUE_RELU = 2,

  /** ReLu, apply ReLu point-wise transform to the results (x:=max(x, 0)).
   *
   * This epilogue mode produces an extra output, a ReLu bit-mask matrix,
   * see CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
   */
  CUBLASLT_EPILOGUE_RELU_AUX = (CUBLASLT_EPILOGUE_RELU | 128),

  /** Bias, apply (broadcasted) Bias from bias vector. Bias vector length must match matrix D rows, it must be packed
   * (stride between vector elements is 1). Bias vector is broadcasted to all columns and added before applying final
   * postprocessing.
   */
  CUBLASLT_EPILOGUE_BIAS = 4,

  /** ReLu and Bias, apply Bias and then ReLu transform
   */
  CUBLASLT_EPILOGUE_RELU_BIAS = (CUBLASLT_EPILOGUE_RELU | CUBLASLT_EPILOGUE_BIAS),

  /** ReLu and Bias, apply Bias and then ReLu transform
   *
   * This epilogue mode produces an extra output, a ReLu bit-mask matrix,
   * see CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
   */
  CUBLASLT_EPILOGUE_RELU_AUX_BIAS = (CUBLASLT_EPILOGUE_RELU_AUX | CUBLASLT_EPILOGUE_BIAS),

  /* ReLu gradient. Apply ReLu gradient to matmul output. Store ReLu gradient in the output matrix.
   *
   * This epilogue mode requires an extra input,
   * see CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
   */
  CUBLASLT_EPILOGUE_DRELU = 8 | 128,

  /* ReLu and Bias gradients. Apply independently ReLu and Bias gradient to
   * matmul output. Store ReLu gradient in the output matrix, and Bias gradient
   * in the auxiliary output (see CUBLASLT_MATMUL_DESC_BIAS_POINTER).
   *
   * This epilogue mode requires an extra input,
   * see CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
   */
  CUBLASLT_EPILOGUE_DRELU_BGRAD = CUBLASLT_EPILOGUE_DRELU | 16,

  /** GELU, apply GELU point-wise transform to the results (x:=GELU(x)).
   */
  CUBLASLT_EPILOGUE_GELU = 32,

  /** GELU, apply GELU point-wise transform to the results (x:=GELU(x)).
   *
   * This epilogue mode outputs GELU input as a separate matrix (useful for training).
   * See CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
   */
  CUBLASLT_EPILOGUE_GELU_AUX = (CUBLASLT_EPILOGUE_GELU | 128),

  /** GELU and Bias, apply Bias and then GELU transform
   */
  CUBLASLT_EPILOGUE_GELU_BIAS = (CUBLASLT_EPILOGUE_GELU | CUBLASLT_EPILOGUE_BIAS),

  /** GELU and Bias, apply Bias and then GELU transform
   *
   * This epilogue mode outputs GELU input as a separate matrix (useful for training).
   * See CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
   */
  CUBLASLT_EPILOGUE_GELU_AUX_BIAS = (CUBLASLT_EPILOGUE_GELU_AUX | CUBLASLT_EPILOGUE_BIAS),

  /* GELU gradient. Apply GELU gradient to matmul output. Store GELU gradient in the output matrix.
   *
   * This epilogue mode requires an extra input,
   * see CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
   */
  CUBLASLT_EPILOGUE_DGELU = 64 | 128,

  /* GELU and Bias gradients. Apply independently GELU and Bias gradient to
   * matmul output. Store GELU gradient in the output matrix, and Bias gradient
   * in the auxiliary output (see CUBLASLT_MATMUL_DESC_BIAS_POINTER).
   *
   * This epilogue mode requires an extra input,
   * see CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
   */
  CUBLASLT_EPILOGUE_DGELU_BGRAD = CUBLASLT_EPILOGUE_DGELU | 16,

  /** Bias gradient based on the input matrix A.
   *
   * The bias size corresponds to the number of rows of the matrix D.
   * The reduction happens over the GEMM's "k" dimension.
   *
   * Stores Bias gradient in the auxiliary output
   * (see CUBLASLT_MATMUL_DESC_BIAS_POINTER).
   */
  CUBLASLT_EPILOGUE_BGRADA = 256,

  /** Bias gradient based on the input matrix B.
   *
   * The bias size corresponds to the number of columns of the matrix D.
   * The reduction happens over the GEMM's "k" dimension.
   *
   * Stores Bias gradient in the auxiliary output
   * (see CUBLASLT_MATMUL_DESC_BIAS_POINTER).
   */
  CUBLASLT_EPILOGUE_BGRADB = 512,
} cublasLtEpilogue_t;

/** Matmul heuristic search mode
 */
typedef enum {
  /** ask heuristics for best algo for given usecase
   */
  CUBLASLT_SEARCH_BEST_FIT = 0,
  /** only try to find best config for preconfigured algo id
   */
  CUBLASLT_SEARCH_LIMITED_BY_ALGO_ID = 1,
  /** reserved for future use
   */
  CUBLASLT_SEARCH_RESERVED_02 = 2,
  /** reserved for future use
   */
  CUBLASLT_SEARCH_RESERVED_03 = 3,
  /** reserved for future use
   */
  CUBLASLT_SEARCH_RESERVED_04 = 4,
  /** reserved for future use
   */
  CUBLASLT_SEARCH_RESERVED_05 = 5,
} cublasLtMatmulSearch_t;

/** Algo search preference to fine tune the heuristic function. */
typedef enum {
  /** Search mode, see cublasLtMatmulSearch_t.
   *
   * uint32_t, default: CUBLASLT_SEARCH_BEST_FIT
   */
  CUBLASLT_MATMUL_PREF_SEARCH_MODE = 0,

  /** Maximum allowed workspace size in bytes.
   *
   * uint64_t, default: 0 - no workspace allowed
   */
  CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES = 1,

  /** Reduction scheme mask, see cublasLtReductionScheme_t. Filters heuristic result to only include algo configs that
   * use one of the required modes.
   *
   * E.g. mask value of 0x03 will allow only INPLACE and COMPUTE_TYPE reduction schemes.
   *
   * uint32_t, default: CUBLASLT_REDUCTION_SCHEME_MASK (allows all reduction schemes)
   */
  CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK = 3,

  /** Minimum buffer alignment for matrix A (in bytes).
   *
   * Selecting a smaller value will exclude algorithms that can not work with matrix A that is not as strictly aligned
   * as they need.
   *
   * uint32_t, default: 256
   */
  CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_A_BYTES = 5,

  /** Minimum buffer alignment for matrix B (in bytes).
   *
   * Selecting a smaller value will exclude algorithms that can not work with matrix B that is not as strictly aligned
   * as they need.
   *
   * uint32_t, default: 256
   */
  CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_B_BYTES = 6,

  /** Minimum buffer alignment for matrix C (in bytes).
   *
   * Selecting a smaller value will exclude algorithms that can not work with matrix C that is not as strictly aligned
   * as they need.
   *
   * uint32_t, default: 256
   */
  CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_C_BYTES = 7,

  /** Minimum buffer alignment for matrix D (in bytes).
   *
   * Selecting a smaller value will exclude algorithms that can not work with matrix D that is not as strictly aligned
   * as they need.
   *
   * uint32_t, default: 256
   */
  CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_D_BYTES = 8,

  /** Maximum wave count.
   *
   * See cublasLtMatmulHeuristicResult_t::wavesCount.
   *
   * Selecting a non-zero value will exclude algorithms that report device utilization higher than specified.
   *
   * float, default: 0.0f
   */
  CUBLASLT_MATMUL_PREF_MAX_WAVES_COUNT = 9,

  /** Numerical implementation details mask, see cublasLtNumericalImplFlags_t. Filters heuristic result to only include
   * algorithms that use the allowed implementations.
   *
   * uint64_t, default: uint64_t(-1) (allow everything)
   */
  CUBLASLT_MATMUL_PREF_IMPL_MASK = 12,
} cublasLtMatmulPreferenceAttributes_t;

/** Internal. Do not use directly.
 */
cublasStatus_t CUBLASWINAPI cublasLtMatmulPreferenceInit_internal(cublasLtMatmulPreference_t pref, size_t size);

/** Initialize matmul heuristic search preference descriptor in pre-allocated space.
 *
 * \retval     CUBLAS_STATUS_ALLOC_FAILED  if size of the pre-allocated space is insufficient
 * \retval     CUBLAS_STATUS_SUCCESS       if desciptor was created successfully
 */
static inline cublasStatus_t cublasLtMatmulPreferenceInit(cublasLtMatmulPreference_t pref) {
  return cublasLtMatmulPreferenceInit_internal(pref, sizeof(*pref));
}

/** Create new matmul heuristic search preference descriptor.
 *
 * \retval     CUBLAS_STATUS_ALLOC_FAILED  if memory could not be allocated
 * \retval     CUBLAS_STATUS_SUCCESS       if desciptor was created successfully
 */
cublasStatus_t CUBLASWINAPI cublasLtMatmulPreferenceCreate(cublasLtMatmulPreference_t* pref);

/** Destroy matmul heuristic search preference descriptor.
 *
 * \retval     CUBLAS_STATUS_SUCCESS  if operation was successful
 */
cublasStatus_t CUBLASWINAPI cublasLtMatmulPreferenceDestroy(cublasLtMatmulPreference_t pref);

/** Set matmul heuristic search preference descriptor attribute.
 *
 * \param[in]  pref         The descriptor
 * \param[in]  attr         The attribute
 * \param[in]  buf          memory address containing the new value
 * \param[in]  sizeInBytes  size of buf buffer for verification (in bytes)
 *
 * \retval     CUBLAS_STATUS_INVALID_VALUE  if buf is NULL or sizeInBytes doesn't match size of internal storage for
 *                                          selected attribute
 * \retval     CUBLAS_STATUS_SUCCESS        if attribute was set successfully
 */
cublasStatus_t CUBLASWINAPI cublasLtMatmulPreferenceSetAttribute(  //
    cublasLtMatmulPreference_t pref,
    cublasLtMatmulPreferenceAttributes_t attr,
    const void* buf,
    size_t sizeInBytes);

/** Get matmul heuristic search preference descriptor attribute.
 *
 * \param[in]  pref         The descriptor
 * \param[in]  attr         The attribute
 * \param[out] buf          memory address containing the new value
 * \param[in]  sizeInBytes  size of buf buffer for verification (in bytes)
 * \param[out] sizeWritten  only valid when return value is CUBLAS_STATUS_SUCCESS. If sizeInBytes is non-zero: number of
 *                          bytes actually written, if sizeInBytes is 0: number of bytes needed to write full contents
 *
 * \retval     CUBLAS_STATUS_INVALID_VALUE  if sizeInBytes is 0 and sizeWritten is NULL, or if  sizeInBytes is non-zero
 *                                          and buf is NULL or sizeInBytes doesn't match size of internal storage for
 *                                          selected attribute
 * \retval     CUBLAS_STATUS_SUCCESS        if attribute's value was successfully written to user memory
 */
cublasStatus_t CUBLASWINAPI cublasLtMatmulPreferenceGetAttribute(  //
    cublasLtMatmulPreference_t pref,
    cublasLtMatmulPreferenceAttributes_t attr,
    void* buf,
    size_t sizeInBytes,
    size_t* sizeWritten);

/** Results structure used by cublasLtMatmulGetAlgo.
 *
 * Holds returned configured algo descriptor and its runtime properties.
 */
typedef struct {
  /** Matmul algorithm descriptor.
   *
   * Must be initialized with cublasLtMatmulAlgoInit() if preferences' CUBLASLT_MATMUL_PERF_SEARCH_MODE is set to
   * CUBLASLT_SEARCH_LIMITED_BY_ALGO_ID
   */
  cublasLtMatmulAlgo_t algo;

  /** Actual size of workspace memory required.
   */
  size_t workspaceSize;

  /** Result status, other fields are only valid if after call to cublasLtMatmulAlgoGetHeuristic() this member is set to
   * CUBLAS_STATUS_SUCCESS.
   */
  cublasStatus_t state;

  /** Waves count - a device utilization metric.
   *
   * wavesCount value of 1.0f suggests that when kernel is launched it will fully occupy the GPU.
   */
  float wavesCount;

  int reserved[4];
} cublasLtMatmulHeuristicResult_t;

/** Query cublasLt heuristic for algorithm appropriate for given use case.
 *
 * \param[in]      lightHandle            Pointer to the allocated cuBLASLt handle for the cuBLASLt
 *                                        context. See cublasLtHandle_t.
 * \param[in]      operationDesc          Handle to the matrix multiplication descriptor.
 * \param[in]      Adesc                  Handle to the layout descriptors for matrix A.
 * \param[in]      Bdesc                  Handle to the layout descriptors for matrix B.
 * \param[in]      Cdesc                  Handle to the layout descriptors for matrix C.
 * \param[in]      Ddesc                  Handle to the layout descriptors for matrix D.
 * \param[in]      preference             Pointer to the structure holding the heuristic search
 *                                        preferences descriptor. See cublasLtMatrixLayout_t.
 * \param[in]      requestedAlgoCount     Size of heuristicResultsArray (in elements) and requested
 *                                        maximum number of algorithms to return.
 * \param[in, out] heuristicResultsArray  Output algorithms and associated runtime characteristics,
 *                                        ordered in increasing estimated compute time.
 * \param[out]     returnAlgoCount        The number of heuristicResultsArray elements written.
 *
 * \retval  CUBLAS_STATUS_INVALID_VALUE   if requestedAlgoCount is less or equal to zero
 * \retval  CUBLAS_STATUS_NOT_SUPPORTED   if no heuristic function available for current configuration
 * \retval  CUBLAS_STATUS_SUCCESS         if query was successful, inspect
 *                                        heuristicResultsArray[0 to (returnAlgoCount - 1)].state
 *                                        for detail status of results
 */
cublasStatus_t CUBLASWINAPI cublasLtMatmulAlgoGetHeuristic(cublasLtHandle_t lightHandle,
                                                           cublasLtMatmulDesc_t operationDesc,
                                                           cublasLtMatrixLayout_t Adesc,
                                                           cublasLtMatrixLayout_t Bdesc,
                                                           cublasLtMatrixLayout_t Cdesc,
                                                           cublasLtMatrixLayout_t Ddesc,
                                                           cublasLtMatmulPreference_t preference,
                                                           int requestedAlgoCount,
                                                           cublasLtMatmulHeuristicResult_t heuristicResultsArray[],
                                                           int* returnAlgoCount);

/* ---------------------------------------------------------------------------------------*/
/* Lower level API to be able to implement own Heuristic and Find routines                */
/* ---------------------------------------------------------------------------------------*/

/** Routine to get all algo IDs that can potentially run
 *
 * \param[in]  int              requestedAlgoCount requested number of algos (must be less or equal to size of algoIdsA
 * (in elements)) \param[out] algoIdsA         array to write algoIds to \param[out] returnAlgoCount  number of algoIds
 * actually written
 *
 * \retval     CUBLAS_STATUS_INVALID_VALUE  if requestedAlgoCount is less or equal to zero
 * \retval     CUBLAS_STATUS_SUCCESS        if query was successful, inspect returnAlgoCount to get actual number of IDs
 *                                          available
 */
cublasStatus_t CUBLASWINAPI cublasLtMatmulAlgoGetIds(cublasLtHandle_t lightHandle,
                                                     cublasComputeType_t computeType,
                                                     cudaDataType_t scaleType,
                                                     cudaDataType_t Atype,
                                                     cudaDataType_t Btype,
                                                     cudaDataType_t Ctype,
                                                     cudaDataType_t Dtype,
                                                     int requestedAlgoCount,
                                                     int algoIdsArray[],
                                                     int* returnAlgoCount);

/** Initialize algo structure
 *
 * \retval     CUBLAS_STATUS_INVALID_VALUE  if algo is NULL or algoId is outside of recognized range
 * \retval     CUBLAS_STATUS_NOT_SUPPORTED  if algoId is not supported for given combination of data types
 * \retval     CUBLAS_STATUS_SUCCESS        if the structure was successfully initialized
 */
cublasStatus_t CUBLASWINAPI cublasLtMatmulAlgoInit(cublasLtHandle_t lightHandle,
                                                   cublasComputeType_t computeType,
                                                   cudaDataType_t scaleType,
                                                   cudaDataType_t Atype,
                                                   cudaDataType_t Btype,
                                                   cudaDataType_t Ctype,
                                                   cudaDataType_t Dtype,
                                                   int algoId,
                                                   cublasLtMatmulAlgo_t* algo);

/** Check configured algo descriptor for correctness and support on current device.
 *
 * Result includes required workspace size and calculated wave count.
 *
 * CUBLAS_STATUS_SUCCESS doesn't fully guarantee algo will run (will fail if e.g. buffers are not correctly aligned);
 * but if cublasLtMatmulAlgoCheck fails, the algo will not run.
 *
 * \param[in]  algo    algo configuration to check
 * \param[out] result  result structure to report algo runtime characteristics; algo field is never updated
 *
 * \retval     CUBLAS_STATUS_INVALID_VALUE  if matrix layout descriptors or operation descriptor don't match algo
 *                                          descriptor
 * \retval     CUBLAS_STATUS_NOT_SUPPORTED  if algo configuration or data type combination is not currently supported on
 *                                          given device
 * \retval     CUBLAS_STATUS_ARCH_MISMATCH  if algo configuration cannot be run using the selected device
 * \retval     CUBLAS_STATUS_SUCCESS        if check was successful
 */
cublasStatus_t CUBLASWINAPI cublasLtMatmulAlgoCheck(  //
    cublasLtHandle_t lightHandle,
    cublasLtMatmulDesc_t operationDesc,
    cublasLtMatrixLayout_t Adesc,
    cublasLtMatrixLayout_t Bdesc,
    cublasLtMatrixLayout_t Cdesc,
    cublasLtMatrixLayout_t Ddesc,
    const cublasLtMatmulAlgo_t* algo,  ///< may point to result->algo
    cublasLtMatmulHeuristicResult_t* result);

/** Capabilities Attributes that can be retrieved from an initialized Algo structure
 */
typedef enum {
  /** support for split K, see CUBLASLT_ALGO_CONFIG_SPLITK_NUM
   *
   * int32_t, 0 means no support, supported otherwise
   */
  CUBLASLT_ALGO_CAP_SPLITK_SUPPORT = 0,

  /** reduction scheme mask, see cublasLtReductionScheme_t; shows supported reduction schemes, if reduction scheme is
   * not masked out it is supported.
   *
   * e.g. int isReductionSchemeComputeTypeSupported ? (reductionSchemeMask & CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE) ==
   * CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE ? 1 : 0;
   *
   * uint32_t
   */
  CUBLASLT_ALGO_CAP_REDUCTION_SCHEME_MASK = 1,

  /** support for cta swizzling, see CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING
   *
   * uint32_t, 0 means no support, 1 means supported value of 1, other values are reserved
   */
  CUBLASLT_ALGO_CAP_CTA_SWIZZLING_SUPPORT = 2,

  /** support strided batch
   *
   * int32_t, 0 means no support, supported otherwise
   */
  CUBLASLT_ALGO_CAP_STRIDED_BATCH_SUPPORT = 3,

  /** support results out of place (D != C in D = alpha.A.B + beta.C)
   *
   * int32_t, 0 means no support, supported otherwise
   */
  CUBLASLT_ALGO_CAP_OUT_OF_PLACE_RESULT_SUPPORT = 4,

  /** syrk/herk support (on top of regular gemm)
   *
   * int32_t, 0 means no support, supported otherwise
   */
  CUBLASLT_ALGO_CAP_UPLO_SUPPORT = 5,

  /** tile ids possible to use, see cublasLtMatmulTile_t; if no tile ids are supported use
   * CUBLASLT_MATMUL_TILE_UNDEFINED
   *
   * use cublasLtMatmulAlgoCapGetAttribute() with sizeInBytes=0 to query actual count
   *
   * array of uint32_t
   */
  CUBLASLT_ALGO_CAP_TILE_IDS = 6,

  /** custom option range is from 0 to CUBLASLT_ALGO_CAP_CUSTOM_OPTION_MAX (inclusive), see
   * CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION
   *
   * int32_t
   */
  CUBLASLT_ALGO_CAP_CUSTOM_OPTION_MAX = 7,

  /** whether algorithm supports custom (not COL or ROW memory order), see cublasLtOrder_t
   *
   * int32_t 0 means only COL and ROW memory order is allowed, non-zero means that algo might have different
   * requirements;
   */
  CUBLASLT_ALGO_CAP_CUSTOM_MEMORY_ORDER = 10,

  /** bitmask enumerating pointer modes algorithm supports
   *
   * uint32_t, see cublasLtPointerModeMask_t
   */
  CUBLASLT_ALGO_CAP_POINTER_MODE_MASK = 11,

  /** bitmask enumerating kinds of postprocessing algorithm supports in the epilogue
   *
   * uint32_t, see cublasLtEpilogue_t
   */
  CUBLASLT_ALGO_CAP_EPILOGUE_MASK = 12,

  /** stages ids possible to use, see cublasLtMatmulStages_t; if no stages ids are supported use
   * CUBLASLT_MATMUL_STAGES_UNDEFINED
   *
   * use cublasLtMatmulAlgoCapGetAttribute() with sizeInBytes=0 to query actual count
   *
   * array of uint32_t
   */
  CUBLASLT_ALGO_CAP_STAGES_IDS = 13,

  /** support for nagative ld for all of the matrices
   *
   * int32_t 0 means no support, supported otherwise
   */
  CUBLASLT_ALGO_CAP_LD_NEGATIVE = 14,

  /** details about algorithm's implementation that affect it's numerical behavior
   *
   * uint64_t, see cublasLtNumericalImplFlags_t
   */
  CUBLASLT_ALGO_CAP_NUMERICAL_IMPL_FLAGS = 15,

  /** minimum alignment required for A matrix in bytes
   *  (required for buffer pointer, leading dimension, and possibly other strides defined for matrix memory order)
   *
   * uint32_t
   */
  CUBLASLT_ALGO_CAP_MIN_ALIGNMENT_A_BYTES = 16,

  /** minimum alignment required for B matrix in bytes
   *  (required for buffer pointer, leading dimension, and possibly other strides defined for matrix memory order)
   *
   * uint32_t
   */
  CUBLASLT_ALGO_CAP_MIN_ALIGNMENT_B_BYTES = 17,

  /** minimum alignment required for C matrix in bytes
   *  (required for buffer pointer, leading dimension, and possibly other strides defined for matrix memory order)
   *
   * uint32_t
   */
  CUBLASLT_ALGO_CAP_MIN_ALIGNMENT_C_BYTES = 18,

  /** minimum alignment required for D matrix in bytes
   *  (required for buffer pointer, leading dimension, and possibly other strides defined for matrix memory order)
   *
   * uint32_t
   */
  CUBLASLT_ALGO_CAP_MIN_ALIGNMENT_D_BYTES = 19,

  /** EXPERIMENTAL: support for synchronization via atomic counters
   *
   * int32_t
   */
  CUBLASLT_ALGO_CAP_ATOMIC_SYNC = 20,
} cublasLtMatmulAlgoCapAttributes_t;

/** Get algo capability attribute.
 *
 * E.g. to get list of supported Tile IDs:
 *      cublasLtMatmulTile_t tiles[CUBLASLT_MATMUL_TILE_END];
 *      size_t num_tiles, size_written;
 *      if (cublasLtMatmulAlgoCapGetAttribute(algo, CUBLASLT_ALGO_CAP_TILE_IDS, tiles, sizeof(tiles), size_written) ==
 * CUBLAS_STATUS_SUCCESS) { num_tiles = size_written / sizeof(tiles[0]);
 *      }
 *
 * \param[in]  algo         The algo descriptor
 * \param[in]  attr         The attribute
 * \param[out] buf          memory address containing the new value
 * \param[in]  sizeInBytes  size of buf buffer for verification (in bytes)
 * \param[out] sizeWritten  only valid when return value is CUBLAS_STATUS_SUCCESS. If sizeInBytes is non-zero: number of
 *                          bytes actually written, if sizeInBytes is 0: number of bytes needed to write full contents
 *
 * \retval     CUBLAS_STATUS_INVALID_VALUE  if sizeInBytes is 0 and sizeWritten is NULL, or if  sizeInBytes is non-zero
 *                                          and buf is NULL or sizeInBytes doesn't match size of internal storage for
 *                                          selected attribute
 * \retval     CUBLAS_STATUS_SUCCESS        if attribute's value was successfully written to user memory
 */
cublasStatus_t CUBLASWINAPI cublasLtMatmulAlgoCapGetAttribute(const cublasLtMatmulAlgo_t* algo,
                                                              cublasLtMatmulAlgoCapAttributes_t attr,
                                                              void* buf,
                                                              size_t sizeInBytes,
                                                              size_t* sizeWritten);

/** Algo Configuration Attributes that can be set according to the Algo capabilities
 */
typedef enum {
  /** algorithm index, see cublasLtMatmulAlgoGetIds()
   *
   * readonly, set by cublasLtMatmulAlgoInit()
   * int32_t
   */
  CUBLASLT_ALGO_CONFIG_ID = 0,
  /** tile id, see cublasLtMatmulTile_t
   *
   * uint32_t, default: CUBLASLT_MATMUL_TILE_UNDEFINED
   */
  CUBLASLT_ALGO_CONFIG_TILE_ID = 1,
  /** Number of K splits. If the number of K splits is greater than one, SPLITK_NUM parts
   * of matrix multiplication will be computed in parallel. The results will be accumulated
   * according to CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME
   *
   * int32_t, default: 1
   */
  CUBLASLT_ALGO_CONFIG_SPLITK_NUM = 2,
  /** reduction scheme, see cublasLtReductionScheme_t
   *
   * uint32_t, default: CUBLASLT_REDUCTION_SCHEME_NONE
   */
  CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME = 3,
  /** cta swizzling, change mapping from CUDA grid coordinates to parts of the matrices
   *
   * possible values: 0, 1, other values reserved
   *
   * uint32_t, default: 0
   */
  CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING = 4,
  /** custom option, each algorithm can support some custom options that don't fit description of the other config
   * attributes, see CUBLASLT_ALGO_CAP_CUSTOM_OPTION_MAX to get accepted range for any specific case
   *
   * uint32_t, default: 0
   */
  CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION = 5,
  /** stages id, see cublasLtMatmulStages_t
   *
   * uint32_t, default: CUBLASLT_MATMUL_STAGES_UNDEFINED
   */
  CUBLASLT_ALGO_CONFIG_STAGES_ID = 6,
  /** inner shape id, see cublasLtMatmulInnerShape_t
   *
   * uint16_t, default: 0 (CUBLASLT_MATMUL_INNER_SHAPE_UNDEFINED)
   */
  CUBLASLT_ALGO_CONFIG_INNER_SHAPE_ID = 7,
  /** Thread Block Cluster shape id, see cublasLtClusterShape_t. Defines cluster size to use.
   *
   * uint16_t, default: 0 (CUBLASLT_CLUSTER_SHAPE_AUTO)
   */
  CUBLASLT_ALGO_CONFIG_CLUSTER_SHAPE_ID = 8,
} cublasLtMatmulAlgoConfigAttributes_t;

/** Set algo configuration attribute.
 *
 * \param[in]  algo         The algo descriptor
 * \param[in]  attr         The attribute
 * \param[in]  buf          memory address containing the new value
 * \param[in]  sizeInBytes  size of buf buffer for verification (in bytes)
 *
 * \retval     CUBLAS_STATUS_INVALID_VALUE  if buf is NULL or sizeInBytes doesn't match size of internal storage for
 *                                          selected attribute
 * \retval     CUBLAS_STATUS_SUCCESS        if attribute was set successfully
 */
cublasStatus_t CUBLASWINAPI cublasLtMatmulAlgoConfigSetAttribute(cublasLtMatmulAlgo_t* algo,
                                                                 cublasLtMatmulAlgoConfigAttributes_t attr,
                                                                 const void* buf,
                                                                 size_t sizeInBytes);

/** Get algo configuration attribute.
 *
 * \param[in]  algo         The algo descriptor
 * \param[in]  attr         The attribute
 * \param[out] buf          memory address containing the new value
 * \param[in]  sizeInBytes  size of buf buffer for verification (in bytes)
 * \param[out] sizeWritten  only valid when return value is CUBLAS_STATUS_SUCCESS. If sizeInBytes is non-zero: number of
 *                          bytes actually written, if sizeInBytes is 0: number of bytes needed to write full contents
 *
 * \retval     CUBLAS_STATUS_INVALID_VALUE  if sizeInBytes is 0 and sizeWritten is NULL, or if  sizeInBytes is non-zero
 *                                          and buf is NULL or sizeInBytes doesn't match size of internal storage for
 *                                          selected attribute
 * \retval     CUBLAS_STATUS_SUCCESS        if attribute's value was successfully written to user memory
 */
cublasStatus_t CUBLASWINAPI cublasLtMatmulAlgoConfigGetAttribute(const cublasLtMatmulAlgo_t* algo,
                                                                 cublasLtMatmulAlgoConfigAttributes_t attr,
                                                                 void* buf,
                                                                 size_t sizeInBytes,
                                                                 size_t* sizeWritten);

/** Experimental: Logger callback type.
 */
typedef void (*cublasLtLoggerCallback_t)(int logLevel, const char* functionName, const char* message);

/** Experimental: Logger callback setter.
 *
 * \param[in]  callback                     a user defined callback function to be called by the logger
 *
 * \retval     CUBLAS_STATUS_SUCCESS        if callback was set successfully
 */
cublasStatus_t CUBLASWINAPI cublasLtLoggerSetCallback(cublasLtLoggerCallback_t callback);

/** Experimental: Log file setter.
 *
 * \param[in]  file                         an open file with write permissions
 *
 * \retval     CUBLAS_STATUS_SUCCESS        if log file was set successfully
 */
cublasStatus_t CUBLASWINAPI cublasLtLoggerSetFile(FILE* file);

/** Experimental: Open log file.
 *
 * \param[in]  logFile                      log file path. if the log file does not exist, it will be created
 *
 * \retval     CUBLAS_STATUS_SUCCESS        if log file was created successfully
 */
cublasStatus_t CUBLASWINAPI cublasLtLoggerOpenFile(const char* logFile);

/** Experimental: Log level setter.
 *
 * \param[in]  level                        log level, should be one of the following:
 *                                          0. Off
 *                                          1. Errors
 *                                          2. Performance Trace
 *                                          3. Performance Hints
 *                                          4. Heuristics Trace
 *                                          5. API Trace
 *
 * \retval     CUBLAS_STATUS_INVALID_VALUE  if log level is not one of the above levels
 *
 * \retval     CUBLAS_STATUS_SUCCESS        if log level was set successfully
 */
cublasStatus_t CUBLASWINAPI cublasLtLoggerSetLevel(int level);

/** Experimental: Log mask setter.
 *
 * \param[in]  mask                         log mask, should be a combination of the following masks:
 *                                          0.  Off
 *                                          1.  Errors
 *                                          2.  Performance Trace
 *                                          4.  Performance Hints
 *                                          8.  Heuristics Trace
 *                                          16. API Trace
 *
 * \retval     CUBLAS_STATUS_SUCCESS        if log mask was set successfully
 */
cublasStatus_t CUBLASWINAPI cublasLtLoggerSetMask(int mask);

/** Experimental: Disable logging for the entire session.
 *
 * \retval     CUBLAS_STATUS_SUCCESS        if disabled logging
 */
cublasStatus_t CUBLASWINAPI cublasLtLoggerForceDisable();

#if defined(__cplusplus)
}
#endif /* __cplusplus */