//------------------------------------------------------------------------------
// GraphBLAS/CUDA/Template/GB_cuda_atomics.cuh: CUDA atomics for GraphBLAS
//------------------------------------------------------------------------------

// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved.
// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause

//------------------------------------------------------------------------------

// Atomic device functions for CUDA JIT kernels.  Not used on the host.

/*
 * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

//------------------------------------------------------------------------------
// Specializations for different atomic operations on different types
//------------------------------------------------------------------------------

// No 1- or 2-byte methods are available (bool, uint8_t, int8_t, uint16_t,
// int16_t), because CUDA does not support atomicCAS for just one or two
// bytes.  Instead, to compute one or two bytes atomically, GraphBLAS must
// operate on a larger temporary type (typically uint32_t); when all results
// are computed and the kernel launch is done, the final value is copied to
// the one- or two-byte result on the host.
//
// The GxB_FC64_t type is supported only by GB_cuda_atomic_add.
//
// GB_cuda_atomic_write, GB_cuda_atomic_times:
//
//      int32_t, uint32_t, int64_t, uint64_t,
//      float, double, and GxB_FC32_t (not GxB_FC64_t).
//
// GB_cuda_atomic_min, GB_cuda_atomic_max:
//
//      int32_t, uint32_t, int64_t, uint64_t,
//      float, and double (not GxB_FC32_t or GxB_FC64_t).
//
// GB_cuda_atomic_add:
//
//      int32_t, uint32_t, int64_t, uint64_t,
//      float, double, GxB_FC32_t, and GxB_FC64_t.
//
// GB_cuda_atomic_bor, GB_cuda_atomic_band,
// GB_cuda_atomic_bxor, GB_cuda_atomic_bxnor:
//
//      uint32_t, uint64_t
//
// GB_cuda_lock, GB_cuda_unlock:
//
//      uint32_t only
//
// GB_PUN is #defined in GB_pun.h as:
//
//      #define GB_PUN(type,value) (*((type *) (&(value))))
//
// which allows a value to be interpreted as another type, but with no
// typecasting.  The value parameter must be an lvalue, not an expression.

#pragma once

template <typename T> __device__ void GB_cuda_atomic_write (T *ptr, T val) ;
template <typename T> __device__ void GB_cuda_atomic_add   (T *ptr, T val) ;
template <typename T> __device__ void GB_cuda_atomic_times (T *ptr, T val) ;
template <typename T> __device__ void GB_cuda_atomic_min   (T *ptr, T val) ;
template <typename T> __device__ void GB_cuda_atomic_max   (T *ptr, T val) ;
template <typename T> __device__ void GB_cuda_atomic_bor   (T *ptr, T val) ;
template <typename T> __device__ void GB_cuda_atomic_band  (T *ptr, T val) ;
template <typename T> __device__ void GB_cuda_atomic_bxor  (T *ptr, T val) ;
template <typename T> __device__ void GB_cuda_atomic_bxnor (T *ptr, T val) ;

__device__ __inline__ void GB_cuda_lock   (uint32_t *mutex) ;
__device__ __inline__ void GB_cuda_unlock (uint32_t *mutex) ;

//------------------------------------------------------------------------------
// GB_cuda_atomic_write
//------------------------------------------------------------------------------

// atomic write (32 and 64 bits)
// no atomic write for GxB_FC64_t

template<> __device__ __inline__ void GB_cuda_atomic_write
(
    int32_t *ptr,           // target to modify
    int32_t val             // value to modify the target with
)
{
    // native CUDA method
    atomicExch ((int *) ptr, (int) val) ;
}

template<> __device__ __inline__ void GB_cuda_atomic_write
(
    uint32_t *ptr,          // target to modify
    uint32_t val            // value to modify the target with
)
{
    // native CUDA method
    atomicExch ((unsigned int *) ptr, (unsigned int) val) ;
}

template<> __device__ __inline__ void GB_cuda_atomic_write
(
    int64_t *ptr,           // target to modify
    int64_t val             // value to modify the target with
)
{
    // reinterpret the bits and use the native unsigned 64-bit method
    unsigned long long int *p = (unsigned long long int *) ptr ;
    unsigned long long int v = GB_PUN (unsigned long long int, val) ;
    atomicExch (p, v) ;
}

template<> __device__ __inline__ void GB_cuda_atomic_write
(
    uint64_t *ptr,          // target to modify
    uint64_t val            // value to modify the target with
)
{
    // native CUDA method
    atomicExch ((unsigned long long int *) ptr,
                (unsigned long long int) val) ;
}

template<> __device__ __inline__ void GB_cuda_atomic_write
(
    float *ptr,             // target to modify
    float val               // value to modify the target with
)
{
    // native CUDA method
    atomicExch (ptr, val) ;
}

template<> __device__ __inline__ void GB_cuda_atomic_write
(
    double *ptr,            // target to modify
    double val              // value to modify the target with
)
{
    // reinterpret the bits and use the native unsigned 64-bit method
    unsigned long long int *p = (unsigned long long int *) ptr ;
    unsigned long long int v = GB_PUN (unsigned long long int, val) ;
    atomicExch (p, v) ;
}

#if 0
template<> __device__ __inline__ void GB_cuda_atomic_write
(
    GxB_FC32_t *ptr,        // target to modify
    GxB_FC32_t val          // value to modify the target with
)
{
    // reinterpret the 8-byte complex value and use the 64-bit method
    unsigned long long int *p = (unsigned long long int *) ptr ;
    unsigned long long int v = GB_PUN (unsigned long long int, val) ;
    atomicExch (p, v) ;
}
#endif
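
//------------------------------------------------------------------------------
// example: GB_cuda_atomic_write usage (illustration only)
//------------------------------------------------------------------------------

// A minimal sketch of how these methods might be called from a kernel; the
// kernel below is hypothetical and not part of GraphBLAS.  Every thread
// writes the full 8-byte value at once, so *result always holds one of the
// x [k] values, never a torn mix of bytes from two of them:
//
//      __global__ void example_write (double *result, const double *x, int n)
//      {
//          int k = blockIdx.x * blockDim.x + threadIdx.x ;
//          if (k < n)
//          {
//              GB_cuda_atomic_write <double> (result, x [k]) ;
//          }
//      }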
//------------------------------------------------------------------------------
// GB_cuda_atomic_add for built-in types
//------------------------------------------------------------------------------

// types: int and uint [32,64], float, double, GxB_FC32_t, and GxB_FC64_t

template<> __device__ __inline__ void GB_cuda_atomic_add
(
    int32_t *ptr,           // target to modify
    int32_t val             // value to modify the target with
)
{
    // native CUDA method
    atomicAdd ((int *) ptr, (int) val) ;
}

template<> __device__ __inline__ void GB_cuda_atomic_add
(
    uint32_t *ptr,          // target to modify
    uint32_t val            // value to modify the target with
)
{
    // native CUDA method
    atomicAdd ((unsigned int *) ptr, (unsigned int) val) ;
}

template<> __device__ __inline__ void GB_cuda_atomic_add
(
    int64_t *ptr,           // target to modify
    int64_t val             // value to modify the target with
)
{
    // reinterpret the bits and use the native unsigned 64-bit method;
    // two's-complement addition gives the same bit pattern either way
    unsigned long long int *p = (unsigned long long int *) ptr ;
    unsigned long long int v = GB_PUN (unsigned long long int, val) ;
    atomicAdd (p, v) ;
}

template<> __device__ __inline__ void GB_cuda_atomic_add
(
    uint64_t *ptr,          // target to modify
    uint64_t val            // value to modify the target with
)
{
    // native CUDA method
    atomicAdd ((unsigned long long int *) ptr,
               (unsigned long long int) val) ;
}

template<> __device__ __inline__ void GB_cuda_atomic_add
(
    float *ptr,             // target to modify
    float val               // value to modify the target with
)
{
    // native CUDA method
    atomicAdd (ptr, val) ;
}

template<> __device__ __inline__ void GB_cuda_atomic_add
(
    double *ptr,            // target to modify
    double val              // value to modify the target with
)
{
    // native CUDA method
    atomicAdd (ptr, val) ;
}

#if 0
template<> __device__ __inline__ void GB_cuda_atomic_add
(
    GxB_FC32_t *ptr,        // target to modify
    GxB_FC32_t val          // value to modify the target with
)
{
    // native CUDA method on each float, real and imaginary parts
    float *p = (float *) ptr ;
    atomicAdd (p  , GB_crealf (val)) ;
    atomicAdd (p+1, GB_cimagf (val)) ;
}

template<> __device__ __inline__ void GB_cuda_atomic_add
(
    GxB_FC64_t *ptr,        // target to modify
    GxB_FC64_t val          // value to modify the target with
)
{
    // native CUDA method on each double, real and imaginary parts
    double *p = (double *) ptr ;
    atomicAdd (p  , GB_creal (val)) ;
    atomicAdd (p+1, GB_cimag (val)) ;
}
#endif
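
//------------------------------------------------------------------------------
// example: GB_cuda_atomic_add usage (illustration only)
//------------------------------------------------------------------------------

// A minimal sketch (hypothetical kernel, not part of GraphBLAS) of a sum
// reduction with GB_cuda_atomic_add; *total must be initialized to zero on
// the host before the kernel launch:
//
//      __global__ void example_sum (float *total, const float *x, int n)
//      {
//          int k = blockIdx.x * blockDim.x + threadIdx.x ;
//          if (k < n)
//          {
//              GB_cuda_atomic_add <float> (total, x [k]) ;
//          }
//      }
//
// Floating-point addition is not associative, so the result can differ
// slightly from run to run, depending on the order in which the atomics land.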
//------------------------------------------------------------------------------
// GB_cuda_atomic_times for built-in types
//------------------------------------------------------------------------------

// types: int and uint [32,64], float, double, and GxB_FC32_t
// (no GxB_FC64_t)

template<> __device__ __inline__ void GB_cuda_atomic_times
(
    int32_t *ptr,           // target to modify
    int32_t val             // value to modify the target with
)
{
    int *p = (int *) ptr ;
    int assumed ;
    int old = *p ;
    do
    {
        // assume the old value
        assumed = old ;
        // compute the new value
        int32_t new_value = GB_PUN (int32_t, assumed) * val ;
        // modify it atomically:
        old = atomicCAS (p, assumed, GB_PUN (int, new_value)) ;
    }
    while (assumed != old) ;
}

template<> __device__ __inline__ void GB_cuda_atomic_times
(
    uint32_t *ptr,          // target to modify
    uint32_t val            // value to modify the target with
)
{
    unsigned int *p = (unsigned int *) ptr ;
    unsigned int v = (unsigned int) val ;
    unsigned int assumed ;
    unsigned int old = *p ;
    do
    {
        // assume the old value
        assumed = old ;
        // modify it atomically:
        old = atomicCAS (p, assumed, assumed * v) ;
    }
    while (assumed != old) ;
}

template<> __device__ __inline__ void GB_cuda_atomic_times
(
    int64_t *ptr,           // target to modify
    int64_t val             // value to modify the target with
)
{
    unsigned long long int *p = (unsigned long long int *) ptr ;
    unsigned long long int assumed ;
    unsigned long long int old = *p ;
    do
    {
        // assume the old value
        assumed = old ;
        // compute the new value
        int64_t new_value = GB_PUN (int64_t, assumed) * val ;
        // modify it atomically:
        old = atomicCAS (p, assumed,
            GB_PUN (unsigned long long int, new_value)) ;
    }
    while (assumed != old) ;
}

template<> __device__ __inline__ void GB_cuda_atomic_times
(
    uint64_t *ptr,          // target to modify
    uint64_t val            // value to modify the target with
)
{
    unsigned long long int *p = (unsigned long long int *) ptr ;
    unsigned long long int v = (unsigned long long int) val ;
    unsigned long long int assumed ;
    unsigned long long int old = *p ;
    do
    {
        // assume the old value
        assumed = old ;
        // modify it atomically:
        old = atomicCAS (p, assumed, assumed * v) ;
    }
    while (assumed != old) ;
}

template<> __device__ __inline__ void GB_cuda_atomic_times
(
    float *ptr,             // target to modify
    float val               // value to modify the target with
)
{
    unsigned int *p = (unsigned int *) ptr ;
    unsigned int assumed ;
    unsigned int old = *p ;
    do
    {
        // assume the old value
        assumed = old ;
        // compute the new value
        float new_value = GB_PUN (float, assumed) * val ;
        // modify it atomically:
        old = atomicCAS (p, assumed, GB_PUN (unsigned int, new_value)) ;
    }
    while (assumed != old) ;
}

template<> __device__ __inline__ void GB_cuda_atomic_times
(
    double *ptr,            // target to modify
    double val              // value to modify the target with
)
{
    unsigned long long int *p = (unsigned long long int *) ptr ;
    unsigned long long int assumed ;
    unsigned long long int old = *p ;
    do
    {
        // assume the old value
        assumed = old ;
        // compute the new value
        double new_value = GB_PUN (double, assumed) * val ;
        // modify it atomically:
        old = atomicCAS (p, assumed,
            GB_PUN (unsigned long long int, new_value)) ;
    }
    while (assumed != old) ;
}

#if 0
template<> __device__ __inline__ void GB_cuda_atomic_times
(
    GxB_FC32_t *ptr,        // target to modify
    GxB_FC32_t val          // value to modify the target with
)
{
    unsigned long long int *p = (unsigned long long int *) ptr ;
    unsigned long long int assumed ;
    unsigned long long int old = *p ;
    do
    {
        // assume the old value
        assumed = old ;
        // compute the new value
        GxB_FC32_t new_value = GB_PUN (GxB_FC32_t, assumed) * val ;
        // modify it atomically:
        old = atomicCAS (p, assumed,
            GB_PUN (unsigned long long int, new_value)) ;
    }
    while (assumed != old) ;
}
#endif
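
//------------------------------------------------------------------------------
// note: the compare-and-swap loop pattern
//------------------------------------------------------------------------------

// Each GB_cuda_atomic_times method above (and the float/double min/max and
// the bxnor methods below) follows the same compare-and-swap loop, since
// CUDA has no native atomic for these operators.  Schematically, for an
// operator OP on a type T punned to a CUDA atomic type C of the same size:
//
//      C *p = (C *) ptr ;
//      C assumed ;
//      C old = *p ;
//      do
//      {
//          // assume the old value
//          assumed = old ;
//          // compute the new value
//          T new_value = OP (GB_PUN (T, assumed), val) ;
//          // swap it in, but only if *p still equals assumed
//          old = atomicCAS (p, assumed, GB_PUN (C, new_value)) ;
//      }
//      while (assumed != old) ;
//
// If another thread updates *p between the read and the atomicCAS, the CAS
// fails and returns the current contents of *p, and the loop retries with
// that fresh value, so no update is ever lost.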
//------------------------------------------------------------------------------
// GB_cuda_atomic_min
//------------------------------------------------------------------------------

// types: int and uint [32,64], float, and double
// no complex types

template<> __device__ __inline__ void GB_cuda_atomic_min
(
    int32_t *ptr,           // target to modify
    int32_t val             // value to modify the target with
)
{
    // native CUDA method
    atomicMin ((int *) ptr, (int) val) ;
}

template<> __device__ __inline__ void GB_cuda_atomic_min
(
    uint32_t *ptr,          // target to modify
    uint32_t val            // value to modify the target with
)
{
    // native CUDA method
    atomicMin ((unsigned int *) ptr, (unsigned int) val) ;
}

template<> __device__ __inline__ void GB_cuda_atomic_min
(
    int64_t *ptr,           // target to modify
    int64_t val             // value to modify the target with
)
{
    // native CUDA method
    atomicMin ((long long int *) ptr, (long long int) val) ;
}

template<> __device__ __inline__ void GB_cuda_atomic_min
(
    uint64_t *ptr,          // target to modify
    uint64_t val            // value to modify the target with
)
{
    // native CUDA method
    atomicMin ((unsigned long long int *) ptr,
               (unsigned long long int) val) ;
}

template<> __device__ __inline__ void GB_cuda_atomic_min
(
    float *ptr,             // target to modify
    float val               // value to modify the target with
)
{
    unsigned int *p = (unsigned int *) ptr ;
    unsigned int assumed ;
    unsigned int old = *p ;
    do
    {
        // assume the old value
        assumed = old ;
        // compute the new value
        float new_value = fminf (GB_PUN (float, assumed), val) ;
        // modify it atomically:
        old = atomicCAS (p, assumed, GB_PUN (unsigned int, new_value)) ;
    }
    while (assumed != old) ;
}

template<> __device__ __inline__ void GB_cuda_atomic_min
(
    double *ptr,            // target to modify
    double val              // value to modify the target with
)
{
    unsigned long long int *p = (unsigned long long int *) ptr ;
    unsigned long long int assumed ;
    unsigned long long int old = *p ;
    do
    {
        // assume the old value
        assumed = old ;
        // compute the new value
        double new_value = fmin (GB_PUN (double, assumed), val) ;
        // modify it atomically:
        old = atomicCAS (p, assumed,
            GB_PUN (unsigned long long int, new_value)) ;
    }
    while (assumed != old) ;
}
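
//------------------------------------------------------------------------------
// example: GB_cuda_atomic_min usage (illustration only)
//------------------------------------------------------------------------------

// A minimal sketch (hypothetical kernel, not part of GraphBLAS) of a min
// reduction; *lo must be initialized to the monoid identity (here +INFINITY)
// on the host before the kernel launch:
//
//      __global__ void example_min (double *lo, const double *x, int n)
//      {
//          int k = blockIdx.x * blockDim.x + threadIdx.x ;
//          if (k < n)
//          {
//              GB_cuda_atomic_min <double> (lo, x [k]) ;
//          }
//      }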
//------------------------------------------------------------------------------
// GB_cuda_atomic_max
//------------------------------------------------------------------------------

// types: int and uint [32,64], float, and double
// no complex types

template<> __device__ __inline__ void GB_cuda_atomic_max
(
    int32_t *ptr,           // target to modify
    int32_t val             // value to modify the target with
)
{
    // native CUDA method
    atomicMax ((int *) ptr, (int) val) ;
}

template<> __device__ __inline__ void GB_cuda_atomic_max
(
    uint32_t *ptr,          // target to modify
    uint32_t val            // value to modify the target with
)
{
    // native CUDA method
    atomicMax ((unsigned int *) ptr, (unsigned int) val) ;
}

template<> __device__ __inline__ void GB_cuda_atomic_max
(
    int64_t *ptr,           // target to modify
    int64_t val             // value to modify the target with
)
{
    // native CUDA method
    atomicMax ((long long int *) ptr, (long long int) val) ;
}

template<> __device__ __inline__ void GB_cuda_atomic_max
(
    uint64_t *ptr,          // target to modify
    uint64_t val            // value to modify the target with
)
{
    // native CUDA method
    atomicMax ((unsigned long long int *) ptr,
               (unsigned long long int) val) ;
}

template<> __device__ __inline__ void GB_cuda_atomic_max
(
    float *ptr,             // target to modify
    float val               // value to modify the target with
)
{
    unsigned int *p = (unsigned int *) ptr ;
    unsigned int assumed ;
    unsigned int old = *p ;
    do
    {
        // assume the old value
        assumed = old ;
        // compute the new value
        float new_value = fmaxf (GB_PUN (float, assumed), val) ;
        // modify it atomically:
        old = atomicCAS (p, assumed, GB_PUN (unsigned int, new_value)) ;
    }
    while (assumed != old) ;
}

template<> __device__ __inline__ void GB_cuda_atomic_max
(
    double *ptr,            // target to modify
    double val              // value to modify the target with
)
{
    unsigned long long int *p = (unsigned long long int *) ptr ;
    unsigned long long int assumed ;
    unsigned long long int old = *p ;
    do
    {
        // assume the old value
        assumed = old ;
        // compute the new value
        double new_value = fmax (GB_PUN (double, assumed), val) ;
        // modify it atomically:
        old = atomicCAS (p, assumed,
            GB_PUN (unsigned long long int, new_value)) ;
    }
    while (assumed != old) ;
}

//------------------------------------------------------------------------------
// GB_cuda_atomic_bor
//------------------------------------------------------------------------------

// bitwise: on uint [32,64]

template<> __device__ __inline__ void GB_cuda_atomic_bor
(
    uint32_t *ptr,          // target to modify
    uint32_t val            // value to modify the target with
)
{
    // native CUDA method
    atomicOr ((unsigned int *) ptr, (unsigned int) val) ;
}

template<> __device__ __inline__ void GB_cuda_atomic_bor
(
    uint64_t *ptr,          // target to modify
    uint64_t val            // value to modify the target with
)
{
    // native CUDA method
    atomicOr ((unsigned long long int *) ptr,
              (unsigned long long int) val) ;
}

//------------------------------------------------------------------------------
// GB_cuda_atomic_band
//------------------------------------------------------------------------------

// bitwise: on uint [32,64]

template<> __device__ __inline__ void GB_cuda_atomic_band
(
    uint32_t *ptr,          // target to modify
    uint32_t val            // value to modify the target with
)
{
    // native CUDA method
    atomicAnd ((unsigned int *) ptr, (unsigned int) val) ;
}

template<> __device__ __inline__ void GB_cuda_atomic_band
(
    uint64_t *ptr,          // target to modify
    uint64_t val            // value to modify the target with
)
{
    // native CUDA method
    atomicAnd ((unsigned long long int *) ptr,
               (unsigned long long int) val) ;
}

//------------------------------------------------------------------------------
// GB_cuda_atomic_bxor
//------------------------------------------------------------------------------

// bitwise: on uint [32,64]

template<> __device__ __inline__ void GB_cuda_atomic_bxor
(
    uint32_t *ptr,          // target to modify
    uint32_t val            // value to modify the target with
)
{
    // native CUDA method
    atomicXor ((unsigned int *) ptr, (unsigned int) val) ;
}

template<> __device__ __inline__ void GB_cuda_atomic_bxor
(
    uint64_t *ptr,          // target to modify
    uint64_t val            // value to modify the target with
)
{
    // native CUDA method
    atomicXor ((unsigned long long int *) ptr,
               (unsigned long long int) val) ;
}

//------------------------------------------------------------------------------
// GB_cuda_atomic_bxnor
//------------------------------------------------------------------------------

// bitwise: on uint [32,64]

template<> __device__ __inline__ void GB_cuda_atomic_bxnor
(
    uint32_t *ptr,          // target to modify
    uint32_t val            // value to modify the target with
)
{
    unsigned int *p = (unsigned int *) ptr ;
    unsigned int v = (unsigned int) val ;
    unsigned int assumed ;
    unsigned int old = *p ;
    do
    {
        // assume the old value
        assumed = old ;
        // modify it atomically:
        old = atomicCAS (p, assumed, ~(assumed ^ v)) ;
    }
    while (assumed != old) ;
}

template<> __device__ __inline__ void GB_cuda_atomic_bxnor
(
    uint64_t *ptr,          // target to modify
    uint64_t val            // value to modify the target with
)
{
    unsigned long long int *p = (unsigned long long int *) ptr ;
    unsigned long long int v = (unsigned long long int) val ;
    unsigned long long int assumed ;
    unsigned long long int old = *p ;
    do
    {
        // assume the old value
        assumed = old ;
        // modify it atomically:
        old = atomicCAS (p, assumed, ~(assumed ^ v)) ;
    }
    while (assumed != old) ;
}
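
//------------------------------------------------------------------------------
// example: GB_cuda_atomic_bor usage (illustration only)
//------------------------------------------------------------------------------

// A minimal sketch (hypothetical kernel, not part of GraphBLAS): marking
// entries in a packed bitmap, where bit j lives at bit (j % 32) of word
// (j / 32).  The bitmap must be cleared on the host before the launch:
//
//      __global__ void example_setbits (uint32_t *bitmap, const int *j, int n)
//      {
//          int k = blockIdx.x * blockDim.x + threadIdx.x ;
//          if (k < n)
//          {
//              GB_cuda_atomic_bor <uint32_t>
//                  (&bitmap [j [k] / 32], 1u << (j [k] % 32)) ;
//          }
//      }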
//------------------------------------------------------------------------------
// GB_cuda_lock/unlock: set/clear a mutex for a critical section
//------------------------------------------------------------------------------

__device__ __inline__ void GB_cuda_lock (uint32_t *mutex)
{
    // spin until the lock is obtained: atomicCAS writes 1 to the mutex and
    // returns its prior value; the lock is held once that value was 0
    uint32_t old ;
    do
    {
        old = atomicCAS (mutex, 0, 1) ;
    }
    while (old == 1) ;
}

__device__ __inline__ void GB_cuda_unlock (uint32_t *mutex)
{
    // release the lock by writing 0 to the mutex; the caller must hold the
    // lock, so the loop exits as soon as the prior value 1 is observed
    uint32_t old ;
    do
    {
        old = atomicCAS (mutex, 1, 0) ;
    }
    while (old == 0) ;
}
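
//------------------------------------------------------------------------------
// example: GB_cuda_lock/unlock usage (illustration only)
//------------------------------------------------------------------------------

// A minimal sketch (hypothetical device function, not part of GraphBLAS) of
// a critical section guarded by the mutex; *mutex must be initialized to
// zero before the kernel launch, and only the thread that holds the lock
// may call GB_cuda_unlock:
//
//      __device__ void example_critical (uint32_t *mutex, double *x)
//      {
//          GB_cuda_lock (mutex) ;
//          // at most one thread at a time executes this section
//          (*x)++ ;
//          GB_cuda_unlock (mutex) ;
//      }
//
// Note that if threads in the same warp contend for the lock, pre-Volta
// architectures can deadlock; on Volta and later, independent thread
// scheduling allows this spin lock to make progress.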