/* Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #pragma once #ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_FP16_H #define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_FP16_H #if defined(__HIPCC_RTC__) #define __HOST_DEVICE__ __device__ #else #define __HOST_DEVICE__ __host__ __device__ #include #include "hip/amd_detail/host_defines.h" #include #if defined(__cplusplus) #include #include #include #endif #endif // !defined(__HIPCC_RTC__) #if defined(__clang__) && defined(__HIP__) typedef _Float16 _Float16_2 __attribute__((ext_vector_type(2))); struct __half_raw { union { static_assert(sizeof(_Float16) == sizeof(unsigned short), ""); _Float16 data; unsigned short x; }; }; struct __half2_raw { union { static_assert(sizeof(_Float16_2) == sizeof(unsigned short[2]), ""); struct { __half_raw x; __half_raw y; }; _Float16_2 data; }; }; #if defined(__cplusplus) #if !defined(__HIPCC_RTC__) #include "hip_fp16_math_fwd.h" #include "amd_hip_vector_types.h" #include "host_defines.h" #include "amd_device_functions.h" #include "amd_warp_functions.h" #endif namespace std { template<> struct is_floating_point<_Float16> : std::true_type {}; } template using Enable_if_t = typename std::enable_if::type; // BEGIN STRUCT __HALF struct __half { protected: union { static_assert(sizeof(_Float16) == sizeof(unsigned short), ""); _Float16 data; unsigned short __x; }; public: // CREATORS __HOST_DEVICE__ __half() = default; __HOST_DEVICE__ __half(const __half_raw& x) : data{x.data} {} #if !defined(__HIP_NO_HALF_CONVERSIONS__) __HOST_DEVICE__ __half(decltype(data) x) : data{x} {} template< typename T, Enable_if_t{}>* = nullptr> __HOST_DEVICE__ __half(T x) : data{static_cast<_Float16>(x)} {} #endif __HOST_DEVICE__ __half(const __half&) = default; __HOST_DEVICE__ __half(__half&&) = default; __HOST_DEVICE__ ~__half() = default; // CREATORS - DEVICE ONLY #if !defined(__HIP_NO_HALF_CONVERSIONS__) template< typename T, Enable_if_t{}>* = nullptr> __HOST_DEVICE__ __half(T x) : data{static_cast<_Float16>(x)} {} #endif // MANIPULATORS __HOST_DEVICE__ __half& operator=(const __half&) = default; __HOST_DEVICE__ __half& operator=(__half&&) = default; __HOST_DEVICE__ __half& operator=(const __half_raw& x) { data = x.data; return *this; } __HOST_DEVICE__ volatile __half& operator=(const __half_raw& x) volatile { data = x.data; return *this; } volatile __half& operator=(const volatile __half_raw& x) volatile { data = x.data; return *this; } __half& operator=(__half_raw&& x) { data = x.data; return *this; } volatile __half& operator=(__half_raw&& x) volatile { data = x.data; return *this; } volatile __half& operator=(volatile __half_raw&& x) volatile { data = x.data; return *this; } #if !defined(__HIP_NO_HALF_CONVERSIONS__) template< typename T, Enable_if_t{}>* = nullptr> __HOST_DEVICE__ __half& operator=(T x) { data = static_cast<_Float16>(x); return *this; } #endif // MANIPULATORS - DEVICE ONLY #if !defined(__HIP_NO_HALF_CONVERSIONS__) template< typename T, Enable_if_t{}>* = nullptr> __device__ __half& operator=(T x) { data = static_cast<_Float16>(x); return *this; } #endif #if !defined(__HIP_NO_HALF_OPERATORS__) __device__ __half& operator+=(const __half& x) { data += x.data; return *this; } __device__ __half& operator-=(const __half& x) { data -= x.data; return *this; } __device__ __half& operator*=(const __half& x) { data *= x.data; return *this; } __device__ __half& operator/=(const __half& x) { data /= x.data; return *this; } __device__ __half& operator++() { ++data; return *this; } __device__ __half operator++(int) { __half tmp{*this}; ++*this; return tmp; } __device__ __half& operator--() { --data; return *this; } __device__ __half operator--(int) { __half tmp{*this}; --*this; return tmp; } #endif // ACCESSORS #if !defined(__HIP_NO_HALF_CONVERSIONS__) template< typename T, Enable_if_t{}>* = nullptr> __HOST_DEVICE__ operator T() const { return data; } #endif __HOST_DEVICE__ operator __half_raw() const { return __half_raw{data}; } __HOST_DEVICE__ operator __half_raw() const volatile { return __half_raw{data}; } #if !defined(__HIP_NO_HALF_CONVERSIONS__) template< typename T, Enable_if_t{}>* = nullptr> __HOST_DEVICE__ operator T() const { return data; } #endif #if !defined(__HIP_NO_HALF_OPERATORS__) __device__ __half operator+() const { return *this; } __device__ __half operator-() const { __half tmp{*this}; tmp.data = -tmp.data; return tmp; } #endif // FRIENDS #if !defined(__HIP_NO_HALF_OPERATORS__) friend inline __device__ __half operator+(const __half& x, const __half& y) { return __half{x} += y; } friend inline __device__ __half operator-(const __half& x, const __half& y) { return __half{x} -= y; } friend inline __device__ __half operator*(const __half& x, const __half& y) { return __half{x} *= y; } friend inline __device__ __half operator/(const __half& x, const __half& y) { return __half{x} /= y; } friend inline __device__ bool operator==(const __half& x, const __half& y) { return x.data == y.data; } friend inline __device__ bool operator!=(const __half& x, const __half& y) { return !(x == y); } friend inline __device__ bool operator<(const __half& x, const __half& y) { return x.data < y.data; } friend inline __device__ bool operator>(const __half& x, const __half& y) { return y.data < x.data; } friend inline __device__ bool operator<=(const __half& x, const __half& y) { return !(y < x); } friend inline __device__ bool operator>=(const __half& x, const __half& y) { return !(x < y); } #endif // !defined(__HIP_NO_HALF_OPERATORS__) }; // END STRUCT __HALF // BEGIN STRUCT __HALF2 struct __half2 { public: union { static_assert( sizeof(_Float16_2) == sizeof(unsigned short[2]), ""); struct { __half x; __half y; }; _Float16_2 data; }; // CREATORS __HOST_DEVICE__ __half2() = default; __HOST_DEVICE__ __half2(const __half2_raw& xx) : data{xx.data} {} __HOST_DEVICE__ __half2(decltype(data) xx) : data{xx} {} __HOST_DEVICE__ __half2(const __half& xx, const __half& yy) : data{static_cast<__half_raw>(xx).data, static_cast<__half_raw>(yy).data} {} __HOST_DEVICE__ __half2(const __half2&) = default; __HOST_DEVICE__ __half2(__half2&&) = default; __HOST_DEVICE__ ~__half2() = default; // MANIPULATORS __HOST_DEVICE__ __half2& operator=(const __half2&) = default; __HOST_DEVICE__ __half2& operator=(__half2&&) = default; __HOST_DEVICE__ __half2& operator=(const __half2_raw& xx) { data = xx.data; return *this; } // MANIPULATORS - DEVICE ONLY #if !defined(__HIP_NO_HALF_OPERATORS__) __device__ __half2& operator+=(const __half2& xx) { data += xx.data; return *this; } __device__ __half2& operator-=(const __half2& xx) { data -= xx.data; return *this; } __device__ __half2& operator*=(const __half2& xx) { data *= xx.data; return *this; } __device__ __half2& operator/=(const __half2& xx) { data /= xx.data; return *this; } __device__ __half2& operator++() { return *this += _Float16_2{1, 1}; } __device__ __half2 operator++(int) { __half2 tmp{*this}; ++*this; return tmp; } __device__ __half2& operator--() { return *this -= _Float16_2{1, 1}; } __device__ __half2 operator--(int) { __half2 tmp{*this}; --*this; return tmp; } #endif // ACCESSORS __HOST_DEVICE__ operator decltype(data)() const { return data; } __HOST_DEVICE__ operator __half2_raw() const { __half2_raw r; r.data = data; return r; } // ACCESSORS - DEVICE ONLY #if !defined(__HIP_NO_HALF_OPERATORS__) __device__ __half2 operator+() const { return *this; } __device__ __half2 operator-() const { __half2 tmp{*this}; tmp.data = -tmp.data; return tmp; } #endif // FRIENDS #if !defined(__HIP_NO_HALF_OPERATORS__) friend inline __device__ __half2 operator+(const __half2& xx, const __half2& yy) { return __half2{xx} += yy; } friend inline __device__ __half2 operator-(const __half2& xx, const __half2& yy) { return __half2{xx} -= yy; } friend inline __device__ __half2 operator*(const __half2& xx, const __half2& yy) { return __half2{xx} *= yy; } friend inline __device__ __half2 operator/(const __half2& xx, const __half2& yy) { return __half2{xx} /= yy; } friend inline __device__ bool operator==(const __half2& xx, const __half2& yy) { auto r = xx.data == yy.data; return r.x != 0 && r.y != 0; } friend inline __device__ bool operator!=(const __half2& xx, const __half2& yy) { return !(xx == yy); } friend inline __device__ bool operator<(const __half2& xx, const __half2& yy) { auto r = xx.data < yy.data; return r.x != 0 && r.y != 0; } friend inline __device__ bool operator>(const __half2& xx, const __half2& yy) { return yy < xx; } friend inline __device__ bool operator<=(const __half2& xx, const __half2& yy) { return !(yy < xx); } friend inline __device__ bool operator>=(const __half2& xx, const __half2& yy) { return !(xx < yy); } #endif // !defined(__HIP_NO_HALF_OPERATORS__) }; // END STRUCT __HALF2 namespace { inline __HOST_DEVICE__ __half2 make_half2(__half x, __half y) { return __half2{x, y}; } inline __HOST_DEVICE__ __half __low2half(__half2 x) { return __half{__half_raw{static_cast<__half2_raw>(x).data.x}}; } inline __HOST_DEVICE__ __half __high2half(__half2 x) { return __half{__half_raw{static_cast<__half2_raw>(x).data.y}}; } inline __HOST_DEVICE__ __half2 __half2half2(__half x) { return __half2{x, x}; } inline __HOST_DEVICE__ __half2 __halves2half2(__half x, __half y) { return __half2{x, y}; } inline __HOST_DEVICE__ __half2 __low2half2(__half2 x) { return __half2{ _Float16_2{ static_cast<__half2_raw>(x).data.x, static_cast<__half2_raw>(x).data.x}}; } inline __HOST_DEVICE__ __half2 __high2half2(__half2 x) { return __half2{ _Float16_2{ static_cast<__half2_raw>(x).data.y, static_cast<__half2_raw>(x).data.y}}; } inline __HOST_DEVICE__ __half2 __lows2half2(__half2 x, __half2 y) { return __half2{ _Float16_2{ static_cast<__half2_raw>(x).data.x, static_cast<__half2_raw>(y).data.x}}; } inline __HOST_DEVICE__ __half2 __highs2half2(__half2 x, __half2 y) { return __half2{ _Float16_2{ static_cast<__half2_raw>(x).data.y, static_cast<__half2_raw>(y).data.y}}; } inline __HOST_DEVICE__ __half2 __lowhigh2highlow(__half2 x) { return __half2{ _Float16_2{ static_cast<__half2_raw>(x).data.y, static_cast<__half2_raw>(x).data.x}}; } // Bitcasts inline __device__ short __half_as_short(__half x) { return static_cast<__half_raw>(x).x; } inline __device__ unsigned short __half_as_ushort(__half x) { return static_cast<__half_raw>(x).x; } inline __device__ __half __short_as_half(short x) { __half_raw r; r.x = x; return r; } inline __device__ __half __ushort_as_half(unsigned short x) { __half_raw r; r.x = x; return r; } // float -> half | half2 inline __HOST_DEVICE__ __half __float2half(float x) { return __half_raw{static_cast<_Float16>(x)}; } inline __HOST_DEVICE__ __half __float2half_rn(float x) { return __half_raw{static_cast<_Float16>(x)}; } #if !defined(__HIPCC_RTC__) // TODO: rounding behaviour is not correct for host functions. inline __host__ __half __float2half_rz(float x) { return __half_raw{static_cast<_Float16>(x)}; } inline __host__ __half __float2half_rd(float x) { return __half_raw{static_cast<_Float16>(x)}; } inline __host__ __half __float2half_ru(float x) { return __half_raw{static_cast<_Float16>(x)}; } #endif inline __device__ __half __float2half_rz(float x) { return __half_raw{__ocml_cvtrtz_f16_f32(x)}; } inline __device__ __half __float2half_rd(float x) { return __half_raw{__ocml_cvtrtn_f16_f32(x)}; } inline __device__ __half __float2half_ru(float x) { return __half_raw{__ocml_cvtrtp_f16_f32(x)}; } inline __HOST_DEVICE__ __half2 __float2half2_rn(float x) { return __half2{ _Float16_2{ static_cast<_Float16>(x), static_cast<_Float16>(x)}}; } inline __HOST_DEVICE__ __half2 __floats2half2_rn(float x, float y) { return __half2{_Float16_2{ static_cast<_Float16>(x), static_cast<_Float16>(y)}}; } inline __HOST_DEVICE__ __half2 __float22half2_rn(float2 x) { return __floats2half2_rn(x.x, x.y); } // half | half2 -> float inline __HOST_DEVICE__ float __half2float(__half x) { return static_cast<__half_raw>(x).data; } inline __HOST_DEVICE__ float __low2float(__half2 x) { return static_cast<__half2_raw>(x).data.x; } inline __HOST_DEVICE__ float __high2float(__half2 x) { return static_cast<__half2_raw>(x).data.y; } inline __HOST_DEVICE__ float2 __half22float2(__half2 x) { return make_float2( static_cast<__half2_raw>(x).data.x, static_cast<__half2_raw>(x).data.y); } // half -> int inline __device__ int __half2int_rn(__half x) { return static_cast<__half_raw>(x).data; } inline __device__ int __half2int_rz(__half x) { return static_cast<__half_raw>(x).data; } inline __device__ int __half2int_rd(__half x) { return static_cast<__half_raw>(x).data; } inline __device__ int __half2int_ru(__half x) { return static_cast<__half_raw>(x).data; } // int -> half inline __device__ __half __int2half_rn(int x) { return __half_raw{static_cast<_Float16>(x)}; } inline __device__ __half __int2half_rz(int x) { return __half_raw{static_cast<_Float16>(x)}; } inline __device__ __half __int2half_rd(int x) { return __half_raw{static_cast<_Float16>(x)}; } inline __device__ __half __int2half_ru(int x) { return __half_raw{static_cast<_Float16>(x)}; } // half -> short inline __device__ short __half2short_rn(__half x) { return static_cast<__half_raw>(x).data; } inline __device__ short __half2short_rz(__half x) { return static_cast<__half_raw>(x).data; } inline __device__ short __half2short_rd(__half x) { return static_cast<__half_raw>(x).data; } inline __device__ short __half2short_ru(__half x) { return static_cast<__half_raw>(x).data; } // short -> half inline __device__ __half __short2half_rn(short x) { return __half_raw{static_cast<_Float16>(x)}; } inline __device__ __half __short2half_rz(short x) { return __half_raw{static_cast<_Float16>(x)}; } inline __device__ __half __short2half_rd(short x) { return __half_raw{static_cast<_Float16>(x)}; } inline __device__ __half __short2half_ru(short x) { return __half_raw{static_cast<_Float16>(x)}; } // half -> long long inline __device__ long long __half2ll_rn(__half x) { return static_cast<__half_raw>(x).data; } inline __device__ long long __half2ll_rz(__half x) { return static_cast<__half_raw>(x).data; } inline __device__ long long __half2ll_rd(__half x) { return static_cast<__half_raw>(x).data; } inline __device__ long long __half2ll_ru(__half x) { return static_cast<__half_raw>(x).data; } // long long -> half inline __device__ __half __ll2half_rn(long long x) { return __half_raw{static_cast<_Float16>(x)}; } inline __device__ __half __ll2half_rz(long long x) { return __half_raw{static_cast<_Float16>(x)}; } inline __device__ __half __ll2half_rd(long long x) { return __half_raw{static_cast<_Float16>(x)}; } inline __device__ __half __ll2half_ru(long long x) { return __half_raw{static_cast<_Float16>(x)}; } // half -> unsigned int inline __device__ unsigned int __half2uint_rn(__half x) { return static_cast<__half_raw>(x).data; } inline __device__ unsigned int __half2uint_rz(__half x) { return static_cast<__half_raw>(x).data; } inline __device__ unsigned int __half2uint_rd(__half x) { return static_cast<__half_raw>(x).data; } inline __device__ unsigned int __half2uint_ru(__half x) { return static_cast<__half_raw>(x).data; } // unsigned int -> half inline __device__ __half __uint2half_rn(unsigned int x) { return __half_raw{static_cast<_Float16>(x)}; } inline __device__ __half __uint2half_rz(unsigned int x) { return __half_raw{static_cast<_Float16>(x)}; } inline __device__ __half __uint2half_rd(unsigned int x) { return __half_raw{static_cast<_Float16>(x)}; } inline __device__ __half __uint2half_ru(unsigned int x) { return __half_raw{static_cast<_Float16>(x)}; } // half -> unsigned short inline __device__ unsigned short __half2ushort_rn(__half x) { return static_cast<__half_raw>(x).data; } inline __device__ unsigned short __half2ushort_rz(__half x) { return static_cast<__half_raw>(x).data; } inline __device__ unsigned short __half2ushort_rd(__half x) { return static_cast<__half_raw>(x).data; } inline __device__ unsigned short __half2ushort_ru(__half x) { return static_cast<__half_raw>(x).data; } // unsigned short -> half inline __device__ __half __ushort2half_rn(unsigned short x) { return __half_raw{static_cast<_Float16>(x)}; } inline __device__ __half __ushort2half_rz(unsigned short x) { return __half_raw{static_cast<_Float16>(x)}; } inline __device__ __half __ushort2half_rd(unsigned short x) { return __half_raw{static_cast<_Float16>(x)}; } inline __device__ __half __ushort2half_ru(unsigned short x) { return __half_raw{static_cast<_Float16>(x)}; } // half -> unsigned long long inline __device__ unsigned long long __half2ull_rn(__half x) { return static_cast<__half_raw>(x).data; } inline __device__ unsigned long long __half2ull_rz(__half x) { return static_cast<__half_raw>(x).data; } inline __device__ unsigned long long __half2ull_rd(__half x) { return static_cast<__half_raw>(x).data; } inline __device__ unsigned long long __half2ull_ru(__half x) { return static_cast<__half_raw>(x).data; } // unsigned long long -> half inline __device__ __half __ull2half_rn(unsigned long long x) { return __half_raw{static_cast<_Float16>(x)}; } inline __device__ __half __ull2half_rz(unsigned long long x) { return __half_raw{static_cast<_Float16>(x)}; } inline __device__ __half __ull2half_rd(unsigned long long x) { return __half_raw{static_cast<_Float16>(x)}; } inline __device__ __half __ull2half_ru(unsigned long long x) { return __half_raw{static_cast<_Float16>(x)}; } // Load primitives inline __device__ __half __ldg(const __half* ptr) { return *ptr; } inline __device__ __half __ldcg(const __half* ptr) { return *ptr; } inline __device__ __half __ldca(const __half* ptr) { return *ptr; } inline __device__ __half __ldcs(const __half* ptr) { return *ptr; } inline __HOST_DEVICE__ __half2 __ldg(const __half2* ptr) { return *ptr; } inline __HOST_DEVICE__ __half2 __ldcg(const __half2* ptr) { return *ptr; } inline __HOST_DEVICE__ __half2 __ldca(const __half2* ptr) { return *ptr; } inline __HOST_DEVICE__ __half2 __ldcs(const __half2* ptr) { return *ptr; } // Relations inline __device__ bool __heq(__half x, __half y) { return static_cast<__half_raw>(x).data == static_cast<__half_raw>(y).data; } inline __device__ bool __hne(__half x, __half y) { return static_cast<__half_raw>(x).data != static_cast<__half_raw>(y).data; } inline __device__ bool __hle(__half x, __half y) { return static_cast<__half_raw>(x).data <= static_cast<__half_raw>(y).data; } inline __device__ bool __hge(__half x, __half y) { return static_cast<__half_raw>(x).data >= static_cast<__half_raw>(y).data; } inline __device__ bool __hlt(__half x, __half y) { return static_cast<__half_raw>(x).data < static_cast<__half_raw>(y).data; } inline __device__ bool __hgt(__half x, __half y) { return static_cast<__half_raw>(x).data > static_cast<__half_raw>(y).data; } inline __device__ bool __hequ(__half x, __half y) { return !(static_cast<__half_raw>(x).data < static_cast<__half_raw>(y).data) && !(static_cast<__half_raw>(x).data > static_cast<__half_raw>(y).data); } inline __device__ bool __hneu(__half x, __half y) { return !(static_cast<__half_raw>(x).data == static_cast<__half_raw>(y).data); } inline __device__ bool __hleu(__half x, __half y) { return !(static_cast<__half_raw>(x).data > static_cast<__half_raw>(y).data); } inline __device__ bool __hgeu(__half x, __half y) { return !(static_cast<__half_raw>(x).data < static_cast<__half_raw>(y).data); } inline __device__ bool __hltu(__half x, __half y) { return !(static_cast<__half_raw>(x).data >= static_cast<__half_raw>(y).data); } inline __device__ bool __hgtu(__half x, __half y) { return !(static_cast<__half_raw>(x).data <= static_cast<__half_raw>(y).data); } inline __HOST_DEVICE__ __half2 __heq2(__half2 x, __half2 y) { auto r = static_cast<__half2_raw>(x).data == static_cast<__half2_raw>(y).data; return __builtin_convertvector(-r, _Float16_2); } inline __HOST_DEVICE__ __half2 __hne2(__half2 x, __half2 y) { auto r = static_cast<__half2_raw>(x).data != static_cast<__half2_raw>(y).data; return __builtin_convertvector(-r, _Float16_2); } inline __HOST_DEVICE__ __half2 __hle2(__half2 x, __half2 y) { auto r = static_cast<__half2_raw>(x).data <= static_cast<__half2_raw>(y).data; return __builtin_convertvector(-r, _Float16_2); } inline __HOST_DEVICE__ __half2 __hge2(__half2 x, __half2 y) { auto r = static_cast<__half2_raw>(x).data >= static_cast<__half2_raw>(y).data; return __builtin_convertvector(-r, _Float16_2); } inline __HOST_DEVICE__ __half2 __hlt2(__half2 x, __half2 y) { auto r = static_cast<__half2_raw>(x).data < static_cast<__half2_raw>(y).data; return __builtin_convertvector(-r, _Float16_2); } inline __HOST_DEVICE__ __half2 __hgt2(__half2 x, __half2 y) { auto r = static_cast<__half2_raw>(x).data > static_cast<__half2_raw>(y).data; return __builtin_convertvector(-r, _Float16_2); } inline __HOST_DEVICE__ __half2 __hequ2(__half2 x, __half2 y) { auto r = !(static_cast<__half2_raw>(x).data < static_cast<__half2_raw>(y).data) && !(static_cast<__half2_raw>(x).data > static_cast<__half2_raw>(y).data); return __builtin_convertvector(-r, _Float16_2); } inline __HOST_DEVICE__ __half2 __hneu2(__half2 x, __half2 y) { auto r = !(static_cast<__half2_raw>(x).data == static_cast<__half2_raw>(y).data); return __builtin_convertvector(-r, _Float16_2); } inline __HOST_DEVICE__ __half2 __hleu2(__half2 x, __half2 y) { auto r = !(static_cast<__half2_raw>(x).data > static_cast<__half2_raw>(y).data); return __builtin_convertvector(-r, _Float16_2); } inline __HOST_DEVICE__ __half2 __hgeu2(__half2 x, __half2 y) { auto r = !(static_cast<__half2_raw>(x).data < static_cast<__half2_raw>(y).data); return __builtin_convertvector(-r, _Float16_2); } inline __HOST_DEVICE__ __half2 __hltu2(__half2 x, __half2 y) { auto r = !(static_cast<__half2_raw>(x).data >= static_cast<__half2_raw>(y).data); return __builtin_convertvector(-r, _Float16_2); } inline __HOST_DEVICE__ __half2 __hgtu2(__half2 x, __half2 y) { auto r = !(static_cast<__half2_raw>(x).data <= static_cast<__half2_raw>(y).data); return __builtin_convertvector(-r, _Float16_2); } inline __HOST_DEVICE__ bool __hbeq2(__half2 x, __half2 y) { auto r = static_cast<__half2_raw>(__heq2(x, y)); return r.data.x != 0 && r.data.y != 0; } inline __HOST_DEVICE__ bool __hbne2(__half2 x, __half2 y) { auto r = static_cast<__half2_raw>(__hne2(x, y)); return r.data.x != 0 && r.data.y != 0; } inline __HOST_DEVICE__ bool __hble2(__half2 x, __half2 y) { auto r = static_cast<__half2_raw>(__hle2(x, y)); return r.data.x != 0 && r.data.y != 0; } inline __HOST_DEVICE__ bool __hbge2(__half2 x, __half2 y) { auto r = static_cast<__half2_raw>(__hge2(x, y)); return r.data.x != 0 && r.data.y != 0; } inline __HOST_DEVICE__ bool __hblt2(__half2 x, __half2 y) { auto r = static_cast<__half2_raw>(__hlt2(x, y)); return r.data.x != 0 && r.data.y != 0; } inline __HOST_DEVICE__ bool __hbgt2(__half2 x, __half2 y) { auto r = static_cast<__half2_raw>(__hgt2(x, y)); return r.data.x != 0 && r.data.y != 0; } inline __HOST_DEVICE__ bool __hbequ2(__half2 x, __half2 y) { return __hbeq2(x, y); } inline __HOST_DEVICE__ bool __hbneu2(__half2 x, __half2 y) { return __hbne2(x, y); } inline __HOST_DEVICE__ bool __hbleu2(__half2 x, __half2 y) { return __hble2(x, y); } inline __HOST_DEVICE__ bool __hbgeu2(__half2 x, __half2 y) { return __hbge2(x, y); } inline __HOST_DEVICE__ bool __hbltu2(__half2 x, __half2 y) { return __hblt2(x, y); } inline __HOST_DEVICE__ bool __hbgtu2(__half2 x, __half2 y) { return __hbgt2(x, y); } inline __device__ __half __hmax(const __half x, const __half y) { return __half_raw{__ocml_fmax_f16(static_cast<__half_raw>(x).data, static_cast<__half_raw>(y).data)}; } inline __device__ __half __hmax_nan(const __half x, const __half y) { if(__ocml_isnan_f16(static_cast<__half_raw>(x).data)) { return x; } else if (__ocml_isnan_f16(static_cast<__half_raw>(y).data)) { return y; } return __hmax(x, y); } inline __device__ __half __hmin(const __half x, const __half y) { return __half_raw{__ocml_fmin_f16(static_cast<__half_raw>(x).data, static_cast<__half_raw>(y).data)}; } inline __device__ __half __hmin_nan(const __half x, const __half y) { if(__ocml_isnan_f16(static_cast<__half_raw>(x).data)) { return x; } else if (__ocml_isnan_f16(static_cast<__half_raw>(y).data)) { return y; } return __hmin(x, y); } // Arithmetic inline __device__ __half __clamp_01(__half x) { auto r = static_cast<__half_raw>(x); if (__hlt(x, __half_raw{0})) return __half_raw{0}; if (__hlt(__half_raw{1}, x)) return __half_raw{1}; return r; } inline __device__ __half __hadd(__half x, __half y) { return __half_raw{ static_cast<__half_raw>(x).data + static_cast<__half_raw>(y).data}; } inline __device__ __half __habs(__half x) { return __half_raw{ __ocml_fabs_f16(static_cast<__half_raw>(x).data)}; } inline __device__ __half __hsub(__half x, __half y) { return __half_raw{ static_cast<__half_raw>(x).data - static_cast<__half_raw>(y).data}; } inline __device__ __half __hmul(__half x, __half y) { return __half_raw{ static_cast<__half_raw>(x).data * static_cast<__half_raw>(y).data}; } inline __device__ __half __hadd_sat(__half x, __half y) { return __clamp_01(__hadd(x, y)); } inline __device__ __half __hsub_sat(__half x, __half y) { return __clamp_01(__hsub(x, y)); } inline __device__ __half __hmul_sat(__half x, __half y) { return __clamp_01(__hmul(x, y)); } inline __device__ __half __hfma(__half x, __half y, __half z) { return __half_raw{__ocml_fma_f16( static_cast<__half_raw>(x).data, static_cast<__half_raw>(y).data, static_cast<__half_raw>(z).data)}; } inline __device__ __half __hfma_sat(__half x, __half y, __half z) { return __clamp_01(__hfma(x, y, z)); } inline __device__ __half __hdiv(__half x, __half y) { return __half_raw{ static_cast<__half_raw>(x).data / static_cast<__half_raw>(y).data}; } inline __HOST_DEVICE__ __half2 __hadd2(__half2 x, __half2 y) { return __half2{ static_cast<__half2_raw>(x).data + static_cast<__half2_raw>(y).data}; } inline __HOST_DEVICE__ __half2 __habs2(__half2 x) { return __half2{ __ocml_fabs_2f16(static_cast<__half2_raw>(x).data)}; } inline __HOST_DEVICE__ __half2 __hsub2(__half2 x, __half2 y) { return __half2{ static_cast<__half2_raw>(x).data - static_cast<__half2_raw>(y).data}; } inline __HOST_DEVICE__ __half2 __hmul2(__half2 x, __half2 y) { return __half2{ static_cast<__half2_raw>(x).data * static_cast<__half2_raw>(y).data}; } inline __HOST_DEVICE__ __half2 __hadd2_sat(__half2 x, __half2 y) { auto r = static_cast<__half2_raw>(__hadd2(x, y)); return __half2{ __clamp_01(__half_raw{r.data.x}), __clamp_01(__half_raw{r.data.y})}; } inline __HOST_DEVICE__ __half2 __hsub2_sat(__half2 x, __half2 y) { auto r = static_cast<__half2_raw>(__hsub2(x, y)); return __half2{ __clamp_01(__half_raw{r.data.x}), __clamp_01(__half_raw{r.data.y})}; } inline __HOST_DEVICE__ __half2 __hmul2_sat(__half2 x, __half2 y) { auto r = static_cast<__half2_raw>(__hmul2(x, y)); return __half2{ __clamp_01(__half_raw{r.data.x}), __clamp_01(__half_raw{r.data.y})}; } inline __HOST_DEVICE__ __half2 __hfma2(__half2 x, __half2 y, __half2 z) { return __half2{__ocml_fma_2f16(x, y, z)}; } inline __HOST_DEVICE__ __half2 __hfma2_sat(__half2 x, __half2 y, __half2 z) { auto r = static_cast<__half2_raw>(__hfma2(x, y, z)); return __half2{ __clamp_01(__half_raw{r.data.x}), __clamp_01(__half_raw{r.data.y})}; } inline __HOST_DEVICE__ __half2 __h2div(__half2 x, __half2 y) { return __half2{ static_cast<__half2_raw>(x).data / static_cast<__half2_raw>(y).data}; } // Math functions #if defined(__clang__) && defined(__HIP__) inline __device__ float amd_mixed_dot(__half2 a, __half2 b, float c, bool saturate) { return __ockl_fdot2(static_cast<__half2_raw>(a).data, static_cast<__half2_raw>(b).data, c, saturate); } #endif inline __device__ __half htrunc(__half x) { return __half_raw{ __ocml_trunc_f16(static_cast<__half_raw>(x).data)}; } inline __device__ __half hceil(__half x) { return __half_raw{ __ocml_ceil_f16(static_cast<__half_raw>(x).data)}; } inline __device__ __half hfloor(__half x) { return __half_raw{ __ocml_floor_f16(static_cast<__half_raw>(x).data)}; } inline __device__ __half hrint(__half x) { return __half_raw{ __ocml_rint_f16(static_cast<__half_raw>(x).data)}; } inline __device__ __half hsin(__half x) { return __half_raw{ __ocml_sin_f16(static_cast<__half_raw>(x).data)}; } inline __device__ __half hcos(__half x) { return __half_raw{ __ocml_cos_f16(static_cast<__half_raw>(x).data)}; } inline __device__ __half hexp(__half x) { return __half_raw{ __ocml_exp_f16(static_cast<__half_raw>(x).data)}; } inline __device__ __half hexp2(__half x) { return __half_raw{ __ocml_exp2_f16(static_cast<__half_raw>(x).data)}; } inline __device__ __half hexp10(__half x) { return __half_raw{ __ocml_exp10_f16(static_cast<__half_raw>(x).data)}; } inline __device__ __half hlog2(__half x) { return __half_raw{ __ocml_log2_f16(static_cast<__half_raw>(x).data)}; } inline __device__ __half hlog(__half x) { return __half_raw{ __ocml_log_f16(static_cast<__half_raw>(x).data)}; } inline __device__ __half hlog10(__half x) { return __half_raw{ __ocml_log10_f16(static_cast<__half_raw>(x).data)}; } inline __device__ __half hrcp(__half x) { return __half_raw{ static_cast<_Float16>(1.0f) /static_cast<__half_raw>(x).data}; } inline __device__ __half hrsqrt(__half x) { return __half_raw{ __ocml_rsqrt_f16(static_cast<__half_raw>(x).data)}; } inline __device__ __half hsqrt(__half x) { return __half_raw{ __ocml_sqrt_f16(static_cast<__half_raw>(x).data)}; } inline __device__ bool __hisinf(__half x) { return __ocml_isinf_f16(static_cast<__half_raw>(x).data); } inline __device__ bool __hisnan(__half x) { return __ocml_isnan_f16(static_cast<__half_raw>(x).data); } inline __device__ __half __hneg(__half x) { return __half_raw{-static_cast<__half_raw>(x).data}; } inline __HOST_DEVICE__ __half2 h2trunc(__half2 x) { return __half2{__ocml_trunc_2f16(x)}; } inline __HOST_DEVICE__ __half2 h2ceil(__half2 x) { return __half2{__ocml_ceil_2f16(x)}; } inline __HOST_DEVICE__ __half2 h2floor(__half2 x) { return __half2{__ocml_floor_2f16(x)}; } inline __HOST_DEVICE__ __half2 h2rint(__half2 x) { return __half2{__ocml_rint_2f16(x)}; } inline __HOST_DEVICE__ __half2 h2sin(__half2 x) { return __half2{__ocml_sin_2f16(x)}; } inline __HOST_DEVICE__ __half2 h2cos(__half2 x) { return __half2{__ocml_cos_2f16(x)}; } inline __HOST_DEVICE__ __half2 h2exp(__half2 x) { return __half2{__ocml_exp_2f16(x)}; } inline __HOST_DEVICE__ __half2 h2exp2(__half2 x) { return __half2{__ocml_exp2_2f16(x)}; } inline __HOST_DEVICE__ __half2 h2exp10(__half2 x) { return __half2{__ocml_exp10_2f16(x)}; } inline __HOST_DEVICE__ __half2 h2log2(__half2 x) { return __half2{__ocml_log2_2f16(x)}; } inline __HOST_DEVICE__ __half2 h2log(__half2 x) { return __ocml_log_2f16(x); } inline __HOST_DEVICE__ __half2 h2log10(__half2 x) { return __ocml_log10_2f16(x); } inline __HOST_DEVICE__ __half2 h2rcp(__half2 x) { return _Float16_2{ _Float16_2{static_cast<_Float16>(1.0f), static_cast<_Float16>(1.0f)} / x.data}; } inline __HOST_DEVICE__ __half2 h2rsqrt(__half2 x) { return __ocml_rsqrt_2f16(x); } inline __HOST_DEVICE__ __half2 h2sqrt(__half2 x) { return __ocml_sqrt_2f16(x); } inline __HOST_DEVICE__ __half2 __hisinf2(__half2 x) { auto r = __ocml_isinf_2f16(x); return __half2{_Float16_2{ static_cast<_Float16>(r.x), static_cast<_Float16>(r.y)}}; } inline __HOST_DEVICE__ __half2 __hisnan2(__half2 x) { auto r = __ocml_isnan_2f16(x); return __half2{_Float16_2{ static_cast<_Float16>(r.x), static_cast<_Float16>(r.y)}}; } inline __HOST_DEVICE__ __half2 __hneg2(__half2 x) { return __half2{-static_cast<__half2_raw>(x).data}; } } // Anonymous namespace. #if !defined(HIP_NO_HALF) using half = __half; using half2 = __half2; #endif __device__ inline __half __shfl(__half var, int src_lane, int width = warpSize) { union { int i; __half h; } tmp; tmp.h = var; tmp.i = __shfl(tmp.i, src_lane, width); return tmp.h; } __device__ inline __half2 __shfl(__half2 var, int src_lane, int width = warpSize) { union { int i; __half2 h; } tmp; tmp.h = var; tmp.i = __shfl(tmp.i, src_lane, width); return tmp.h; } __device__ inline __half __shfl_up(__half var, unsigned int lane_delta, int width = warpSize) { union { int i; __half h; } tmp; tmp.h = var; tmp.i = __shfl_up(tmp.i, lane_delta, width); return tmp.h; } __device__ inline __half2 __shfl_up(__half2 var, unsigned int lane_delta, int width = warpSize) { union { int i; __half2 h; } tmp; tmp.h = var; tmp.i = __shfl_up(tmp.i, lane_delta, width); return tmp.h; } __device__ inline __half __shfl_down(__half var, unsigned int lane_delta, int width = warpSize) { union { int i; __half h; } tmp; tmp.h = var; tmp.i = __shfl_down(tmp.i, lane_delta, width); return tmp.h; } __device__ inline __half2 __shfl_down(__half2 var, unsigned int lane_delta, int width = warpSize) { union { int i; __half2 h; } tmp; tmp.h = var; tmp.i = __shfl_down(tmp.i, lane_delta, width); return tmp.h; } __device__ inline __half __shfl_xor(__half var, int lane_mask, int width = warpSize) { union { int i; __half h; } tmp; tmp.h = var; tmp.i = __shfl_xor(tmp.i, lane_mask, width); return tmp.h; } __device__ inline __half2 __shfl_xor(__half2 var, int lane_mask, int width = warpSize) { union { int i; __half2 h; } tmp; tmp.h = var; tmp.i = __shfl_xor(tmp.i, lane_mask, width); return tmp.h; } #endif // defined(__cplusplus) #elif defined(__GNUC__) #if !defined(__HIPCC_RTC__) #include "hip_fp16_gcc.h" #endif #endif // !defined(__clang__) && defined(__GNUC__) #endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_FP16_H