#version 450

// ---------------------------------------------------------------------------
// Extension selection.
// For each subgroup feature the shader uses, prefer the KHR subgroup extension
// and otherwise fall back to a vendor extension that the emulation code below
// is written against. Compilation stops with #error when nothing suitable is
// available. Each feature's chain is listed once; repeating an identical
// chain has no additional effect.
// ---------------------------------------------------------------------------

// Ballot support (subgroupBallot, subgroup masks, ballot bit queries).
#if defined(GL_KHR_shader_subgroup_ballot)
#extension GL_KHR_shader_subgroup_ballot : require
#elif defined(GL_NV_shader_thread_group)
#extension GL_NV_shader_thread_group : require
#elif defined(GL_ARB_shader_ballot) && defined(GL_ARB_shader_int64)
#extension GL_ARB_shader_int64 : enable
#extension GL_ARB_shader_ballot : require
#else
#error No extensions available to emulate requested subgroup feature.
#endif

// Basic support (gl_SubgroupSize, gl_SubgroupInvocationID, subgroupElect).
#if defined(GL_KHR_shader_subgroup_basic)
#extension GL_KHR_shader_subgroup_basic : require
#elif defined(GL_NV_shader_thread_group)
#extension GL_NV_shader_thread_group : require
#elif defined(GL_ARB_shader_ballot) && defined(GL_ARB_shader_int64)
#extension GL_ARB_shader_int64 : enable
#extension GL_ARB_shader_ballot : require
#elif defined(GL_AMD_gcn_shader) && (defined(GL_AMD_gpu_shader_int64) || defined(GL_NV_gpu_shader5))
#extension GL_AMD_gpu_shader_int64 : enable
#extension GL_NV_gpu_shader5 : enable
#extension GL_AMD_gcn_shader : require
#else
#error No extensions available to emulate requested subgroup feature.
#endif

// Optional: enabled whenever present, no error otherwise.
#if defined(GL_NV_shader_thread_group)
#extension GL_NV_shader_thread_group : require
#endif

// Arithmetic support (subgroupAdd / subgroupExclusiveAdd / subgroupInclusiveAdd).
#if defined(GL_KHR_shader_subgroup_arithmetic)
#extension GL_KHR_shader_subgroup_arithmetic : require
#elif defined(GL_NV_shader_thread_shuffle)
#extension GL_NV_shader_thread_shuffle : require
#else
#error No extensions available to emulate requested subgroup feature.
#endif

layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in;

// Output buffer: one slot per tested type. main() writes each slot several times.
layout(binding = 1, std430) buffer DATA_OUT
{
    float data_out_float;
    vec2 data_out_vec2;
    vec3 data_out_vec3;
    vec4 data_out_vec4;
    double data_out_double;
    dvec2 data_out_dvec2;
    dvec3 data_out_dvec3;
    dvec4 data_out_dvec4;
} _16;

// Input buffer: 128 elements per type, indexed by gl_LocalInvocationID.x in main().
layout(binding = 0, std430) buffer DATA_IN
{
    float data_in_float[128];
    vec2 data_in_vec2[128];
    vec3 data_in_vec3[128];
    vec4 data_in_vec4[128];
    double data_in_double[128];
    dvec2 data_in_dvec2[128];
    dvec3 data_in_dvec3[128];
    dvec4 data_in_dvec4[128];
} _31;

// ---------------------------------------------------------------------------
// Emulation of KHR subgroup built-ins on top of vendor extensions.
// Every section is empty when the corresponding KHR extension is available.
// ---------------------------------------------------------------------------

// Subgroup relational masks, widened to the uvec4 layout used by the KHR names.
#if defined(GL_KHR_shader_subgroup_ballot)
#elif defined(GL_NV_shader_thread_group)
#define gl_SubgroupEqMask uvec4(gl_ThreadEqMaskNV, 0u, 0u, 0u)
#define gl_SubgroupGeMask uvec4(gl_ThreadGeMaskNV, 0u, 0u, 0u)
#define gl_SubgroupGtMask uvec4(gl_ThreadGtMaskNV, 0u, 0u, 0u)
#define gl_SubgroupLeMask uvec4(gl_ThreadLeMaskNV, 0u, 0u, 0u)
#define gl_SubgroupLtMask uvec4(gl_ThreadLtMaskNV, 0u, 0u, 0u)
#elif defined(GL_ARB_shader_ballot)
#define gl_SubgroupEqMask uvec4(unpackUint2x32(gl_SubGroupEqMaskARB), 0u, 0u)
#define gl_SubgroupGeMask uvec4(unpackUint2x32(gl_SubGroupGeMaskARB), 0u, 0u)
#define gl_SubgroupGtMask uvec4(unpackUint2x32(gl_SubGroupGtMaskARB), 0u, 0u)
#define gl_SubgroupLeMask uvec4(unpackUint2x32(gl_SubGroupLeMaskARB), 0u, 0u)
#define gl_SubgroupLtMask uvec4(unpackUint2x32(gl_SubGroupLtMaskARB), 0u, 0u)
#endif

// Subgroup size.
#if defined(GL_KHR_shader_subgroup_basic)
#elif defined(GL_NV_shader_thread_group)
#define gl_SubgroupSize gl_WarpSizeNV
#elif defined(GL_ARB_shader_ballot)
#define gl_SubgroupSize gl_SubGroupSizeARB
#elif defined(GL_AMD_gcn_shader)
#define gl_SubgroupSize uint(gl_SIMDGroupSizeAMD)
#endif

// Invocation index within the subgroup. The emulated subgroupElect() below
// reads gl_SubgroupInvocationID, so it must exist on the fallback paths too.
#if defined(GL_KHR_shader_subgroup_basic)
#elif defined(GL_NV_shader_thread_group)
#define gl_SubgroupInvocationID gl_ThreadInWarpNV
#elif defined(GL_ARB_shader_ballot)
#define gl_SubgroupInvocationID gl_SubGroupInvocationARB
#endif

// subgroupBallot: returns the vendor ballot result in the low components of a uvec4.
#if defined(GL_KHR_shader_subgroup_ballot)
#elif defined(GL_NV_shader_thread_group)
uvec4 subgroupBallot(bool v)
{
    return uvec4(ballotThreadNV(v), 0u, 0u, 0u);
}
#elif defined(GL_ARB_shader_ballot)
uvec4 subgroupBallot(bool v)
{
    return uvec4(unpackUint2x32(ballotARB(v)), 0u, 0u);
}
#endif

// subgroupBallotFindLSB: index of the lowest set bit in the low 64 bits of a
// ballot. The emulated subgroupElect() below calls it, so it must be defined
// whenever the KHR ballot extension is not providing it. Only .x and .y are
// inspected because the emulated ballots never populate .z or .w.
#ifndef GL_KHR_shader_subgroup_ballot
uint subgroupBallotFindLSB(uvec4 value)
{
    int firstLive = findLSB(value.x);
    return uint(firstLive != -1 ? firstLive : (findLSB(value.y) + 32));
}
#endif

// subgroupElect: true only for the active invocation with the lowest index.
#ifndef GL_KHR_shader_subgroup_basic
bool subgroupElect()
{
    uvec4 activeMask = subgroupBallot(true);
    uint firstLive = subgroupBallotFindLSB(activeMask);
    return gl_SubgroupInvocationID == firstLive;
}
#endif

// subgroupBallotBitCount: number of set bits in a ballot. The NV path counts
// only .x; the other path counts .x and .y.
#ifndef GL_KHR_shader_subgroup_ballot
uint subgroupBallotBitCount(uvec4 value)
{
    ivec2 c = bitCount(value.xy);
#ifdef GL_NV_shader_thread_group
    return uint(c.x);
#else
    return uint(c.x + c.y);
#endif
}
#endif

// subgroupBallotBitExtract: tests bit `index` of a ballot.
#ifndef GL_KHR_shader_subgroup_ballot
bool subgroupBallotBitExtract(uvec4 value, uint index)
{
#ifdef GL_NV_shader_thread_group
    uint shifted = value.x >> index;
#else
    // index >> 5 selects the 32-bit word, index & 31 the bit within it.
    uint shifted = value[index >> 5u] >> (index & 0x1fu);
#endif
    return (shifted & 1u) != 0u;
}
#endif

// ---------------------------------------------------------------------------
// subgroupAdd emulation (sum over the subgroup), one overload per type.
//
// When the ballot shows as many active invocations as gl_SubgroupSize, the sum
// is built with XOR shuffles at offsets 1, 2, 4, ... up to gl_SubgroupSize / 2.
// Otherwise every lane index is visited and only the values of lanes marked
// active in the ballot are accumulated.
// ---------------------------------------------------------------------------
#if defined(GL_KHR_shader_subgroup_arithmetic)
#elif defined(GL_NV_shader_thread_shuffle)
float subgroupAdd(float v)
{
    float reduction = 0.0f;
    uvec4 active_threads = subgroupBallot(true);
    if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize)
    {
        uint total = gl_SubgroupSize / 2u;
        reduction = v;
        for (uint i = 1u; i <= total; i <<= 1u)
        {
            bool valid;
            float s = shuffleXorNV(reduction, i, gl_SubgroupSize, valid);
            reduction += valid ? s : 0.0f;
        }
    }
    else
    {
        for (uint i = 0u; i < gl_SubgroupSize; ++i)
        {
            bool valid = subgroupBallotBitExtract(active_threads, i);
            float s = shuffleNV(v, i, gl_SubgroupSize);
            reduction += valid ? s : 0.0f;
        }
    }
    return reduction;
}

vec2 subgroupAdd(vec2 v)
{
    vec2 reduction = vec2(0.0f);
    uvec4 active_threads = subgroupBallot(true);
    if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize)
    {
        uint total = gl_SubgroupSize / 2u;
        reduction = v;
        for (uint i = 1u; i <= total; i <<= 1u)
        {
            bool valid;
            vec2 s = shuffleXorNV(reduction, i, gl_SubgroupSize, valid);
            reduction += valid ? s : vec2(0.0f);
        }
    }
    else
    {
        for (uint i = 0u; i < gl_SubgroupSize; ++i)
        {
            bool valid = subgroupBallotBitExtract(active_threads, i);
            vec2 s = shuffleNV(v, i, gl_SubgroupSize);
            reduction += valid ? s : vec2(0.0f);
        }
    }
    return reduction;
}

vec3 subgroupAdd(vec3 v)
{
    vec3 reduction = vec3(0.0f);
    uvec4 active_threads = subgroupBallot(true);
    if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize)
    {
        uint total = gl_SubgroupSize / 2u;
        reduction = v;
        for (uint i = 1u; i <= total; i <<= 1u)
        {
            bool valid;
            vec3 s = shuffleXorNV(reduction, i, gl_SubgroupSize, valid);
            reduction += valid ? s : vec3(0.0f);
        }
    }
    else
    {
        for (uint i = 0u; i < gl_SubgroupSize; ++i)
        {
            bool valid = subgroupBallotBitExtract(active_threads, i);
            vec3 s = shuffleNV(v, i, gl_SubgroupSize);
            reduction += valid ? s : vec3(0.0f);
        }
    }
    return reduction;
}

vec4 subgroupAdd(vec4 v)
{
    vec4 reduction = vec4(0.0f);
    uvec4 active_threads = subgroupBallot(true);
    if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize)
    {
        uint total = gl_SubgroupSize / 2u;
        reduction = v;
        for (uint i = 1u; i <= total; i <<= 1u)
        {
            bool valid;
            vec4 s = shuffleXorNV(reduction, i, gl_SubgroupSize, valid);
            reduction += valid ? s : vec4(0.0f);
        }
    }
    else
    {
        for (uint i = 0u; i < gl_SubgroupSize; ++i)
        {
            bool valid = subgroupBallotBitExtract(active_threads, i);
            vec4 s = shuffleNV(v, i, gl_SubgroupSize);
            reduction += valid ? s : vec4(0.0f);
        }
    }
    return reduction;
}

double subgroupAdd(double v)
{
    double reduction = 0.0LF;
    uvec4 active_threads = subgroupBallot(true);
    if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize)
    {
        uint total = gl_SubgroupSize / 2u;
        reduction = v;
        for (uint i = 1u; i <= total; i <<= 1u)
        {
            bool valid;
            double s = shuffleXorNV(reduction, i, gl_SubgroupSize, valid);
            reduction += valid ? s : 0.0LF;
        }
    }
    else
    {
        for (uint i = 0u; i < gl_SubgroupSize; ++i)
        {
            bool valid = subgroupBallotBitExtract(active_threads, i);
            double s = shuffleNV(v, i, gl_SubgroupSize);
            reduction += valid ? s : 0.0LF;
        }
    }
    return reduction;
}

dvec2 subgroupAdd(dvec2 v)
{
    dvec2 reduction = dvec2(0.0LF);
    uvec4 active_threads = subgroupBallot(true);
    if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize)
    {
        uint total = gl_SubgroupSize / 2u;
        reduction = v;
        for (uint i = 1u; i <= total; i <<= 1u)
        {
            bool valid;
            dvec2 s = shuffleXorNV(reduction, i, gl_SubgroupSize, valid);
            reduction += valid ? s : dvec2(0.0LF);
        }
    }
    else
    {
        for (uint i = 0u; i < gl_SubgroupSize; ++i)
        {
            bool valid = subgroupBallotBitExtract(active_threads, i);
            dvec2 s = shuffleNV(v, i, gl_SubgroupSize);
            reduction += valid ? s : dvec2(0.0LF);
        }
    }
    return reduction;
}

dvec3 subgroupAdd(dvec3 v)
{
    dvec3 reduction = dvec3(0.0LF);
    uvec4 active_threads = subgroupBallot(true);
    if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize)
    {
        uint total = gl_SubgroupSize / 2u;
        reduction = v;
        for (uint i = 1u; i <= total; i <<= 1u)
        {
            bool valid;
            dvec3 s = shuffleXorNV(reduction, i, gl_SubgroupSize, valid);
            reduction += valid ? s : dvec3(0.0LF);
        }
    }
    else
    {
        for (uint i = 0u; i < gl_SubgroupSize; ++i)
        {
            bool valid = subgroupBallotBitExtract(active_threads, i);
            dvec3 s = shuffleNV(v, i, gl_SubgroupSize);
            reduction += valid ? s : dvec3(0.0LF);
        }
    }
    return reduction;
}

dvec4 subgroupAdd(dvec4 v)
{
    dvec4 reduction = dvec4(0.0LF);
    uvec4 active_threads = subgroupBallot(true);
    if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize)
    {
        uint total = gl_SubgroupSize / 2u;
        reduction = v;
        for (uint i = 1u; i <= total; i <<= 1u)
        {
            bool valid;
            dvec4 s = shuffleXorNV(reduction, i, gl_SubgroupSize, valid);
            reduction += valid ? s : dvec4(0.0LF);
        }
    }
    else
    {
        for (uint i = 0u; i < gl_SubgroupSize; ++i)
        {
            bool valid = subgroupBallotBitExtract(active_threads, i);
            dvec4 s = shuffleNV(v, i, gl_SubgroupSize);
            reduction += valid ? s : dvec4(0.0LF);
        }
    }
    return reduction;
}
#endif

// ---------------------------------------------------------------------------
// subgroupExclusiveAdd emulation (prefix sum excluding the caller's own value).
//
// Full-subgroup path: accumulate with shuffleUpNV at offsets 1, 2, 4, ..., then
// shift the result up by one lane and reset the elected invocation to zero.
// Partial path: visit every lane index i and add the value of lane i when it
// is active and i is below the bit count of gl_SubgroupLtMask.
// ---------------------------------------------------------------------------
#if defined(GL_KHR_shader_subgroup_arithmetic)
#elif defined(GL_NV_shader_thread_shuffle)
float subgroupExclusiveAdd(float v)
{
    float excl_scan = 0.0f;
    uvec4 active_threads = subgroupBallot(true);
    if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize)
    {
        uint total = gl_SubgroupSize / 2u;
        excl_scan = v;
        for (uint i = 1u; i <= total; i <<= 1u)
        {
            bool valid;
            float s = shuffleUpNV(excl_scan, i, gl_SubgroupSize, valid);
            excl_scan += valid ? s : 0.0f;
        }
        excl_scan = shuffleUpNV(excl_scan, 1u, gl_SubgroupSize);
        if (subgroupElect())
        {
            excl_scan = 0.0f;
        }
    }
    else
    {
        uint total = subgroupBallotBitCount(gl_SubgroupLtMask);
        for (uint i = 0u; i < gl_SubgroupSize; ++i)
        {
            bool valid = subgroupBallotBitExtract(active_threads, i);
            float s = shuffleNV(v, i, gl_SubgroupSize);
            valid = valid && (i < total);
            excl_scan += valid ? s : 0.0f;
        }
    }
    return excl_scan;
}

vec2 subgroupExclusiveAdd(vec2 v)
{
    vec2 excl_scan = vec2(0.0f);
    uvec4 active_threads = subgroupBallot(true);
    if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize)
    {
        uint total = gl_SubgroupSize / 2u;
        excl_scan = v;
        for (uint i = 1u; i <= total; i <<= 1u)
        {
            bool valid;
            vec2 s = shuffleUpNV(excl_scan, i, gl_SubgroupSize, valid);
            excl_scan += valid ? s : vec2(0.0f);
        }
        excl_scan = shuffleUpNV(excl_scan, 1u, gl_SubgroupSize);
        if (subgroupElect())
        {
            excl_scan = vec2(0.0f);
        }
    }
    else
    {
        uint total = subgroupBallotBitCount(gl_SubgroupLtMask);
        for (uint i = 0u; i < gl_SubgroupSize; ++i)
        {
            bool valid = subgroupBallotBitExtract(active_threads, i);
            vec2 s = shuffleNV(v, i, gl_SubgroupSize);
            valid = valid && (i < total);
            excl_scan += valid ? s : vec2(0.0f);
        }
    }
    return excl_scan;
}

vec3 subgroupExclusiveAdd(vec3 v)
{
    vec3 excl_scan = vec3(0.0f);
    uvec4 active_threads = subgroupBallot(true);
    if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize)
    {
        uint total = gl_SubgroupSize / 2u;
        excl_scan = v;
        for (uint i = 1u; i <= total; i <<= 1u)
        {
            bool valid;
            vec3 s = shuffleUpNV(excl_scan, i, gl_SubgroupSize, valid);
            excl_scan += valid ? s : vec3(0.0f);
        }
        excl_scan = shuffleUpNV(excl_scan, 1u, gl_SubgroupSize);
        if (subgroupElect())
        {
            excl_scan = vec3(0.0f);
        }
    }
    else
    {
        uint total = subgroupBallotBitCount(gl_SubgroupLtMask);
        for (uint i = 0u; i < gl_SubgroupSize; ++i)
        {
            bool valid = subgroupBallotBitExtract(active_threads, i);
            vec3 s = shuffleNV(v, i, gl_SubgroupSize);
            valid = valid && (i < total);
            excl_scan += valid ? s : vec3(0.0f);
        }
    }
    return excl_scan;
}

vec4 subgroupExclusiveAdd(vec4 v)
{
    vec4 excl_scan = vec4(0.0f);
    uvec4 active_threads = subgroupBallot(true);
    if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize)
    {
        uint total = gl_SubgroupSize / 2u;
        excl_scan = v;
        for (uint i = 1u; i <= total; i <<= 1u)
        {
            bool valid;
            vec4 s = shuffleUpNV(excl_scan, i, gl_SubgroupSize, valid);
            excl_scan += valid ? s : vec4(0.0f);
        }
        excl_scan = shuffleUpNV(excl_scan, 1u, gl_SubgroupSize);
        if (subgroupElect())
        {
            excl_scan = vec4(0.0f);
        }
    }
    else
    {
        uint total = subgroupBallotBitCount(gl_SubgroupLtMask);
        for (uint i = 0u; i < gl_SubgroupSize; ++i)
        {
            bool valid = subgroupBallotBitExtract(active_threads, i);
            vec4 s = shuffleNV(v, i, gl_SubgroupSize);
            valid = valid && (i < total);
            excl_scan += valid ? s : vec4(0.0f);
        }
    }
    return excl_scan;
}

double subgroupExclusiveAdd(double v)
{
    double excl_scan = 0.0LF;
    uvec4 active_threads = subgroupBallot(true);
    if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize)
    {
        uint total = gl_SubgroupSize / 2u;
        excl_scan = v;
        for (uint i = 1u; i <= total; i <<= 1u)
        {
            bool valid;
            double s = shuffleUpNV(excl_scan, i, gl_SubgroupSize, valid);
            excl_scan += valid ? s : 0.0LF;
        }
        excl_scan = shuffleUpNV(excl_scan, 1u, gl_SubgroupSize);
        if (subgroupElect())
        {
            excl_scan = 0.0LF;
        }
    }
    else
    {
        uint total = subgroupBallotBitCount(gl_SubgroupLtMask);
        for (uint i = 0u; i < gl_SubgroupSize; ++i)
        {
            bool valid = subgroupBallotBitExtract(active_threads, i);
            double s = shuffleNV(v, i, gl_SubgroupSize);
            valid = valid && (i < total);
            excl_scan += valid ? s : 0.0LF;
        }
    }
    return excl_scan;
}

dvec2 subgroupExclusiveAdd(dvec2 v)
{
    dvec2 excl_scan = dvec2(0.0LF);
    uvec4 active_threads = subgroupBallot(true);
    if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize)
    {
        uint total = gl_SubgroupSize / 2u;
        excl_scan = v;
        for (uint i = 1u; i <= total; i <<= 1u)
        {
            bool valid;
            dvec2 s = shuffleUpNV(excl_scan, i, gl_SubgroupSize, valid);
            excl_scan += valid ? s : dvec2(0.0LF);
        }
        excl_scan = shuffleUpNV(excl_scan, 1u, gl_SubgroupSize);
        if (subgroupElect())
        {
            excl_scan = dvec2(0.0LF);
        }
    }
    else
    {
        uint total = subgroupBallotBitCount(gl_SubgroupLtMask);
        for (uint i = 0u; i < gl_SubgroupSize; ++i)
        {
            bool valid = subgroupBallotBitExtract(active_threads, i);
            dvec2 s = shuffleNV(v, i, gl_SubgroupSize);
            valid = valid && (i < total);
            excl_scan += valid ? s : dvec2(0.0LF);
        }
    }
    return excl_scan;
}

dvec3 subgroupExclusiveAdd(dvec3 v)
{
    dvec3 excl_scan = dvec3(0.0LF);
    uvec4 active_threads = subgroupBallot(true);
    if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize)
    {
        uint total = gl_SubgroupSize / 2u;
        excl_scan = v;
        for (uint i = 1u; i <= total; i <<= 1u)
        {
            bool valid;
            dvec3 s = shuffleUpNV(excl_scan, i, gl_SubgroupSize, valid);
            excl_scan += valid ? s : dvec3(0.0LF);
        }
        excl_scan = shuffleUpNV(excl_scan, 1u, gl_SubgroupSize);
        if (subgroupElect())
        {
            excl_scan = dvec3(0.0LF);
        }
    }
    else
    {
        uint total = subgroupBallotBitCount(gl_SubgroupLtMask);
        for (uint i = 0u; i < gl_SubgroupSize; ++i)
        {
            bool valid = subgroupBallotBitExtract(active_threads, i);
            dvec3 s = shuffleNV(v, i, gl_SubgroupSize);
            valid = valid && (i < total);
            excl_scan += valid ? s : dvec3(0.0LF);
        }
    }
    return excl_scan;
}

dvec4 subgroupExclusiveAdd(dvec4 v)
{
    dvec4 excl_scan = dvec4(0.0LF);
    uvec4 active_threads = subgroupBallot(true);
    if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize)
    {
        uint total = gl_SubgroupSize / 2u;
        excl_scan = v;
        for (uint i = 1u; i <= total; i <<= 1u)
        {
            bool valid;
            dvec4 s = shuffleUpNV(excl_scan, i, gl_SubgroupSize, valid);
            excl_scan += valid ? s : dvec4(0.0LF);
        }
        excl_scan = shuffleUpNV(excl_scan, 1u, gl_SubgroupSize);
        if (subgroupElect())
        {
            excl_scan = dvec4(0.0LF);
        }
    }
    else
    {
        uint total = subgroupBallotBitCount(gl_SubgroupLtMask);
        for (uint i = 0u; i < gl_SubgroupSize; ++i)
        {
            bool valid = subgroupBallotBitExtract(active_threads, i);
            dvec4 s = shuffleNV(v, i, gl_SubgroupSize);
            valid = valid && (i < total);
            excl_scan += valid ? s : dvec4(0.0LF);
        }
    }
    return excl_scan;
}
#endif

// ---------------------------------------------------------------------------
// subgroupInclusiveAdd emulation (prefix sum including the caller's own value).
//
// Full-subgroup path: accumulate with shuffleUpNV at offsets 1, 2, 4, ....
// Partial path: visit every lane index i and add the value of lane i when it
// is active and i is below the bit count of gl_SubgroupLeMask.
// ---------------------------------------------------------------------------
#if defined(GL_KHR_shader_subgroup_arithmetic)
#elif defined(GL_NV_shader_thread_shuffle)
float subgroupInclusiveAdd(float v)
{
    float incl_scan = 0.0f;
    uvec4 active_threads = subgroupBallot(true);
    if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize)
    {
        uint total = gl_SubgroupSize / 2u;
        incl_scan = v;
        for (uint i = 1u; i <= total; i <<= 1u)
        {
            bool valid;
            float s = shuffleUpNV(incl_scan, i, gl_SubgroupSize, valid);
            incl_scan += valid ? s : 0.0f;
        }
    }
    else
    {
        uint total = subgroupBallotBitCount(gl_SubgroupLeMask);
        for (uint i = 0u; i < gl_SubgroupSize; ++i)
        {
            bool valid = subgroupBallotBitExtract(active_threads, i);
            float s = shuffleNV(v, i, gl_SubgroupSize);
            valid = valid && (i < total);
            incl_scan += valid ? s : 0.0f;
        }
    }
    return incl_scan;
}

vec2 subgroupInclusiveAdd(vec2 v)
{
    vec2 incl_scan = vec2(0.0f);
    uvec4 active_threads = subgroupBallot(true);
    if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize)
    {
        uint total = gl_SubgroupSize / 2u;
        incl_scan = v;
        for (uint i = 1u; i <= total; i <<= 1u)
        {
            bool valid;
            vec2 s = shuffleUpNV(incl_scan, i, gl_SubgroupSize, valid);
            incl_scan += valid ? s : vec2(0.0f);
        }
    }
    else
    {
        uint total = subgroupBallotBitCount(gl_SubgroupLeMask);
        for (uint i = 0u; i < gl_SubgroupSize; ++i)
        {
            bool valid = subgroupBallotBitExtract(active_threads, i);
            vec2 s = shuffleNV(v, i, gl_SubgroupSize);
            valid = valid && (i < total);
            incl_scan += valid ? s : vec2(0.0f);
        }
    }
    return incl_scan;
}

vec3 subgroupInclusiveAdd(vec3 v)
{
    vec3 incl_scan = vec3(0.0f);
    uvec4 active_threads = subgroupBallot(true);
    if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize)
    {
        uint total = gl_SubgroupSize / 2u;
        incl_scan = v;
        for (uint i = 1u; i <= total; i <<= 1u)
        {
            bool valid;
            vec3 s = shuffleUpNV(incl_scan, i, gl_SubgroupSize, valid);
            incl_scan += valid ? s : vec3(0.0f);
        }
    }
    else
    {
        uint total = subgroupBallotBitCount(gl_SubgroupLeMask);
        for (uint i = 0u; i < gl_SubgroupSize; ++i)
        {
            bool valid = subgroupBallotBitExtract(active_threads, i);
            vec3 s = shuffleNV(v, i, gl_SubgroupSize);
            valid = valid && (i < total);
            incl_scan += valid ? s : vec3(0.0f);
        }
    }
    return incl_scan;
}

vec4 subgroupInclusiveAdd(vec4 v)
{
    vec4 incl_scan = vec4(0.0f);
    uvec4 active_threads = subgroupBallot(true);
    if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize)
    {
        uint total = gl_SubgroupSize / 2u;
        incl_scan = v;
        for (uint i = 1u; i <= total; i <<= 1u)
        {
            bool valid;
            vec4 s = shuffleUpNV(incl_scan, i, gl_SubgroupSize, valid);
            incl_scan += valid ? s : vec4(0.0f);
        }
    }
    else
    {
        uint total = subgroupBallotBitCount(gl_SubgroupLeMask);
        for (uint i = 0u; i < gl_SubgroupSize; ++i)
        {
            bool valid = subgroupBallotBitExtract(active_threads, i);
            vec4 s = shuffleNV(v, i, gl_SubgroupSize);
            valid = valid && (i < total);
            incl_scan += valid ? s : vec4(0.0f);
        }
    }
    return incl_scan;
}

double subgroupInclusiveAdd(double v)
{
    double incl_scan = 0.0LF;
    uvec4 active_threads = subgroupBallot(true);
    if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize)
    {
        uint total = gl_SubgroupSize / 2u;
        incl_scan = v;
        for (uint i = 1u; i <= total; i <<= 1u)
        {
            bool valid;
            double s = shuffleUpNV(incl_scan, i, gl_SubgroupSize, valid);
            incl_scan += valid ? s : 0.0LF;
        }
    }
    else
    {
        uint total = subgroupBallotBitCount(gl_SubgroupLeMask);
        for (uint i = 0u; i < gl_SubgroupSize; ++i)
        {
            bool valid = subgroupBallotBitExtract(active_threads, i);
            double s = shuffleNV(v, i, gl_SubgroupSize);
            valid = valid && (i < total);
            incl_scan += valid ? s : 0.0LF;
        }
    }
    return incl_scan;
}

dvec2 subgroupInclusiveAdd(dvec2 v)
{
    dvec2 incl_scan = dvec2(0.0LF);
    uvec4 active_threads = subgroupBallot(true);
    if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize)
    {
        uint total = gl_SubgroupSize / 2u;
        incl_scan = v;
        for (uint i = 1u; i <= total; i <<= 1u)
        {
            bool valid;
            dvec2 s = shuffleUpNV(incl_scan, i, gl_SubgroupSize, valid);
            incl_scan += valid ? s : dvec2(0.0LF);
        }
    }
    else
    {
        uint total = subgroupBallotBitCount(gl_SubgroupLeMask);
        for (uint i = 0u; i < gl_SubgroupSize; ++i)
        {
            bool valid = subgroupBallotBitExtract(active_threads, i);
            dvec2 s = shuffleNV(v, i, gl_SubgroupSize);
            valid = valid && (i < total);
            incl_scan += valid ? s : dvec2(0.0LF);
        }
    }
    return incl_scan;
}

dvec3 subgroupInclusiveAdd(dvec3 v)
{
    dvec3 incl_scan = dvec3(0.0LF);
    uvec4 active_threads = subgroupBallot(true);
    if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize)
    {
        uint total = gl_SubgroupSize / 2u;
        incl_scan = v;
        for (uint i = 1u; i <= total; i <<= 1u)
        {
            bool valid;
            dvec3 s = shuffleUpNV(incl_scan, i, gl_SubgroupSize, valid);
            incl_scan += valid ? s : dvec3(0.0LF);
        }
    }
    else
    {
        uint total = subgroupBallotBitCount(gl_SubgroupLeMask);
        for (uint i = 0u; i < gl_SubgroupSize; ++i)
        {
            bool valid = subgroupBallotBitExtract(active_threads, i);
            dvec3 s = shuffleNV(v, i, gl_SubgroupSize);
            valid = valid && (i < total);
            incl_scan += valid ? s : dvec3(0.0LF);
        }
    }
    return incl_scan;
}

dvec4 subgroupInclusiveAdd(dvec4 v)
{
    dvec4 incl_scan = dvec4(0.0LF);
    uvec4 active_threads = subgroupBallot(true);
    if (subgroupBallotBitCount(active_threads) == gl_SubgroupSize)
    {
        uint total = gl_SubgroupSize / 2u;
        incl_scan = v;
        for (uint i = 1u; i <= total; i <<= 1u)
        {
            bool valid;
            dvec4 s = shuffleUpNV(incl_scan, i, gl_SubgroupSize, valid);
            incl_scan += valid ? s : dvec4(0.0LF);
        }
    }
    else
    {
        uint total = subgroupBallotBitCount(gl_SubgroupLeMask);
        for (uint i = 0u; i < gl_SubgroupSize; ++i)
        {
            bool valid = subgroupBallotBitExtract(active_threads, i);
            dvec4 s = shuffleNV(v, i, gl_SubgroupSize);
            valid = valid && (i < total);
            incl_scan += valid ? s : dvec4(0.0LF);
        }
    }
    return incl_scan;
}
#endif

// Entry point: runs the reduction, the exclusive scan and the inclusive scan
// for every tested type, each time reading the input element selected by
// gl_LocalInvocationID.x. Later writes to the same output member replace
// earlier ones.
void main()
{
    // Reduction.
    _16.data_out_float = subgroupAdd(_31.data_in_float[gl_LocalInvocationID.x]);
    _16.data_out_vec2 = subgroupAdd(_31.data_in_vec2[gl_LocalInvocationID.x]);
    _16.data_out_vec3 = subgroupAdd(_31.data_in_vec3[gl_LocalInvocationID.x]);
    _16.data_out_vec4 = subgroupAdd(_31.data_in_vec4[gl_LocalInvocationID.x]);
    _16.data_out_double = subgroupAdd(_31.data_in_double[gl_LocalInvocationID.x]);
    _16.data_out_dvec2 = subgroupAdd(_31.data_in_dvec2[gl_LocalInvocationID.x]);
    _16.data_out_dvec3 = subgroupAdd(_31.data_in_dvec3[gl_LocalInvocationID.x]);
    _16.data_out_dvec4 = subgroupAdd(_31.data_in_dvec4[gl_LocalInvocationID.x]);

    // Exclusive scan.
    _16.data_out_float = subgroupExclusiveAdd(_31.data_in_float[gl_LocalInvocationID.x]);
    _16.data_out_vec2 = subgroupExclusiveAdd(_31.data_in_vec2[gl_LocalInvocationID.x]);
    _16.data_out_vec3 = subgroupExclusiveAdd(_31.data_in_vec3[gl_LocalInvocationID.x]);
    _16.data_out_vec4 = subgroupExclusiveAdd(_31.data_in_vec4[gl_LocalInvocationID.x]);
    _16.data_out_double = subgroupExclusiveAdd(_31.data_in_double[gl_LocalInvocationID.x]);
    _16.data_out_dvec2 = subgroupExclusiveAdd(_31.data_in_dvec2[gl_LocalInvocationID.x]);
    _16.data_out_dvec3 = subgroupExclusiveAdd(_31.data_in_dvec3[gl_LocalInvocationID.x]);
    _16.data_out_dvec4 = subgroupExclusiveAdd(_31.data_in_dvec4[gl_LocalInvocationID.x]);

    // Inclusive scan.
    _16.data_out_float = subgroupInclusiveAdd(_31.data_in_float[gl_LocalInvocationID.x]);
    _16.data_out_vec2 = subgroupInclusiveAdd(_31.data_in_vec2[gl_LocalInvocationID.x]);
    _16.data_out_vec3 = subgroupInclusiveAdd(_31.data_in_vec3[gl_LocalInvocationID.x]);
    _16.data_out_vec4 = subgroupInclusiveAdd(_31.data_in_vec4[gl_LocalInvocationID.x]);
    _16.data_out_double = subgroupInclusiveAdd(_31.data_in_double[gl_LocalInvocationID.x]);
    _16.data_out_dvec2 = subgroupInclusiveAdd(_31.data_in_dvec2[gl_LocalInvocationID.x]);
    _16.data_out_dvec3 = subgroupInclusiveAdd(_31.data_in_dvec3[gl_LocalInvocationID.x]);
    _16.data_out_dvec4 = subgroupInclusiveAdd(_31.data_in_dvec4[gl_LocalInvocationID.x]);
}