// Compute shader that calls the wave (subgroup) and quad intrinsics one by one.
// main() first derives five per-lane 128-bit masks from the lane index, then
// runs comp_main(), which reads those masks and invokes the intrinsics.

static const uint3 gl_WorkGroupSize = uint3(1u, 1u, 1u);

// Output buffer; every store below targets byte offset 0.
RWByteAddressBuffer _9 : register(u0, space0);

// Lane masks, one bit per lane, packed as four 32-bit words (.x covers lanes
// 0..31, .y lanes 32..63, .z lanes 64..95, .w lanes 96..127). Filled in by main().
static uint4 gl_SubgroupEqMask;
static uint4 gl_SubgroupGeMask;
static uint4 gl_SubgroupGtMask;
static uint4 gl_SubgroupLeMask;
static uint4 gl_SubgroupLtMask;

// Body of the shader: writes a handful of values to _9 and evaluates each
// wave / quad intrinsic once. Most results are kept in locals that are never
// read again.
void comp_main()
{
    // Lane count and lane index, converted to float and stored as raw bits.
    _9.Store(0, asuint(float(WaveGetLaneCount())));
    _9.Store(0, asuint(float(WaveGetLaneIndex())));
    bool is_first_lane = WaveIsFirstLane();

    // First word of each precomputed mask, converted to float and stored as raw bits.
    _9.Store(0, asuint(float4(gl_SubgroupEqMask).x));
    _9.Store(0, asuint(float4(gl_SubgroupGeMask).x));
    _9.Store(0, asuint(float4(gl_SubgroupGtMask).x));
    _9.Store(0, asuint(float4(gl_SubgroupLeMask).x));
    _9.Store(0, asuint(float4(gl_SubgroupLtMask).x));

    // Broadcasts.
    float4 from_lane_8 = WaveReadLaneAt(10.0f.xxxx, 8u);
    float3 from_first = WaveReadLaneFirst(20.0f.xxx);

    // Ballot and bit counts over the whole ballot, the lanes <= this one,
    // and the lanes < this one.
    uint4 ballot = WaveActiveBallot(true);

    const uint4 all_bits = countbits(ballot);
    uint total_bits = all_bits.x + all_bits.y + all_bits.z + all_bits.w;

    const uint4 le_bits = countbits(ballot & gl_SubgroupLeMask);
    uint inclusive_bits = le_bits.x + le_bits.y + le_bits.z + le_bits.w;

    const uint4 lt_bits = countbits(ballot & gl_SubgroupLtMask);
    uint exclusive_bits = lt_bits.x + lt_bits.y + lt_bits.z + lt_bits.w;

    // Shuffles expressed through WaveReadLaneAt with a computed source lane.
    const uint lane = WaveGetLaneIndex();
    uint shuffle_fixed = WaveReadLaneAt(10u, 8u);
    uint shuffle_xor = WaveReadLaneAt(30u, lane ^ 8u);
    uint shuffle_up = WaveReadLaneAt(20u, lane - 4u);
    uint shuffle_down = WaveReadLaneAt(20u, lane + 4u);

    // Votes.
    bool vote_all = WaveActiveAllTrue(true);
    bool vote_any = WaveActiveAnyTrue(true);
    bool vote_equal = WaveActiveAllEqual(true);

    // Arithmetic reductions over constant vectors.
    const float4 f20 = 20.0f.xxxx;
    const int4 i20 = int4(20, 20, 20, 20);
    const uint4 u20 = uint4(20u, 20u, 20u, 20u);

    float4 sum_f = WaveActiveSum(f20);
    int4 sum_i = WaveActiveSum(i20);
    float4 prod_f = WaveActiveProduct(f20);
    int4 prod_i = WaveActiveProduct(i20);
    float4 min_f = WaveActiveMin(f20);
    float4 max_f = WaveActiveMax(f20);
    int4 min_i = WaveActiveMin(i20);
    int4 max_i = WaveActiveMax(i20);
    uint4 min_u = WaveActiveMin(u20);
    uint4 max_u = WaveActiveMax(u20);

    // Bitwise reductions on the ballot words.
    uint4 and_u = WaveActiveBitAnd(ballot);
    uint4 or_u = WaveActiveBitOr(ballot);
    uint4 xor_u = WaveActiveBitXor(ballot);

    // Same reductions on a boolean vector (ballot word == 42), widened to
    // uint for the intrinsic and narrowed back to bool afterwards.
    const uint4 probe = uint4(42u, 42u, 42u, 42u);
    const uint4 match_bits = uint4(bool4(ballot.x == probe.x,
                                         ballot.y == probe.y,
                                         ballot.z == probe.z,
                                         ballot.w == probe.w));
    bool4 and_b = bool4(WaveActiveBitAnd(match_bits));
    bool4 or_b = bool4(WaveActiveBitOr(match_bits));
    bool4 xor_b = bool4(WaveActiveBitXor(match_bits));

    // Prefix op combined with the lane's own value.
    sum_f = WavePrefixSum(sum_f) + sum_f;
    sum_i = WavePrefixSum(sum_i) + sum_i;
    prod_f = WavePrefixProduct(prod_f) * prod_f;
    prod_i = WavePrefixProduct(prod_i) * prod_i;

    // Plain prefix ops.
    sum_f = WavePrefixSum(prod_f);
    prod_f = WavePrefixProduct(prod_f);
    sum_i = WavePrefixSum(prod_i);
    prod_i = WavePrefixProduct(prod_i);

    // Quad operations.
    float4 quad_x = QuadReadAcrossX(f20);
    float4 quad_y = QuadReadAcrossY(f20);
    float4 quad_diag = QuadReadAcrossDiagonal(f20);
    float4 quad_lane_3 = QuadReadLaneAt(f20, 3u);
}

// Entry point: builds the five lane masks for the current lane and then calls
// comp_main(). Each mask is first computed for all four 32-lane windows at
// once; the per-component fix-ups that follow overwrite the windows that lie
// entirely below or entirely above the lane.
[numthreads(1, 1, 1)]
void main()
{
    const uint lane = WaveGetLaneIndex();
    const uint next_lane = lane + 1;

    // First lane covered by each mask word.
    const uint4 window_start = uint4(0, 32, 64, 96);

    // Single bit at the lane's position, and all bits below it, per window.
    const uint4 lane_bit = 1u << (lane - window_start);
    const uint4 below_lane = lane_bit - 1u;

    // All bits below (lane + 1), per window.
    const uint4 below_next = (1u << (next_lane - window_start)) - 1u;

    // Eq: only the bit of this lane.
    gl_SubgroupEqMask = lane_bit;
    if (lane >= 32)
        gl_SubgroupEqMask.x = 0;
    if (lane >= 64 || lane < 32)
        gl_SubgroupEqMask.y = 0;
    if (lane >= 96 || lane < 64)
        gl_SubgroupEqMask.z = 0;
    if (lane < 96)
        gl_SubgroupEqMask.w = 0;

    // Ge: this lane's bit and everything above it.
    gl_SubgroupGeMask = ~below_lane;
    if (lane >= 32)
        gl_SubgroupGeMask.x = 0u;
    if (lane >= 64)
        gl_SubgroupGeMask.y = 0u;
    if (lane >= 96)
        gl_SubgroupGeMask.z = 0u;
    if (lane < 32)
        gl_SubgroupGeMask.y = ~0u;
    if (lane < 64)
        gl_SubgroupGeMask.z = ~0u;
    if (lane < 96)
        gl_SubgroupGeMask.w = ~0u;

    // Gt: same construction as Ge, shifted up by one lane.
    gl_SubgroupGtMask = ~below_next;
    if (next_lane >= 32)
        gl_SubgroupGtMask.x = 0u;
    if (next_lane >= 64)
        gl_SubgroupGtMask.y = 0u;
    if (next_lane >= 96)
        gl_SubgroupGtMask.z = 0u;
    if (next_lane >= 128)
        gl_SubgroupGtMask.w = 0u;
    if (next_lane < 32)
        gl_SubgroupGtMask.y = ~0u;
    if (next_lane < 64)
        gl_SubgroupGtMask.z = ~0u;
    if (next_lane < 96)
        gl_SubgroupGtMask.w = ~0u;

    // Le: everything below (lane + 1).
    gl_SubgroupLeMask = below_next;
    if (next_lane >= 32)
        gl_SubgroupLeMask.x = ~0u;
    if (next_lane >= 64)
        gl_SubgroupLeMask.y = ~0u;
    if (next_lane >= 96)
        gl_SubgroupLeMask.z = ~0u;
    if (next_lane >= 128)
        gl_SubgroupLeMask.w = ~0u;
    if (next_lane < 32)
        gl_SubgroupLeMask.y = 0u;
    if (next_lane < 64)
        gl_SubgroupLeMask.z = 0u;
    if (next_lane < 96)
        gl_SubgroupLeMask.w = 0u;

    // Lt: everything strictly below this lane.
    gl_SubgroupLtMask = below_lane;
    if (lane >= 32)
        gl_SubgroupLtMask.x = ~0u;
    if (lane >= 64)
        gl_SubgroupLtMask.y = ~0u;
    if (lane >= 96)
        gl_SubgroupLtMask.z = ~0u;
    if (lane < 32)
        gl_SubgroupLtMask.y = 0u;
    if (lane < 64)
        gl_SubgroupLtMask.z = 0u;
    if (lane < 96)
        gl_SubgroupLtMask.w = 0u;

    comp_main();
}