#pragma clang diagnostic ignored "-Wmissing-prototypes" #include #include using namespace metal; template inline T spvSubgroupBroadcast(T value, ushort lane) { return simd_broadcast(value, lane); } template<> inline bool spvSubgroupBroadcast(bool value, ushort lane) { return !!simd_broadcast((ushort)value, lane); } template inline vec spvSubgroupBroadcast(vec value, ushort lane) { return (vec)simd_broadcast((vec)value, lane); } template inline T spvSubgroupBroadcastFirst(T value) { return simd_broadcast_first(value); } template<> inline bool spvSubgroupBroadcastFirst(bool value) { return !!simd_broadcast_first((ushort)value); } template inline vec spvSubgroupBroadcastFirst(vec value) { return (vec)simd_broadcast_first((vec)value); } inline uint4 spvSubgroupBallot(bool value) { simd_vote vote = simd_ballot(value); // simd_ballot() returns a 64-bit integer-like object, but // SPIR-V callers expect a uint4. We must convert. // FIXME: This won't include higher bits if Apple ever supports // 128 lanes in an SIMD-group. return uint4(as_type((simd_vote::vote_t)vote), 0, 0); } struct inputData { float inputDataArray[1]; }; struct outputData { float outputDataArray[1]; }; constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(64u, 1u, 1u); kernel void main0(device inputData& _12 [[buffer(0)]], device outputData& _87 [[buffer(1)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint gl_SubgroupInvocationID [[thread_index_in_simdgroup]]) { uint4 gl_SubgroupLtMask = uint4(extract_bits(0xFFFFFFFF, 0, min(gl_SubgroupInvocationID, 32u)), extract_bits(0xFFFFFFFF, 0, (uint)max((int)gl_SubgroupInvocationID - 32, 0)), uint2(0)); bool _31 = _12.inputDataArray[gl_LocalInvocationID.x] > 0.0; uint4 _52 = spvSubgroupBallot(_31); uint4 _66 = uint4(int4(popcount(uint4(as_type(as_type(uint2(gl_SubgroupLtMask.xy))), 0u, 0u) & uint4(as_type(as_type(uint2(_52.xy))), 0u, 0u)))); if (_31) { _87.outputDataArray[_66.x + _66.y] = _12.inputDataArray[gl_LocalInvocationID.x]; } }