#version 450

// Per-work-group sum reduction.
//
// Each work group loads up to WORKGROUP_SIZE consecutive floats from
// input_data into shared memory, sums them with a tree reduction, and writes
// one partial sum to output_data[gl_WorkGroupID.x]. Elements at or beyond
// push.input_length are treated as 0.0, so a final, partially filled work
// group still yields a correct partial sum.
//
// NOTE: the #version directive and every `//` comment must each sit on their
// own line. A line comment extends to end-of-line, so collapsing this file
// onto a single line comments out the remainder of the shader.

// Must stay equal to WORKGROUP_SIZE below, and must be a power of two for the
// halving loop in main() to visit every element.
layout(local_size_x = 256) in;

// Number of invocations per work group; also the shared scratch array length.
const uint WORKGROUP_SIZE = 256u;

layout(set = 0, binding = 0) readonly buffer InputBuffer {
    float input_data[];
};

// One float per dispatched work group is written here.
layout(set = 0, binding = 1) buffer OutputBuffer {
    float output_data[];
};

layout(push_constant) uniform PushConstants {
    uint input_length;  // number of valid elements in input_data
} push;

// Scratch space for the reduction, one slot per invocation in the work group.
shared float shared_data[WORKGROUP_SIZE];

void main() {
    uint gid = gl_GlobalInvocationID.x;
    uint lid = gl_LocalInvocationID.x;

    // Load data into shared memory; out-of-range invocations contribute 0.0
    // so they do not affect the sum.
    shared_data[lid] = (gid < push.input_length) ? input_data[gid] : 0.0;

    // Order the shared-memory writes before the execution barrier so every
    // invocation sees the fully populated array.
    memoryBarrierShared();
    barrier();

    // Reduction in shared memory: each pass folds the upper half of the
    // active range onto the lower half. Since lid < stride and
    // stride <= WORKGROUP_SIZE / 2, lid + stride is always in bounds.
    for (uint stride = WORKGROUP_SIZE >> 1; stride > 0u; stride >>= 1) {
        if (lid < stride) {
            shared_data[lid] += shared_data[lid + stride];
        }
        // barrier() sits outside the `if` so that all invocations in the
        // work group reach it (it requires uniform control flow).
        memoryBarrierShared();
        barrier();
    }

    // Write result: invocation 0 holds the work group's total.
    if (lid == 0u) {
        output_data[gl_WorkGroupID.x] = shared_data[0];
    }
}