// Copyright 2022 the Vello Authors // SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense // The binning stage #import config #import drawtag #import bbox #import bump @group(0) @binding(0) var config: Config; @group(0) @binding(1) var draw_monoids: array; @group(0) @binding(2) var path_bbox_buf: array; @group(0) @binding(3) var clip_bbox_buf: array>; @group(0) @binding(4) var intersected_bbox: array>; @group(0) @binding(5) var bump: BumpAllocators; @group(0) @binding(6) var bin_data: array; // TODO: put in common place struct BinHeader { element_count: u32, chunk_offset: u32, } @group(0) @binding(7) var bin_header: array; // conversion factors from coordinates to bin let SX = 0.00390625; let SY = 0.00390625; //let SX = 1.0 / f32(N_TILE_X * TILE_WIDTH); //let SY = 1.0 / f32(N_TILE_Y * TILE_HEIGHT); let WG_SIZE = 256u; let N_SLICE = 8u; //let N_SLICE = WG_SIZE / 32u; let N_SUBSLICE = 4u; var sh_bitmaps: array, N_TILE>, N_SLICE>; // store count values packed two u16's to a u32 var sh_count: array, N_SUBSLICE>; var sh_chunk_offset: array; var sh_previous_failed: u32; @compute @workgroup_size(256) fn main( @builtin(global_invocation_id) global_id: vec3, @builtin(local_invocation_id) local_id: vec3, @builtin(workgroup_id) wg_id: vec3, ) { for (var i = 0u; i < N_SLICE; i += 1u) { atomicStore(&sh_bitmaps[i][local_id.x], 0u); } if local_id.x == 0u { let failed = atomicLoad(&bump.lines) > config.lines_size; sh_previous_failed = u32(failed); } // also functions as barrier to protect zeroing of bitmaps let failed = workgroupUniformLoad(&sh_previous_failed); if failed != 0u { if global_id.x == 0u { atomicOr(&bump.failed, STAGE_FLATTEN); } return; } // Read inputs and determine coverage of bins let element_ix = global_id.x; var x0 = 0; var y0 = 0; var x1 = 0; var y1 = 0; if element_ix < config.n_drawobj { let draw_monoid = draw_monoids[element_ix]; var clip_bbox = vec4(-1e9, -1e9, 1e9, 1e9); if draw_monoid.clip_ix > 0u { // TODO: `clip_ix` should always be valid as long as the monoids are correct. Leaving // the bounds check in here for correctness but we should assert this condition instead // once there is a debug-assertion mechanism. clip_bbox = clip_bbox_buf[min(draw_monoid.clip_ix - 1u, config.n_clip - 1u)]; } // For clip elements, clip_box is the bbox of the clip path, // intersected with enclosing clips. // For other elements, it is the bbox of the enclosing clips. // TODO check this is true let path_bbox = path_bbox_buf[draw_monoid.path_ix]; let pb = vec4(vec4(path_bbox.x0, path_bbox.y0, path_bbox.x1, path_bbox.y1)); let bbox = bbox_intersect(clip_bbox, pb); intersected_bbox[element_ix] = bbox; // `bbox_intersect` can result in a zero or negative area intersection if the path bbox lies // outside the clip bbox. If that is the case, Don't round up the bottom-right corner of the // and leave the coordinates at 0. This way the path will get clipped out and won't get // assigned to a bin. if bbox.x < bbox.z && bbox.y < bbox.w { x0 = i32(floor(bbox.x * SX)); y0 = i32(floor(bbox.y * SY)); x1 = i32(ceil(bbox.z * SX)); y1 = i32(ceil(bbox.w * SY)); } } let width_in_bins = i32((config.width_in_tiles + N_TILE_X - 1u) / N_TILE_X); let height_in_bins = i32((config.height_in_tiles + N_TILE_Y - 1u) / N_TILE_Y); x0 = clamp(x0, 0, width_in_bins); y0 = clamp(y0, 0, height_in_bins); x1 = clamp(x1, 0, width_in_bins); y1 = clamp(y1, 0, height_in_bins); if x0 == x1 { y1 = y0; } var x = x0; var y = y0; let my_slice = local_id.x / 32u; let my_mask = 1u << (local_id.x & 31u); while y < y1 { atomicOr(&sh_bitmaps[my_slice][y * width_in_bins + x], my_mask); x += 1; if x == x1 { x = x0; y += 1; } } workgroupBarrier(); // Allocate output segments var element_count = 0u; for (var i = 0u; i < N_SUBSLICE; i += 1u) { element_count += countOneBits(atomicLoad(&sh_bitmaps[i * 2u][local_id.x])); let element_count_lo = element_count; element_count += countOneBits(atomicLoad(&sh_bitmaps[i * 2u + 1u][local_id.x])); let element_count_hi = element_count; let element_count_packed = element_count_lo | (element_count_hi << 16u); sh_count[i][local_id.x] = element_count_packed; } // element_count is the number of draw objects covering this thread's bin var chunk_offset = atomicAdd(&bump.binning, element_count); if chunk_offset + element_count > config.binning_size { chunk_offset = 0u; atomicOr(&bump.failed, STAGE_BINNING); } sh_chunk_offset[local_id.x] = chunk_offset; bin_header[global_id.x].element_count = element_count; bin_header[global_id.x].chunk_offset = chunk_offset; workgroupBarrier(); // loop over bbox of bins touched by this draw object x = x0; y = y0; while y < y1 { let bin_ix = y * width_in_bins + x; let out_mask = atomicLoad(&sh_bitmaps[my_slice][bin_ix]); // I think this predicate will always be true... if (out_mask & my_mask) != 0u { var idx = countOneBits(out_mask & (my_mask - 1u)); if my_slice > 0u { let count_ix = my_slice - 1u; let count_packed = sh_count[count_ix / 2u][bin_ix]; idx += (count_packed >> (16u * (count_ix & 1u))) & 0xffffu; } let offset = config.bin_data_start + sh_chunk_offset[bin_ix]; bin_data[offset + idx] = element_ix; } x += 1; if x == x1 { x = x0; y += 1; } } }