diff --git a/gpu/shaders/annotated.h b/gpu/shaders/annotated.h new file mode 100644 index 00000000..2a88ef35 --- /dev/null +++ b/gpu/shaders/annotated.h @@ -0,0 +1,239 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +// Code auto-generated by piet-gpu-derive + +struct AnnoFillRef { + uint offset; +}; + +struct AnnoFillTextureRef { + uint offset; +}; + +struct AnnoStrokeRef { + uint offset; +}; + +struct AnnoClipRef { + uint offset; +}; + +struct AnnotatedRef { + uint offset; +}; + +struct AnnoFill { + vec4 bbox; + uint rgba_color; +}; + +#define AnnoFill_size 20 + +AnnoFillRef AnnoFill_index(AnnoFillRef ref, uint index) { + return AnnoFillRef(ref.offset + index * AnnoFill_size); +} + +struct AnnoFillTexture { + vec4 bbox; + vec4 mat; + vec2 translate; + uvec2 uv_bounds; +}; + +#define AnnoFillTexture_size 48 + +AnnoFillTextureRef AnnoFillTexture_index(AnnoFillTextureRef ref, uint index) { + return AnnoFillTextureRef(ref.offset + index * AnnoFillTexture_size); +} + +struct AnnoStroke { + vec4 bbox; + uint rgba_color; + float linewidth; +}; + +#define AnnoStroke_size 24 + +AnnoStrokeRef AnnoStroke_index(AnnoStrokeRef ref, uint index) { + return AnnoStrokeRef(ref.offset + index * AnnoStroke_size); +} + +struct AnnoClip { + vec4 bbox; +}; + +#define AnnoClip_size 16 + +AnnoClipRef AnnoClip_index(AnnoClipRef ref, uint index) { + return AnnoClipRef(ref.offset + index * AnnoClip_size); +} + +#define Annotated_Nop 0 +#define Annotated_Stroke 1 +#define Annotated_Fill 2 +#define Annotated_FillTexture 3 +#define Annotated_BeginClip 4 +#define Annotated_EndClip 5 +#define Annotated_size 52 + +AnnotatedRef Annotated_index(AnnotatedRef ref, uint index) { + return AnnotatedRef(ref.offset + index * Annotated_size); +} + +AnnoFill AnnoFill_read(Alloc a, AnnoFillRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = read_mem(a, ix + 0); + uint raw1 = read_mem(a, ix + 1); + uint raw2 = read_mem(a, ix + 2); + uint raw3 = read_mem(a, ix + 3); + uint raw4 = read_mem(a, ix + 4); + AnnoFill s; + s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + s.rgba_color = raw4; + return s; +} + +void AnnoFill_write(Alloc a, AnnoFillRef ref, AnnoFill s) { + uint ix = ref.offset >> 2; + write_mem(a, ix + 0, floatBitsToUint(s.bbox.x)); + write_mem(a, ix + 1, floatBitsToUint(s.bbox.y)); + write_mem(a, ix + 2, floatBitsToUint(s.bbox.z)); + write_mem(a, ix + 3, floatBitsToUint(s.bbox.w)); + write_mem(a, ix + 4, s.rgba_color); +} + +AnnoFillTexture AnnoFillTexture_read(Alloc a, AnnoFillTextureRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = read_mem(a, ix + 0); + uint raw1 = read_mem(a, ix + 1); + uint raw2 = read_mem(a, ix + 2); + uint raw3 = read_mem(a, ix + 3); + uint raw4 = read_mem(a, ix + 4); + uint raw5 = read_mem(a, ix + 5); + uint raw6 = read_mem(a, ix + 6); + uint raw7 = read_mem(a, ix + 7); + uint raw8 = read_mem(a, ix + 8); + uint raw9 = read_mem(a, ix + 9); + uint raw10 = read_mem(a, ix + 10); + uint raw11 = read_mem(a, ix + 11); + AnnoFillTexture s; + s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + s.mat = vec4(uintBitsToFloat(raw4), uintBitsToFloat(raw5), uintBitsToFloat(raw6), uintBitsToFloat(raw7)); + s.translate = vec2(uintBitsToFloat(raw8), uintBitsToFloat(raw9)); + s.uv_bounds = uvec2(raw10, raw11); + return s; +} + +void AnnoFillTexture_write(Alloc a, AnnoFillTextureRef ref, AnnoFillTexture s) { + uint ix = ref.offset >> 2; + write_mem(a, ix + 0, floatBitsToUint(s.bbox.x)); + write_mem(a, ix + 1, floatBitsToUint(s.bbox.y)); + write_mem(a, ix + 2, floatBitsToUint(s.bbox.z)); + write_mem(a, ix + 3, floatBitsToUint(s.bbox.w)); + write_mem(a, ix + 4, floatBitsToUint(s.mat.x)); + write_mem(a, ix + 5, floatBitsToUint(s.mat.y)); + write_mem(a, ix + 6, floatBitsToUint(s.mat.z)); + write_mem(a, ix + 7, floatBitsToUint(s.mat.w)); + write_mem(a, ix + 8, floatBitsToUint(s.translate.x)); + write_mem(a, ix + 9, floatBitsToUint(s.translate.y)); + write_mem(a, ix + 10, s.uv_bounds.x); + write_mem(a, ix + 11, s.uv_bounds.y); +} + +AnnoStroke AnnoStroke_read(Alloc a, AnnoStrokeRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = read_mem(a, ix + 0); + uint raw1 = read_mem(a, ix + 1); + uint raw2 = read_mem(a, ix + 2); + uint raw3 = read_mem(a, ix + 3); + uint raw4 = read_mem(a, ix + 4); + uint raw5 = read_mem(a, ix + 5); + AnnoStroke s; + s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + s.rgba_color = raw4; + s.linewidth = uintBitsToFloat(raw5); + return s; +} + +void AnnoStroke_write(Alloc a, AnnoStrokeRef ref, AnnoStroke s) { + uint ix = ref.offset >> 2; + write_mem(a, ix + 0, floatBitsToUint(s.bbox.x)); + write_mem(a, ix + 1, floatBitsToUint(s.bbox.y)); + write_mem(a, ix + 2, floatBitsToUint(s.bbox.z)); + write_mem(a, ix + 3, floatBitsToUint(s.bbox.w)); + write_mem(a, ix + 4, s.rgba_color); + write_mem(a, ix + 5, floatBitsToUint(s.linewidth)); +} + +AnnoClip AnnoClip_read(Alloc a, AnnoClipRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = read_mem(a, ix + 0); + uint raw1 = read_mem(a, ix + 1); + uint raw2 = read_mem(a, ix + 2); + uint raw3 = read_mem(a, ix + 3); + AnnoClip s; + s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + return s; +} + +void AnnoClip_write(Alloc a, AnnoClipRef ref, AnnoClip s) { + uint ix = ref.offset >> 2; + write_mem(a, ix + 0, floatBitsToUint(s.bbox.x)); + write_mem(a, ix + 1, floatBitsToUint(s.bbox.y)); + write_mem(a, ix + 2, floatBitsToUint(s.bbox.z)); + write_mem(a, ix + 3, floatBitsToUint(s.bbox.w)); +} + +uint Annotated_tag(Alloc a, AnnotatedRef ref) { + return read_mem(a, ref.offset >> 2); +} + +AnnoStroke Annotated_Stroke_read(Alloc a, AnnotatedRef ref) { + return AnnoStroke_read(a, AnnoStrokeRef(ref.offset + 4)); +} + +AnnoFill Annotated_Fill_read(Alloc a, AnnotatedRef ref) { + return AnnoFill_read(a, AnnoFillRef(ref.offset + 4)); +} + +AnnoFillTexture Annotated_FillTexture_read(Alloc a, AnnotatedRef ref) { + return AnnoFillTexture_read(a, AnnoFillTextureRef(ref.offset + 4)); +} + +AnnoClip Annotated_BeginClip_read(Alloc a, AnnotatedRef ref) { + return AnnoClip_read(a, AnnoClipRef(ref.offset + 4)); +} + +AnnoClip Annotated_EndClip_read(Alloc a, AnnotatedRef ref) { + return AnnoClip_read(a, AnnoClipRef(ref.offset + 4)); +} + +void Annotated_Nop_write(Alloc a, AnnotatedRef ref) { + write_mem(a, ref.offset >> 2, Annotated_Nop); +} + +void Annotated_Stroke_write(Alloc a, AnnotatedRef ref, AnnoStroke s) { + write_mem(a, ref.offset >> 2, Annotated_Stroke); + AnnoStroke_write(a, AnnoStrokeRef(ref.offset + 4), s); +} + +void Annotated_Fill_write(Alloc a, AnnotatedRef ref, AnnoFill s) { + write_mem(a, ref.offset >> 2, Annotated_Fill); + AnnoFill_write(a, AnnoFillRef(ref.offset + 4), s); +} + +void Annotated_FillTexture_write(Alloc a, AnnotatedRef ref, AnnoFillTexture s) { + write_mem(a, ref.offset >> 2, Annotated_FillTexture); + AnnoFillTexture_write(a, AnnoFillTextureRef(ref.offset + 4), s); +} + +void Annotated_BeginClip_write(Alloc a, AnnotatedRef ref, AnnoClip s) { + write_mem(a, ref.offset >> 2, Annotated_BeginClip); + AnnoClip_write(a, AnnoClipRef(ref.offset + 4), s); +} + +void Annotated_EndClip_write(Alloc a, AnnotatedRef ref, AnnoClip s) { + write_mem(a, ref.offset >> 2, Annotated_EndClip); + AnnoClip_write(a, AnnoClipRef(ref.offset + 4), s); +} + diff --git a/gpu/shaders/backdrop.comp b/gpu/shaders/backdrop.comp new file mode 100644 index 00000000..04a99990 --- /dev/null +++ b/gpu/shaders/backdrop.comp @@ -0,0 +1,108 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +// Propagation of tile backdrop for filling. +// +// Each thread reads one path element and calculates the number of spanned tiles +// based on the bounding box. +// In a further compaction step, the workgroup loops over the corresponding tile rows per element in parallel. +// For each row the per tile backdrop will be read, as calculated in the previous coarse path segment kernel, +// and propagated from the left to the right (prefix summed). +// +// Output state: +// - Each path element has an array of tiles covering the whole path based on boundig box +// - Each tile per path element contains the 'backdrop' and a list of subdivided path segments + +#version 450 +#extension GL_GOOGLE_include_directive : enable + +#include "mem.h" +#include "setup.h" + +#define LG_BACKDROP_WG (7 + LG_WG_FACTOR) +#define BACKDROP_WG (1 << LG_BACKDROP_WG) + +layout(local_size_x = BACKDROP_WG, local_size_y = 1) in; + +layout(set = 0, binding = 1) readonly buffer ConfigBuf { + Config conf; +}; + +#include "annotated.h" +#include "tile.h" + +shared uint sh_row_count[BACKDROP_WG]; +shared Alloc sh_row_alloc[BACKDROP_WG]; +shared uint sh_row_width[BACKDROP_WG]; + +void main() { + if (mem_error != NO_ERROR) { + return; + } + + uint th_ix = gl_LocalInvocationID.x; + uint element_ix = gl_GlobalInvocationID.x; + AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size); + + // Work assignment: 1 thread : 1 path element + uint row_count = 0; + if (element_ix < conf.n_elements) { + uint tag = Annotated_tag(conf.anno_alloc, ref); + switch (tag) { + case Annotated_Fill: + case Annotated_FillTexture: + case Annotated_BeginClip: + PathRef path_ref = PathRef(conf.tile_alloc.offset + element_ix * Path_size); + Path path = Path_read(conf.tile_alloc, path_ref); + sh_row_width[th_ix] = path.bbox.z - path.bbox.x; + row_count = path.bbox.w - path.bbox.y; + // Paths that don't cross tile top edges don't have backdrops. + // Don't apply the optimization to paths that may cross the y = 0 + // top edge, but clipped to 1 row. + if (row_count == 1 && path.bbox.y > 0) { + // Note: this can probably be expanded to width = 2 as + // long as it doesn't cross the left edge. + row_count = 0; + } + Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size); + sh_row_alloc[th_ix] = path_alloc; + } + } + + sh_row_count[th_ix] = row_count; + // Prefix sum of sh_row_count + for (uint i = 0; i < LG_BACKDROP_WG; i++) { + barrier(); + if (th_ix >= (1 << i)) { + row_count += sh_row_count[th_ix - (1 << i)]; + } + barrier(); + sh_row_count[th_ix] = row_count; + } + barrier(); + // Work assignment: 1 thread : 1 path element row + uint total_rows = sh_row_count[BACKDROP_WG - 1]; + for (uint row = th_ix; row < total_rows; row += BACKDROP_WG) { + // Binary search to find element + uint el_ix = 0; + for (uint i = 0; i < LG_BACKDROP_WG; i++) { + uint probe = el_ix + ((BACKDROP_WG / 2) >> i); + if (row >= sh_row_count[probe - 1]) { + el_ix = probe; + } + } + uint width = sh_row_width[el_ix]; + if (width > 0) { + // Process one row sequentially + // Read backdrop value per tile and prefix sum it + Alloc tiles_alloc = sh_row_alloc[el_ix]; + uint seq_ix = row - (el_ix > 0 ? sh_row_count[el_ix - 1] : 0); + uint tile_el_ix = (tiles_alloc.offset >> 2) + 1 + seq_ix * 2 * width; + uint sum = read_mem(tiles_alloc, tile_el_ix); + for (uint x = 1; x < width; x++) { + tile_el_ix += 2; + sum += read_mem(tiles_alloc, tile_el_ix); + write_mem(tiles_alloc, tile_el_ix, sum); + } + } + } +} diff --git a/gpu/shaders/binning.comp b/gpu/shaders/binning.comp new file mode 100644 index 00000000..4c78cd24 --- /dev/null +++ b/gpu/shaders/binning.comp @@ -0,0 +1,152 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +// The binning stage of the pipeline. +// +// Each workgroup processes N_TILE paths. +// Each thread processes one path and calculates a N_TILE_X x N_TILE_Y coverage mask +// based on the path bounding box to bin the paths. + +#version 450 +#extension GL_GOOGLE_include_directive : enable + +#include "mem.h" +#include "setup.h" + +layout(local_size_x = N_TILE, local_size_y = 1) in; + +layout(set = 0, binding = 1) readonly buffer ConfigBuf { + Config conf; +}; + +#include "annotated.h" +#include "bins.h" + +// scale factors useful for converting coordinates to bins +#define SX (1.0 / float(N_TILE_X * TILE_WIDTH_PX)) +#define SY (1.0 / float(N_TILE_Y * TILE_HEIGHT_PX)) + +// Constant not available in GLSL. Also consider uintBitsToFloat(0x7f800000) +#define INFINITY (1.0 / 0.0) + +// Note: cudaraster has N_TILE + 1 to cut down on bank conflicts. +// Bitmaps are sliced (256bit into 8 (N_SLICE) 32bit submaps) +shared uint bitmaps[N_SLICE][N_TILE]; +shared uint count[N_SLICE][N_TILE]; +shared Alloc sh_chunk_alloc[N_TILE]; +shared bool sh_alloc_failed; + +void main() { + if (mem_error != NO_ERROR) { + return; + } + + uint my_n_elements = conf.n_elements; + uint my_partition = gl_WorkGroupID.x; + + for (uint i = 0; i < N_SLICE; i++) { + bitmaps[i][gl_LocalInvocationID.x] = 0; + } + if (gl_LocalInvocationID.x == 0) { + sh_alloc_failed = false; + } + barrier(); + + // Read inputs and determine coverage of bins + uint element_ix = my_partition * N_TILE + gl_LocalInvocationID.x; + AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size); + uint tag = Annotated_Nop; + if (element_ix < my_n_elements) { + tag = Annotated_tag(conf.anno_alloc, ref); + } + int x0 = 0, y0 = 0, x1 = 0, y1 = 0; + switch (tag) { + case Annotated_Fill: + case Annotated_FillTexture: + case Annotated_Stroke: + case Annotated_BeginClip: + case Annotated_EndClip: + // Note: we take advantage of the fact that these drawing elements + // have the bbox at the same place in their layout. + AnnoFill fill = Annotated_Fill_read(conf.anno_alloc, ref); + x0 = int(floor(fill.bbox.x * SX)); + y0 = int(floor(fill.bbox.y * SY)); + x1 = int(ceil(fill.bbox.z * SX)); + y1 = int(ceil(fill.bbox.w * SY)); + break; + } + + // At this point, we run an iterator over the coverage area, + // trying to keep divergence low. + // Right now, it's just a bbox, but we'll get finer with + // segments. + uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1)/N_TILE_X; + uint height_in_bins = (conf.height_in_tiles + N_TILE_Y - 1)/N_TILE_Y; + x0 = clamp(x0, 0, int(width_in_bins)); + x1 = clamp(x1, x0, int(width_in_bins)); + y0 = clamp(y0, 0, int(height_in_bins)); + y1 = clamp(y1, y0, int(height_in_bins)); + if (x0 == x1) y1 = y0; + int x = x0, y = y0; + uint my_slice = gl_LocalInvocationID.x / 32; + uint my_mask = 1 << (gl_LocalInvocationID.x & 31); + while (y < y1) { + atomicOr(bitmaps[my_slice][y * width_in_bins + x], my_mask); + x++; + if (x == x1) { + x = x0; + y++; + } + } + + barrier(); + // Allocate output segments. + uint element_count = 0; + for (uint i = 0; i < N_SLICE; i++) { + element_count += bitCount(bitmaps[i][gl_LocalInvocationID.x]); + count[i][gl_LocalInvocationID.x] = element_count; + } + // element_count is number of elements covering bin for this invocation. + Alloc chunk_alloc = new_alloc(0, 0); + if (element_count != 0) { + // TODO: aggregate atomic adds (subgroup is probably fastest) + MallocResult chunk = malloc(element_count * BinInstance_size); + chunk_alloc = chunk.alloc; + sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc; + if (chunk.failed) { + sh_alloc_failed = true; + } + } + // Note: it might be more efficient for reading to do this in the + // other order (each bin is a contiguous sequence of partitions) + uint out_ix = (conf.bin_alloc.offset >> 2) + (my_partition * N_TILE + gl_LocalInvocationID.x) * 2; + write_mem(conf.bin_alloc, out_ix, element_count); + write_mem(conf.bin_alloc, out_ix + 1, chunk_alloc.offset); + + barrier(); + if (sh_alloc_failed) { + return; + } + + // Use similar strategy as Laine & Karras paper; loop over bbox of bins + // touched by this element + x = x0; + y = y0; + while (y < y1) { + uint bin_ix = y * width_in_bins + x; + uint out_mask = bitmaps[my_slice][bin_ix]; + if ((out_mask & my_mask) != 0) { + uint idx = bitCount(out_mask & (my_mask - 1)); + if (my_slice > 0) { + idx += count[my_slice - 1][bin_ix]; + } + Alloc out_alloc = sh_chunk_alloc[bin_ix]; + uint out_offset = out_alloc.offset + idx * BinInstance_size; + BinInstance_write(out_alloc, BinInstanceRef(out_offset), BinInstance(element_ix)); + } + x++; + if (x == x1) { + x = x0; + y++; + } + } +} diff --git a/gpu/shaders/bins.h b/gpu/shaders/bins.h new file mode 100644 index 00000000..853adabe --- /dev/null +++ b/gpu/shaders/bins.h @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +// Code auto-generated by piet-gpu-derive + +struct BinInstanceRef { + uint offset; +}; + +struct BinInstance { + uint element_ix; +}; + +#define BinInstance_size 4 + +BinInstanceRef BinInstance_index(BinInstanceRef ref, uint index) { + return BinInstanceRef(ref.offset + index * BinInstance_size); +} + +BinInstance BinInstance_read(Alloc a, BinInstanceRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = read_mem(a, ix + 0); + BinInstance s; + s.element_ix = raw0; + return s; +} + +void BinInstance_write(Alloc a, BinInstanceRef ref, BinInstance s) { + uint ix = ref.offset >> 2; + write_mem(a, ix + 0, s.element_ix); +} + diff --git a/gpu/shaders/coarse.comp b/gpu/shaders/coarse.comp new file mode 100644 index 00000000..cbc69307 --- /dev/null +++ b/gpu/shaders/coarse.comp @@ -0,0 +1,430 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +// The coarse rasterizer stage of the pipeline. +// +// As input we have the ordered partitions of paths from the binning phase and +// the annotated tile list of segments and backdrop per path. +// +// Each workgroup operating on one bin by stream compacting +// the elements corresponding to the bin. +// +// As output we have an ordered command stream per tile. Every tile from a path (backdrop + segment list) will be encoded. + +#version 450 +#extension GL_GOOGLE_include_directive : enable + +#include "mem.h" +#include "setup.h" + +layout(local_size_x = N_TILE, local_size_y = 1) in; + +layout(set = 0, binding = 1) readonly buffer ConfigBuf { + Config conf; +}; + +#include "annotated.h" +#include "bins.h" +#include "tile.h" +#include "ptcl.h" + +#define LG_N_PART_READ (7 + LG_WG_FACTOR) +#define N_PART_READ (1 << LG_N_PART_READ) + +shared uint sh_elements[N_TILE]; + +// Number of elements in the partition; prefix sum. +shared uint sh_part_count[N_PART_READ]; +shared Alloc sh_part_elements[N_PART_READ]; + +shared uint sh_bitmaps[N_SLICE][N_TILE]; + +shared uint sh_tile_count[N_TILE]; +// The width of the tile rect for the element, intersected with this bin +shared uint sh_tile_width[N_TILE]; +shared uint sh_tile_x0[N_TILE]; +shared uint sh_tile_y0[N_TILE]; + +// These are set up so base + tile_y * stride + tile_x points to a Tile. +shared uint sh_tile_base[N_TILE]; +shared uint sh_tile_stride[N_TILE]; + +#ifdef MEM_DEBUG +// Store allocs only when MEM_DEBUG to save shared memory traffic. +shared Alloc sh_tile_alloc[N_TILE]; + +void write_tile_alloc(uint el_ix, Alloc a) { + sh_tile_alloc[el_ix] = a; +} + +Alloc read_tile_alloc(uint el_ix) { + return sh_tile_alloc[el_ix]; +} +#else +void write_tile_alloc(uint el_ix, Alloc a) { + // No-op +} + +Alloc read_tile_alloc(uint el_ix) { + // All memory. + return new_alloc(0, memory.length()*4); +} +#endif + +// Perhaps cmd_alloc should be a global? This is a style question. +bool alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit) { + if (cmd_ref.offset < cmd_limit) { + return true; + } + MallocResult new_cmd = malloc(PTCL_INITIAL_ALLOC); + if (new_cmd.failed) { + return false; + } + CmdJump jump = CmdJump(new_cmd.alloc.offset); + Cmd_Jump_write(cmd_alloc, cmd_ref, jump); + cmd_alloc = new_cmd.alloc; + cmd_ref = CmdRef(cmd_alloc.offset); + cmd_limit = cmd_alloc.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size; + return true; +} + +void main() { + if (mem_error != NO_ERROR) { + return; + } + + // Could use either linear or 2d layouts for both dispatch and + // invocations within the workgroup. We'll use variables to abstract. + uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1)/N_TILE_X; + uint bin_ix = width_in_bins * gl_WorkGroupID.y + gl_WorkGroupID.x; + uint partition_ix = 0; + uint n_partitions = (conf.n_elements + N_TILE - 1) / N_TILE; + uint th_ix = gl_LocalInvocationID.x; + + // Coordinates of top left of bin, in tiles. + uint bin_tile_x = N_TILE_X * gl_WorkGroupID.x; + uint bin_tile_y = N_TILE_Y * gl_WorkGroupID.y; + + // Per-tile state + uint tile_x = gl_LocalInvocationID.x % N_TILE_X; + uint tile_y = gl_LocalInvocationID.x / N_TILE_X; + uint this_tile_ix = (bin_tile_y + tile_y) * conf.width_in_tiles + bin_tile_x + tile_x; + Alloc cmd_alloc = slice_mem(conf.ptcl_alloc, this_tile_ix * PTCL_INITIAL_ALLOC, PTCL_INITIAL_ALLOC); + CmdRef cmd_ref = CmdRef(cmd_alloc.offset); + uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size; + // The nesting depth of the clip stack + uint clip_depth = 0; + // State for the "clip zero" optimization. If it's nonzero, then we are + // currently in a clip for which the entire tile has an alpha of zero, and + // the value is the depth after the "begin clip" of that element. + uint clip_zero_depth = 0; + // State for the "clip one" optimization. If bit `i` is set, then that means + // that the clip pushed at depth `i` has an alpha of all one. + uint clip_one_mask = 0; + + // I'm sure we can figure out how to do this with at least one fewer register... + // Items up to rd_ix have been read from sh_elements + uint rd_ix = 0; + // Items up to wr_ix have been written into sh_elements + uint wr_ix = 0; + // Items between part_start_ix and ready_ix are ready to be transferred from sh_part_elements + uint part_start_ix = 0; + uint ready_ix = 0; + + while (true) { + for (uint i = 0; i < N_SLICE; i++) { + sh_bitmaps[i][th_ix] = 0; + } + + // parallel read of input partitions + do { + if (ready_ix == wr_ix && partition_ix < n_partitions) { + part_start_ix = ready_ix; + uint count = 0; + if (th_ix < N_PART_READ && partition_ix + th_ix < n_partitions) { + uint in_ix = (conf.bin_alloc.offset >> 2) + ((partition_ix + th_ix) * N_TILE + bin_ix) * 2; + count = read_mem(conf.bin_alloc, in_ix); + uint offset = read_mem(conf.bin_alloc, in_ix + 1); + sh_part_elements[th_ix] = new_alloc(offset, count*BinInstance_size); + } + // prefix sum of counts + for (uint i = 0; i < LG_N_PART_READ; i++) { + if (th_ix < N_PART_READ) { + sh_part_count[th_ix] = count; + } + barrier(); + if (th_ix < N_PART_READ) { + if (th_ix >= (1 << i)) { + count += sh_part_count[th_ix - (1 << i)]; + } + } + barrier(); + } + if (th_ix < N_PART_READ) { + sh_part_count[th_ix] = part_start_ix + count; + } + barrier(); + ready_ix = sh_part_count[N_PART_READ - 1]; + partition_ix += N_PART_READ; + } + // use binary search to find element to read + uint ix = rd_ix + th_ix; + if (ix >= wr_ix && ix < ready_ix) { + uint part_ix = 0; + for (uint i = 0; i < LG_N_PART_READ; i++) { + uint probe = part_ix + ((N_PART_READ / 2) >> i); + if (ix >= sh_part_count[probe - 1]) { + part_ix = probe; + } + } + ix -= part_ix > 0 ? sh_part_count[part_ix - 1] : part_start_ix; + Alloc bin_alloc = sh_part_elements[part_ix]; + BinInstanceRef inst_ref = BinInstanceRef(bin_alloc.offset); + BinInstance inst = BinInstance_read(bin_alloc, BinInstance_index(inst_ref, ix)); + sh_elements[th_ix] = inst.element_ix; + } + barrier(); + + wr_ix = min(rd_ix + N_TILE, ready_ix); + } while (wr_ix - rd_ix < N_TILE && (wr_ix < ready_ix || partition_ix < n_partitions)); + + // We've done the merge and filled the buffer. + + // Read one element, compute coverage. + uint tag = Annotated_Nop; + uint element_ix; + AnnotatedRef ref; + if (th_ix + rd_ix < wr_ix) { + element_ix = sh_elements[th_ix]; + ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size); + tag = Annotated_tag(conf.anno_alloc, ref); + } + + // Bounding box of element in pixel coordinates. + uint tile_count; + switch (tag) { + case Annotated_Fill: + case Annotated_FillTexture: + case Annotated_Stroke: + case Annotated_BeginClip: + case Annotated_EndClip: + // We have one "path" for each element, even if the element isn't + // actually a path (currently EndClip, but images etc in the future). + uint path_ix = element_ix; + Path path = Path_read(conf.tile_alloc, PathRef(conf.tile_alloc.offset + path_ix * Path_size)); + uint stride = path.bbox.z - path.bbox.x; + sh_tile_stride[th_ix] = stride; + int dx = int(path.bbox.x) - int(bin_tile_x); + int dy = int(path.bbox.y) - int(bin_tile_y); + int x0 = clamp(dx, 0, N_TILE_X); + int y0 = clamp(dy, 0, N_TILE_Y); + int x1 = clamp(int(path.bbox.z) - int(bin_tile_x), 0, N_TILE_X); + int y1 = clamp(int(path.bbox.w) - int(bin_tile_y), 0, N_TILE_Y); + sh_tile_width[th_ix] = uint(x1 - x0); + sh_tile_x0[th_ix] = x0; + sh_tile_y0[th_ix] = y0; + tile_count = uint(x1 - x0) * uint(y1 - y0); + // base relative to bin + uint base = path.tiles.offset - uint(dy * stride + dx) * Tile_size; + sh_tile_base[th_ix] = base; + Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size); + write_tile_alloc(th_ix, path_alloc); + break; + default: + tile_count = 0; + break; + } + + // Prefix sum of sh_tile_count + sh_tile_count[th_ix] = tile_count; + for (uint i = 0; i < LG_N_TILE; i++) { + barrier(); + if (th_ix >= (1 << i)) { + tile_count += sh_tile_count[th_ix - (1 << i)]; + } + barrier(); + sh_tile_count[th_ix] = tile_count; + } + barrier(); + uint total_tile_count = sh_tile_count[N_TILE - 1]; + for (uint ix = th_ix; ix < total_tile_count; ix += N_TILE) { + // Binary search to find element + uint el_ix = 0; + for (uint i = 0; i < LG_N_TILE; i++) { + uint probe = el_ix + ((N_TILE / 2) >> i); + if (ix >= sh_tile_count[probe - 1]) { + el_ix = probe; + } + } + AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + sh_elements[el_ix] * Annotated_size); + uint tag = Annotated_tag(conf.anno_alloc, ref); + uint seq_ix = ix - (el_ix > 0 ? sh_tile_count[el_ix - 1] : 0); + uint width = sh_tile_width[el_ix]; + uint x = sh_tile_x0[el_ix] + seq_ix % width; + uint y = sh_tile_y0[el_ix] + seq_ix / width; + bool include_tile; + if (tag == Annotated_BeginClip || tag == Annotated_EndClip) { + include_tile = true; + } else { + Tile tile = Tile_read(read_tile_alloc(el_ix), TileRef(sh_tile_base[el_ix] + (sh_tile_stride[el_ix] * y + x) * Tile_size)); + // Include the path in the tile if + // - the tile contains at least a segment (tile offset non-zero) + // - the tile is completely covered (backdrop non-zero) + include_tile = tile.tile.offset != 0 || tile.backdrop != 0; + } + if (include_tile) { + uint el_slice = el_ix / 32; + uint el_mask = 1 << (el_ix & 31); + atomicOr(sh_bitmaps[el_slice][y * N_TILE_X + x], el_mask); + } + } + + barrier(); + + // Output non-segment elements for this tile. The thread does a sequential walk + // through the non-segment elements. + uint slice_ix = 0; + uint bitmap = sh_bitmaps[0][th_ix]; + while (true) { + if (bitmap == 0) { + slice_ix++; + if (slice_ix == N_SLICE) { + break; + } + bitmap = sh_bitmaps[slice_ix][th_ix]; + if (bitmap == 0) { + continue; + } + } + uint element_ref_ix = slice_ix * 32 + findLSB(bitmap); + uint element_ix = sh_elements[element_ref_ix]; + + // Clear LSB + bitmap &= bitmap - 1; + + // At this point, we read the element again from global memory. + // If that turns out to be expensive, maybe we can pack it into + // shared memory (or perhaps just the tag). + ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size); + tag = Annotated_tag(conf.anno_alloc, ref); + + if (clip_zero_depth == 0) { + switch (tag) { + case Annotated_Fill: + Tile tile = Tile_read(read_tile_alloc(element_ref_ix), TileRef(sh_tile_base[element_ref_ix] + + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size)); + AnnoFill fill = Annotated_Fill_read(conf.anno_alloc, ref); + if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) { + break; + } + if (tile.tile.offset != 0) { + CmdFill cmd_fill; + cmd_fill.tile_ref = tile.tile.offset; + cmd_fill.backdrop = tile.backdrop; + cmd_fill.rgba_color = fill.rgba_color; + Cmd_Fill_write(cmd_alloc, cmd_ref, cmd_fill); + } else { + Cmd_Solid_write(cmd_alloc, cmd_ref, CmdSolid(fill.rgba_color)); + } + cmd_ref.offset += Cmd_size; + break; + case Annotated_FillTexture: + tile = Tile_read(read_tile_alloc(element_ref_ix), TileRef(sh_tile_base[element_ref_ix] + + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size)); + AnnoFillTexture fill_tex = Annotated_FillTexture_read(conf.anno_alloc, ref); + if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) { + break; + } + if (tile.tile.offset != 0) { + CmdFillTexture cmd_fill_tex; + cmd_fill_tex.tile_ref = tile.tile.offset; + cmd_fill_tex.backdrop = tile.backdrop; + cmd_fill_tex.mat = fill_tex.mat; + cmd_fill_tex.translate = fill_tex.translate; + cmd_fill_tex.uv_bounds = fill_tex.uv_bounds; + Cmd_FillTexture_write(cmd_alloc, cmd_ref, cmd_fill_tex); + } else { + CmdSolidTexture cmd_solid_tex; + cmd_solid_tex.mat = fill_tex.mat; + cmd_solid_tex.translate = fill_tex.translate; + cmd_solid_tex.uv_bounds = fill_tex.uv_bounds; + Cmd_SolidTexture_write(cmd_alloc, cmd_ref, cmd_solid_tex); + } + cmd_ref.offset += Cmd_size; + break; + case Annotated_BeginClip: + tile = Tile_read(read_tile_alloc(element_ref_ix), TileRef(sh_tile_base[element_ref_ix] + + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size)); + if (tile.tile.offset == 0 && tile.backdrop == 0) { + clip_zero_depth = clip_depth + 1; + } else if (tile.tile.offset == 0 && clip_depth < 32) { + clip_one_mask |= (1 << clip_depth); + } else { + if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) { + break; + } + if (tile.tile.offset != 0) { + CmdBeginClip cmd_begin_clip; + cmd_begin_clip.tile_ref = tile.tile.offset; + cmd_begin_clip.backdrop = tile.backdrop; + Cmd_BeginClip_write(cmd_alloc, cmd_ref, cmd_begin_clip); + } else { + // TODO: here is where a bunch of optimization magic should happen + float alpha = tile.backdrop == 0 ? 0.0 : 1.0; + Cmd_BeginSolidClip_write(cmd_alloc, cmd_ref, CmdBeginSolidClip(alpha)); + } + cmd_ref.offset += Cmd_size; + if (clip_depth < 32) { + clip_one_mask &= ~(1 << clip_depth); + } + } + clip_depth++; + break; + case Annotated_EndClip: + clip_depth--; + if (clip_depth >= 32 || (clip_one_mask & (1 << clip_depth)) == 0) { + if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) { + break; + } + Cmd_EndClip_write(cmd_alloc, cmd_ref, CmdEndClip(1.0)); + cmd_ref.offset += Cmd_size; + } + break; + case Annotated_Stroke: + tile = Tile_read(read_tile_alloc(element_ref_ix), TileRef(sh_tile_base[element_ref_ix] + + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size)); + AnnoStroke stroke = Annotated_Stroke_read(conf.anno_alloc, ref); + CmdStroke cmd_stroke; + cmd_stroke.tile_ref = tile.tile.offset; + cmd_stroke.half_width = 0.5 * stroke.linewidth; + cmd_stroke.rgba_color = stroke.rgba_color; + if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) { + break; + } + Cmd_Stroke_write(cmd_alloc, cmd_ref, cmd_stroke); + cmd_ref.offset += Cmd_size; + break; + } + } else { + // In "clip zero" state, suppress all drawing + switch (tag) { + case Annotated_BeginClip: + clip_depth++; + break; + case Annotated_EndClip: + if (clip_depth == clip_zero_depth) { + clip_zero_depth = 0; + } + clip_depth--; + break; + } + } + } + barrier(); + + rd_ix += N_TILE; + if (rd_ix >= ready_ix && partition_ix >= n_partitions) break; + } + if (bin_tile_x + tile_x < conf.width_in_tiles && bin_tile_y + tile_y < conf.height_in_tiles) { + Cmd_End_write(cmd_alloc, cmd_ref); + } +} diff --git a/gpu/shaders/elements.comp b/gpu/shaders/elements.comp new file mode 100644 index 00000000..a43c270f --- /dev/null +++ b/gpu/shaders/elements.comp @@ -0,0 +1,441 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +// The element processing stage, first in the pipeline. +// +// This stage is primarily about applying transforms and computing bounding +// boxes. It is organized as a scan over the input elements, producing +// annotated output elements. + +#version 450 +#extension GL_GOOGLE_include_directive : enable + +#include "mem.h" +#include "setup.h" + +#define N_ROWS 4 +#define WG_SIZE 32 +#define LG_WG_SIZE 5 +#define PARTITION_SIZE (WG_SIZE * N_ROWS) + +layout(local_size_x = WG_SIZE, local_size_y = 1) in; + +layout(set = 0, binding = 1) readonly buffer ConfigBuf { + Config conf; +}; + +layout(set = 0, binding = 2) readonly buffer SceneBuf { + uint[] scene; +}; + +// It would be better to use the Vulkan memory model than +// "volatile" but shooting for compatibility here rather +// than doing things right. +layout(set = 0, binding = 3) volatile buffer StateBuf { + uint part_counter; + uint[] state; +}; + +#include "scene.h" +#include "state.h" +#include "annotated.h" +#include "pathseg.h" + +#define StateBuf_stride (4 + 2 * State_size) + +StateRef state_aggregate_ref(uint partition_ix) { + return StateRef(4 + partition_ix * StateBuf_stride); +} + +StateRef state_prefix_ref(uint partition_ix) { + return StateRef(4 + partition_ix * StateBuf_stride + State_size); +} + +uint state_flag_index(uint partition_ix) { + return partition_ix * (StateBuf_stride / 4); +} + +// These correspond to X, A, P respectively in the prefix sum paper. +#define FLAG_NOT_READY 0 +#define FLAG_AGGREGATE_READY 1 +#define FLAG_PREFIX_READY 2 + +#define FLAG_SET_LINEWIDTH 1 +#define FLAG_SET_BBOX 2 +#define FLAG_RESET_BBOX 4 + +// This is almost like a monoid (the interaction between transformation and +// bounding boxes is approximate) +State combine_state(State a, State b) { + State c; + c.bbox.x = min(a.mat.x * b.bbox.x, a.mat.x * b.bbox.z) + min(a.mat.z * b.bbox.y, a.mat.z * b.bbox.w) + a.translate.x; + c.bbox.y = min(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z) + min(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y; + c.bbox.z = max(a.mat.x * b.bbox.x, a.mat.x * b.bbox.z) + max(a.mat.z * b.bbox.y, a.mat.z * b.bbox.w) + a.translate.x; + c.bbox.w = max(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z) + max(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y; + if ((a.flags & FLAG_RESET_BBOX) == 0 && b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y) { + c.bbox = a.bbox; + } else if ((a.flags & FLAG_RESET_BBOX) == 0 && (b.flags & FLAG_SET_BBOX) == 0 && + (a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y)) + { + c.bbox.xy = min(a.bbox.xy, c.bbox.xy); + c.bbox.zw = max(a.bbox.zw, c.bbox.zw); + } + // It would be more concise to cast to matrix types; ah well. + c.mat.x = a.mat.x * b.mat.x + a.mat.z * b.mat.y; + c.mat.y = a.mat.y * b.mat.x + a.mat.w * b.mat.y; + c.mat.z = a.mat.x * b.mat.z + a.mat.z * b.mat.w; + c.mat.w = a.mat.y * b.mat.z + a.mat.w * b.mat.w; + c.translate.x = a.mat.x * b.translate.x + a.mat.z * b.translate.y + a.translate.x; + c.translate.y = a.mat.y * b.translate.x + a.mat.w * b.translate.y + a.translate.y; + c.linewidth = (b.flags & FLAG_SET_LINEWIDTH) == 0 ? a.linewidth : b.linewidth; + c.flags = (a.flags & (FLAG_SET_LINEWIDTH | FLAG_SET_BBOX)) | b.flags; + c.flags |= (a.flags & FLAG_RESET_BBOX) >> 1; + c.path_count = a.path_count + b.path_count; + c.pathseg_count = a.pathseg_count + b.pathseg_count; + return c; +} + +State map_element(ElementRef ref) { + // TODO: it would *probably* be more efficient to make the memory read patterns less + // divergent, though it would be more wasted memory. + uint tag = Element_tag(ref); + State c; + c.bbox = vec4(0.0, 0.0, 0.0, 0.0); + c.mat = vec4(1.0, 0.0, 0.0, 1.0); + c.translate = vec2(0.0, 0.0); + c.linewidth = 1.0; // TODO should be 0.0 + c.flags = 0; + c.path_count = 0; + c.pathseg_count = 0; + switch (tag) { + case Element_FillLine: + case Element_StrokeLine: + LineSeg line = Element_FillLine_read(ref); + c.bbox.xy = min(line.p0, line.p1); + c.bbox.zw = max(line.p0, line.p1); + c.pathseg_count = 1; + break; + case Element_FillQuad: + case Element_StrokeQuad: + QuadSeg quad = Element_FillQuad_read(ref); + c.bbox.xy = min(min(quad.p0, quad.p1), quad.p2); + c.bbox.zw = max(max(quad.p0, quad.p1), quad.p2); + c.pathseg_count = 1; + break; + case Element_FillCubic: + case Element_StrokeCubic: + CubicSeg cubic = Element_FillCubic_read(ref); + c.bbox.xy = min(min(cubic.p0, cubic.p1), min(cubic.p2, cubic.p3)); + c.bbox.zw = max(max(cubic.p0, cubic.p1), max(cubic.p2, cubic.p3)); + c.pathseg_count = 1; + break; + case Element_Fill: + case Element_FillTexture: + case Element_Stroke: + case Element_BeginClip: + c.flags = FLAG_RESET_BBOX; + c.path_count = 1; + break; + case Element_EndClip: + c.path_count = 1; + break; + case Element_SetLineWidth: + SetLineWidth lw = Element_SetLineWidth_read(ref); + c.linewidth = lw.width; + c.flags = FLAG_SET_LINEWIDTH; + break; + case Element_Transform: + Transform t = Element_Transform_read(ref); + c.mat = t.mat; + c.translate = t.translate; + break; + } + return c; +} + +// Get the bounding box of a circle transformed by the matrix into an ellipse. +vec2 get_linewidth(State st) { + // See https://www.iquilezles.org/www/articles/ellipses/ellipses.htm + return 0.5 * st.linewidth * vec2(length(st.mat.xz), length(st.mat.yw)); +} + +// We should be able to use an array of structs but the NV shader compiler +// doesn't seem to like it :/ +//shared State sh_state[WG_SIZE]; +shared vec4 sh_mat[WG_SIZE]; +shared vec2 sh_translate[WG_SIZE]; +shared vec4 sh_bbox[WG_SIZE]; +shared float sh_width[WG_SIZE]; +shared uint sh_flags[WG_SIZE]; +shared uint sh_path_count[WG_SIZE]; +shared uint sh_pathseg_count[WG_SIZE]; + +shared uint sh_part_ix; +shared State sh_prefix; + +void main() { + if (mem_error != NO_ERROR) { + return; + } + + State th_state[N_ROWS]; + // Determine partition to process by atomic counter (described in Section + // 4.4 of prefix sum paper). + if (gl_LocalInvocationID.x == 0) { + sh_part_ix = atomicAdd(part_counter, 1); + } + barrier(); + uint part_ix = sh_part_ix; + + uint ix = part_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS; + ElementRef ref = ElementRef(ix * Element_size); + + th_state[0] = map_element(ref); + for (uint i = 1; i < N_ROWS; i++) { + // discussion question: would it be faster to load using more coherent patterns + // into thread memory? This is kinda strided. + th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i))); + } + State agg = th_state[N_ROWS - 1]; + sh_mat[gl_LocalInvocationID.x] = agg.mat; + sh_translate[gl_LocalInvocationID.x] = agg.translate; + sh_bbox[gl_LocalInvocationID.x] = agg.bbox; + sh_width[gl_LocalInvocationID.x] = agg.linewidth; + sh_flags[gl_LocalInvocationID.x] = agg.flags; + sh_path_count[gl_LocalInvocationID.x] = agg.path_count; + sh_pathseg_count[gl_LocalInvocationID.x] = agg.pathseg_count; + for (uint i = 0; i < LG_WG_SIZE; i++) { + barrier(); + if (gl_LocalInvocationID.x >= (1 << i)) { + State other; + uint ix = gl_LocalInvocationID.x - (1 << i); + other.mat = sh_mat[ix]; + other.translate = sh_translate[ix]; + other.bbox = sh_bbox[ix]; + other.linewidth = sh_width[ix]; + other.flags = sh_flags[ix]; + other.path_count = sh_path_count[ix]; + other.pathseg_count = sh_pathseg_count[ix]; + agg = combine_state(other, agg); + } + barrier(); + sh_mat[gl_LocalInvocationID.x] = agg.mat; + sh_translate[gl_LocalInvocationID.x] = agg.translate; + sh_bbox[gl_LocalInvocationID.x] = agg.bbox; + sh_width[gl_LocalInvocationID.x] = agg.linewidth; + sh_flags[gl_LocalInvocationID.x] = agg.flags; + sh_path_count[gl_LocalInvocationID.x] = agg.path_count; + sh_pathseg_count[gl_LocalInvocationID.x] = agg.pathseg_count; + } + + State exclusive; + exclusive.bbox = vec4(0.0, 0.0, 0.0, 0.0); + exclusive.mat = vec4(1.0, 0.0, 0.0, 1.0); + exclusive.translate = vec2(0.0, 0.0); + exclusive.linewidth = 1.0; //TODO should be 0.0 + exclusive.flags = 0; + exclusive.path_count = 0; + exclusive.pathseg_count = 0; + + // Publish aggregate for this partition + if (gl_LocalInvocationID.x == WG_SIZE - 1) { + // Note: with memory model, we'd want to generate the atomic store version of this. + State_write(state_aggregate_ref(part_ix), agg); + uint flag = FLAG_AGGREGATE_READY; + memoryBarrierBuffer(); + if (part_ix == 0) { + State_write(state_prefix_ref(part_ix), agg); + flag = FLAG_PREFIX_READY; + } + state[state_flag_index(part_ix)] = flag; + if (part_ix != 0) { + // step 4 of paper: decoupled lookback + uint look_back_ix = part_ix - 1; + + State their_agg; + uint their_ix = 0; + while (true) { + flag = state[state_flag_index(look_back_ix)]; + if (flag == FLAG_PREFIX_READY) { + State their_prefix = State_read(state_prefix_ref(look_back_ix)); + exclusive = combine_state(their_prefix, exclusive); + break; + } else if (flag == FLAG_AGGREGATE_READY) { + their_agg = State_read(state_aggregate_ref(look_back_ix)); + exclusive = combine_state(their_agg, exclusive); + look_back_ix--; + their_ix = 0; + continue; + } + // else spin + + // Unfortunately there's no guarantee of forward progress of other + // workgroups, so compute a bit of the aggregate before trying again. + // In the worst case, spinning stops when the aggregate is complete. + ElementRef ref = ElementRef((look_back_ix * PARTITION_SIZE + their_ix) * Element_size); + State s = map_element(ref); + if (their_ix == 0) { + their_agg = s; + } else { + their_agg = combine_state(their_agg, s); + } + their_ix++; + if (their_ix == PARTITION_SIZE) { + exclusive = combine_state(their_agg, exclusive); + if (look_back_ix == 0) { + break; + } + look_back_ix--; + their_ix = 0; + } + } + + // step 5 of paper: compute inclusive prefix + State inclusive_prefix = combine_state(exclusive, agg); + sh_prefix = exclusive; + State_write(state_prefix_ref(part_ix), inclusive_prefix); + memoryBarrierBuffer(); + flag = FLAG_PREFIX_READY; + state[state_flag_index(part_ix)] = flag; + } + } + barrier(); + if (part_ix != 0) { + exclusive = sh_prefix; + } + + State row = exclusive; + if (gl_LocalInvocationID.x > 0) { + uint ix = gl_LocalInvocationID.x - 1; + State other; + other.mat = sh_mat[ix]; + other.translate = sh_translate[ix]; + other.bbox = sh_bbox[ix]; + other.linewidth = sh_width[ix]; + other.flags = sh_flags[ix]; + other.path_count = sh_path_count[ix]; + other.pathseg_count = sh_pathseg_count[ix]; + row = combine_state(row, other); + } + for (uint i = 0; i < N_ROWS; i++) { + State st = combine_state(row, th_state[i]); + + // Here we read again from the original scene. There may be + // gains to be had from stashing in shared memory or possibly + // registers (though register pressure is an issue). + ElementRef this_ref = Element_index(ref, i); + uint tag = Element_tag(this_ref); + switch (tag) { + case Element_FillLine: + case Element_StrokeLine: + LineSeg line = Element_StrokeLine_read(this_ref); + vec2 p0 = st.mat.xy * line.p0.x + st.mat.zw * line.p0.y + st.translate; + vec2 p1 = st.mat.xy * line.p1.x + st.mat.zw * line.p1.y + st.translate; + PathStrokeCubic path_cubic; + path_cubic.p0 = p0; + path_cubic.p1 = mix(p0, p1, 1.0 / 3.0); + path_cubic.p2 = mix(p1, p0, 1.0 / 3.0); + path_cubic.p3 = p1; + path_cubic.path_ix = st.path_count; + if (tag == Element_StrokeLine) { + path_cubic.stroke = get_linewidth(st); + } else { + path_cubic.stroke = vec2(0.0); + } + // We do encoding a bit by hand to minimize divergence. Another approach + // would be to have a fill/stroke bool. + PathSegRef path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size); + uint out_tag = tag == Element_FillLine ? PathSeg_FillCubic : PathSeg_StrokeCubic; + write_mem(conf.pathseg_alloc, path_out_ref.offset >> 2, out_tag); + PathStrokeCubic_write(conf.pathseg_alloc, PathStrokeCubicRef(path_out_ref.offset + 4), path_cubic); + break; + case Element_FillQuad: + case Element_StrokeQuad: + QuadSeg quad = Element_StrokeQuad_read(this_ref); + p0 = st.mat.xy * quad.p0.x + st.mat.zw * quad.p0.y + st.translate; + p1 = st.mat.xy * quad.p1.x + st.mat.zw * quad.p1.y + st.translate; + vec2 p2 = st.mat.xy * quad.p2.x + st.mat.zw * quad.p2.y + st.translate; + path_cubic; + path_cubic.p0 = p0; + path_cubic.p1 = mix(p1, p0, 1.0 / 3.0); + path_cubic.p2 = mix(p1, p2, 1.0 / 3.0); + path_cubic.p3 = p2; + path_cubic.path_ix = st.path_count; + if (tag == Element_StrokeQuad) { + path_cubic.stroke = get_linewidth(st); + } else { + path_cubic.stroke = vec2(0.0); + } + // We do encoding a bit by hand to minimize divergence. Another approach + // would be to have a fill/stroke bool. + path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size); + out_tag = tag == Element_FillQuad ? PathSeg_FillCubic : PathSeg_StrokeCubic; + write_mem(conf.pathseg_alloc, path_out_ref.offset >> 2, out_tag); + PathStrokeCubic_write(conf.pathseg_alloc, PathStrokeCubicRef(path_out_ref.offset + 4), path_cubic); + break; + case Element_FillCubic: + case Element_StrokeCubic: + CubicSeg cubic = Element_StrokeCubic_read(this_ref); + path_cubic; + path_cubic.p0 = st.mat.xy * cubic.p0.x + st.mat.zw * cubic.p0.y + st.translate; + path_cubic.p1 = st.mat.xy * cubic.p1.x + st.mat.zw * cubic.p1.y + st.translate; + path_cubic.p2 = st.mat.xy * cubic.p2.x + st.mat.zw * cubic.p2.y + st.translate; + path_cubic.p3 = st.mat.xy * cubic.p3.x + st.mat.zw * cubic.p3.y + st.translate; + path_cubic.path_ix = st.path_count; + if (tag == Element_StrokeCubic) { + path_cubic.stroke = get_linewidth(st); + } else { + path_cubic.stroke = vec2(0.0); + } + // We do encoding a bit by hand to minimize divergence. Another approach + // would be to have a fill/stroke bool. + path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size); + out_tag = tag == Element_FillCubic ? PathSeg_FillCubic : PathSeg_StrokeCubic; + write_mem(conf.pathseg_alloc, path_out_ref.offset >> 2, out_tag); + PathStrokeCubic_write(conf.pathseg_alloc, PathStrokeCubicRef(path_out_ref.offset + 4), path_cubic); + break; + case Element_Stroke: + Stroke stroke = Element_Stroke_read(this_ref); + AnnoStroke anno_stroke; + anno_stroke.rgba_color = stroke.rgba_color; + vec2 lw = get_linewidth(st); + anno_stroke.bbox = st.bbox + vec4(-lw, lw); + anno_stroke.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z)); + AnnotatedRef out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size); + Annotated_Stroke_write(conf.anno_alloc, out_ref, anno_stroke); + break; + case Element_Fill: + Fill fill = Element_Fill_read(this_ref); + AnnoFill anno_fill; + anno_fill.rgba_color = fill.rgba_color; + anno_fill.bbox = st.bbox; + out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size); + Annotated_Fill_write(conf.anno_alloc, out_ref, anno_fill); + break; + case Element_FillTexture: + FillTexture fill_tex = Element_FillTexture_read(this_ref); + AnnoFillTexture anno_fill_tex; + anno_fill_tex.uv_bounds = fill_tex.uv_bounds; + anno_fill_tex.bbox = st.bbox; + anno_fill_tex.mat = st.mat; + anno_fill_tex.translate = st.translate; + out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size); + Annotated_FillTexture_write(conf.anno_alloc, out_ref, anno_fill_tex); + break; + case Element_BeginClip: + Clip begin_clip = Element_BeginClip_read(this_ref); + AnnoClip anno_begin_clip = AnnoClip(begin_clip.bbox); + // This is the absolute bbox, it's been transformed during encoding. + anno_begin_clip.bbox = begin_clip.bbox; + out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size); + Annotated_BeginClip_write(conf.anno_alloc, out_ref, anno_begin_clip); + break; + case Element_EndClip: + Clip end_clip = Element_EndClip_read(this_ref); + // This bbox is expected to be the same as the begin one. + AnnoClip anno_end_clip = AnnoClip(end_clip.bbox); + out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size); + Annotated_EndClip_write(conf.anno_alloc, out_ref, anno_end_clip); + break; + } + } +} diff --git a/gpu/shaders/kernel4.comp b/gpu/shaders/kernel4.comp new file mode 100644 index 00000000..d5b44d1f --- /dev/null +++ b/gpu/shaders/kernel4.comp @@ -0,0 +1,302 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +// This is "kernel 4" in a 4-kernel pipeline. It renders the commands +// in the per-tile command list to an image. + +// Right now, this kernel stores the image in a buffer, but a better +// plan is to use a texture. This is because of limited support. + +#version 450 +#extension GL_GOOGLE_include_directive : enable +#ifdef VULKAN +#extension GL_EXT_nonuniform_qualifier : enable +#endif + +#include "mem.h" +#include "setup.h" + +#define CHUNK 8 +#define CHUNK_DY (TILE_HEIGHT_PX / CHUNK) +layout(local_size_x = TILE_WIDTH_PX, local_size_y = CHUNK_DY) in; + +layout(set = 0, binding = 1) readonly buffer ConfigBuf { + Config conf; +}; + +layout(rgba8, set = 0, binding = 2) uniform writeonly image2D image; + +#ifdef VULKAN +layout(set = 0, binding = 3) uniform sampler2D textures[]; +#else +layout(set = 0, binding = 3) uniform sampler2D atlas; +#endif + +#include "ptcl.h" +#include "tile.h" + +#define BLEND_STACK_SIZE 4 + +// Layout of a clip scratch frame: +// Each frame is WIDTH * HEIGHT 32-bit words, then a link reference. + +// Link offset and frame size in 32-bit words. +#define CLIP_LINK_OFFSET (TILE_WIDTH_PX * TILE_HEIGHT_PX) +#define CLIP_BUF_SIZE (CLIP_LINK_OFFSET + 1) + +shared MallocResult sh_clip_alloc; + +// Allocate a scratch buffer for clipping. +MallocResult alloc_clip_buf(uint link) { + if (gl_LocalInvocationID.x == 0 && gl_LocalInvocationID.y == 0) { + MallocResult m = malloc(CLIP_BUF_SIZE * 4); + if (!m.failed) { + write_mem(m.alloc, (m.alloc.offset >> 2) + CLIP_LINK_OFFSET, link); + } + sh_clip_alloc = m; + } + barrier(); + return sh_clip_alloc; +} + +// Calculate coverage based on backdrop + coverage of each line segment +float[CHUNK] computeArea(vec2 xy, int backdrop, uint tile_ref) { + // Probably better to store as float, but conversion is no doubt cheap. + float area[CHUNK]; + for (uint k = 0; k < CHUNK; k++) area[k] = float(backdrop); + TileSegRef tile_seg_ref = TileSegRef(tile_ref); + do { + TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size), tile_seg_ref); + for (uint k = 0; k < CHUNK; k++) { + vec2 my_xy = vec2(xy.x, xy.y + float(k * CHUNK_DY)); + vec2 start = seg.origin - my_xy; + vec2 end = start + seg.vector; + vec2 window = clamp(vec2(start.y, end.y), 0.0, 1.0); + if (window.x != window.y) { + vec2 t = (window - start.y) / seg.vector.y; + vec2 xs = vec2(mix(start.x, end.x, t.x), mix(start.x, end.x, t.y)); + float xmin = min(min(xs.x, xs.y), 1.0) - 1e-6; + float xmax = max(xs.x, xs.y); + float b = min(xmax, 1.0); + float c = max(b, 0.0); + float d = max(xmin, 0.0); + float a = (b + 0.5 * (d * d - c * c) - xmin) / (xmax - xmin); + area[k] += a * (window.x - window.y); + } + area[k] += sign(seg.vector.x) * clamp(my_xy.y - seg.y_edge + 1.0, 0.0, 1.0); + } + tile_seg_ref = seg.next; + } while (tile_seg_ref.offset != 0); + for (uint k = 0; k < CHUNK; k++) { + area[k] = min(abs(area[k]), 1.0); + } + return area; +} + +vec4[CHUNK] fillTexture(vec2 xy, CmdSolidTexture cmd_tex) { + vec2 uvmin = unpackUnorm2x16(cmd_tex.uv_bounds.x); + vec2 uvmax = unpackUnorm2x16(cmd_tex.uv_bounds.y); + vec4 rgba[CHUNK]; + for (uint i = 0; i < CHUNK; i++) { + float dy = float(i * CHUNK_DY); + vec2 uv = vec2(xy.x, xy.y + dy) + vec2(0.5, 0.5); + uv = cmd_tex.mat.xy * uv.x + cmd_tex.mat.zw * uv.y + cmd_tex.translate; + uv = clamp(uv, uvmin, uvmax); +#ifdef VULKAN + vec4 fg_rgba = textureGrad(textures[0], uv, cmd_tex.mat.xy, cmd_tex.mat.zw); +#else + vec4 fg_rgba = textureGrad(atlas, uv, cmd_tex.mat.xy, cmd_tex.mat.zw); +#endif + rgba[i] = fg_rgba; + } + return rgba; +} + +vec3 tosRGB(vec3 rgb) { + bvec3 cutoff = greaterThanEqual(rgb, vec3(0.0031308)); + vec3 below = vec3(12.92)*rgb; + vec3 above = vec3(1.055)*pow(rgb, vec3(0.41666)) - vec3(0.055); + return mix(below, above, cutoff); +} + +// unpacksRGB unpacks a color in the sRGB color space to a vec4 in the linear color +// space. +vec4 unpacksRGB(uint srgba) { + vec4 color = unpackUnorm4x8(srgba).wzyx; + // Formula from EXT_sRGB. + vec3 rgb = color.rgb; + bvec3 cutoff = greaterThanEqual(rgb, vec3(0.04045)); + vec3 below = rgb/vec3(12.92); + vec3 above = pow((rgb + vec3(0.055))/vec3(1.055), vec3(2.4)); + rgb = mix(below, above, cutoff); + return vec4(rgb, color.a); +} + +// packsRGB packs a color in the linear color space into its 8-bit sRGB equivalent. +uint packsRGB(vec4 rgba) { + rgba = vec4(tosRGB(rgba.rgb), rgba.a); + return packUnorm4x8(rgba.wzyx); +} + +void main() { + if (mem_error != NO_ERROR) { + return; + } + + uint tile_ix = gl_WorkGroupID.y * conf.width_in_tiles + gl_WorkGroupID.x; + Alloc cmd_alloc = slice_mem(conf.ptcl_alloc, tile_ix * PTCL_INITIAL_ALLOC, PTCL_INITIAL_ALLOC); + CmdRef cmd_ref = CmdRef(cmd_alloc.offset); + + uvec2 xy_uint = uvec2(gl_GlobalInvocationID.x, gl_LocalInvocationID.y + TILE_HEIGHT_PX * gl_WorkGroupID.y); + vec2 xy = vec2(xy_uint); + vec3 rgb[CHUNK]; + float mask[CHUNK]; + uint blend_stack[BLEND_STACK_SIZE][CHUNK]; + uint blend_spill = 0; + uint blend_sp = 0; + Alloc clip_tos = new_alloc(0, 0); + for (uint i = 0; i < CHUNK; i++) { + rgb[i] = vec3(0.5); +#ifdef VULKAN + if (xy_uint.x < 1024 && xy_uint.y < 1024) { + rgb[i] = texture(textures[gl_WorkGroupID.x / 64], vec2(xy_uint.x, xy_uint.y + CHUNK_DY * i) / 1024.0).rgb; + } +#endif + mask[i] = 1.0; + } + + while (true) { + uint tag = Cmd_tag(cmd_alloc, cmd_ref); + if (tag == Cmd_End) { + break; + } + switch (tag) { + case Cmd_Circle: + CmdCircle circle = Cmd_Circle_read(cmd_alloc, cmd_ref); + vec4 fg_rgba = unpacksRGB(circle.rgba_color); + for (uint i = 0; i < CHUNK; i++) { + float dy = float(i * CHUNK_DY); + float r = length(vec2(xy.x, xy.y + dy) + vec2(0.5, 0.5) - circle.center.xy); + float alpha = clamp(0.5 + circle.radius - r, 0.0, 1.0); + rgb[i] = mix(rgb[i], fg_rgba.rgb, mask[i] * alpha * fg_rgba.a); + } + break; + case Cmd_Stroke: + // Calculate distance field from all the line segments in this tile. + CmdStroke stroke = Cmd_Stroke_read(cmd_alloc, cmd_ref); + float df[CHUNK]; + for (uint k = 0; k < CHUNK; k++) df[k] = 1e9; + TileSegRef tile_seg_ref = TileSegRef(stroke.tile_ref); + do { + TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size), tile_seg_ref); + vec2 line_vec = seg.vector; + for (uint k = 0; k < CHUNK; k++) { + vec2 dpos = xy + vec2(0.5, 0.5) - seg.origin; + dpos.y += float(k * CHUNK_DY); + float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0); + df[k] = min(df[k], length(line_vec * t - dpos)); + } + tile_seg_ref = seg.next; + } while (tile_seg_ref.offset != 0); + fg_rgba = unpacksRGB(stroke.rgba_color); + for (uint k = 0; k < CHUNK; k++) { + float alpha = clamp(stroke.half_width + 0.5 - df[k], 0.0, 1.0); + rgb[k] = mix(rgb[k], fg_rgba.rgb, mask[k] * alpha * fg_rgba.a); + } + break; + case Cmd_Fill: + CmdFill fill = Cmd_Fill_read(cmd_alloc, cmd_ref); + float area[CHUNK]; + area = computeArea(xy, fill.backdrop, fill.tile_ref); + fg_rgba = unpacksRGB(fill.rgba_color); + for (uint k = 0; k < CHUNK; k++) { + rgb[k] = mix(rgb[k], fg_rgba.rgb, mask[k] * area[k] * fg_rgba.a); + } + break; + case Cmd_FillTexture: + CmdFillTexture fill_tex = Cmd_FillTexture_read(cmd_alloc, cmd_ref); + area = computeArea(xy, fill_tex.backdrop, fill_tex.tile_ref); + vec4 rgba[CHUNK] = fillTexture(xy, CmdSolidTexture(fill_tex.mat, fill_tex.translate, fill_tex.uv_bounds)); + for (uint k = 0; k < CHUNK; k++) { + rgb[k] = mix(rgb[k], rgba[k].rgb, mask[k] * area[k] * rgba[k].a); + } + break; + case Cmd_BeginClip: + case Cmd_BeginSolidClip: + uint blend_slot = blend_sp % BLEND_STACK_SIZE; + if (blend_sp == blend_spill + BLEND_STACK_SIZE) { + // spill to scratch buffer + MallocResult m = alloc_clip_buf(clip_tos.offset); + if (m.failed) { + return; + } + clip_tos = m.alloc; + uint base_ix = (clip_tos.offset >> 2) + gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y; + for (uint k = 0; k < CHUNK; k++) { + write_mem(clip_tos, base_ix + k * TILE_WIDTH_PX * CHUNK_DY, blend_stack[blend_slot][k]); + } + blend_spill++; + } + if (tag == Cmd_BeginClip) { + CmdBeginClip begin_clip = Cmd_BeginClip_read(cmd_alloc, cmd_ref); + area = computeArea(xy, begin_clip.backdrop, begin_clip.tile_ref); + for (uint k = 0; k < CHUNK; k++) { + blend_stack[blend_slot][k] = packsRGB(vec4(rgb[k], clamp(abs(area[k]), 0.0, 1.0))); + } + } else { + CmdBeginSolidClip begin_solid_clip = Cmd_BeginSolidClip_read(cmd_alloc, cmd_ref); + float solid_alpha = begin_solid_clip.alpha; + for (uint k = 0; k < CHUNK; k++) { + blend_stack[blend_slot][k] = packsRGB(vec4(rgb[k], solid_alpha)); + } + } + blend_sp++; + break; + case Cmd_EndClip: + CmdEndClip end_clip = Cmd_EndClip_read(cmd_alloc, cmd_ref); + blend_slot = (blend_sp - 1) % BLEND_STACK_SIZE; + if (blend_sp == blend_spill) { + uint base_ix = (clip_tos.offset >> 2) + gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y; + for (uint k = 0; k < CHUNK; k++) { + blend_stack[blend_slot][k] = read_mem(clip_tos, base_ix + k * TILE_WIDTH_PX * CHUNK_DY); + } + clip_tos.offset = read_mem(clip_tos, (clip_tos.offset >> 2) + CLIP_LINK_OFFSET); + blend_spill--; + } + blend_sp--; + for (uint k = 0; k < CHUNK; k++) { + vec4 rgba = unpacksRGB(blend_stack[blend_slot][k]); + rgb[k] = mix(rgba.rgb, rgb[k], end_clip.alpha * rgba.a); + } + break; + case Cmd_Solid: + CmdSolid solid = Cmd_Solid_read(cmd_alloc, cmd_ref); + fg_rgba = unpacksRGB(solid.rgba_color); + for (uint k = 0; k < CHUNK; k++) { + rgb[k] = mix(rgb[k], fg_rgba.rgb, mask[k] * fg_rgba.a); + } + break; + case Cmd_SolidTexture: + CmdSolidTexture solid_tex = Cmd_SolidTexture_read(cmd_alloc, cmd_ref); + rgba = fillTexture(xy, solid_tex); + for (uint k = 0; k < CHUNK; k++) { + rgb[k] = mix(rgb[k], rgba[k].rgb, mask[k] * rgba[k].a); + } + break; + case Cmd_SolidMask: + CmdSolidMask solid_mask = Cmd_SolidMask_read(cmd_alloc, cmd_ref); + for (uint k = 0; k < CHUNK; k++) { + mask[k] = solid_mask.mask; + } + break; + case Cmd_Jump: + cmd_ref = CmdRef(Cmd_Jump_read(cmd_alloc, cmd_ref).new_ref); + cmd_alloc.offset = cmd_ref.offset; + continue; + } + cmd_ref.offset += Cmd_size; + } + + for (uint i = 0; i < CHUNK; i++) { + imageStore(image, ivec2(xy_uint.x, xy_uint.y + CHUNK_DY * i), vec4(tosRGB(rgb[i]), 1.0)); + } +} diff --git a/gpu/shaders/mem.h b/gpu/shaders/mem.h new file mode 100644 index 00000000..bc851906 --- /dev/null +++ b/gpu/shaders/mem.h @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +layout(set = 0, binding = 0) buffer Memory { + // offset into memory of the next allocation, initialized by the user. + uint mem_offset; + // mem_error tracks the status of memory accesses, initialized to NO_ERROR + // by the user. ERR_MALLOC_FAILED is reported for insufficient memory. + // If MEM_DEBUG is defined the following errors are reported: + // - ERR_OUT_OF_BOUNDS is reported for out of bounds writes. + // - ERR_UNALIGNED_ACCESS for memory access not aligned to 32-bit words. + uint mem_error; + uint[] memory; +}; + +// Uncomment this line to add the size field to Alloc and enable memory checks. +// Note that the Config struct in setup.h grows size fields as well. +//#define MEM_DEBUG + +#define NO_ERROR 0 +#define ERR_MALLOC_FAILED 1 +#define ERR_OUT_OF_BOUNDS 2 +#define ERR_UNALIGNED_ACCESS 3 + +#define Alloc_size 8 + +// Alloc represents a memory allocation. +struct Alloc { + // offset in bytes into memory. + uint offset; +#ifdef MEM_DEBUG + // size in bytes of the allocation. + uint size; +#endif +}; + +struct MallocResult { + Alloc alloc; + // failed is true if the allocation overflowed memory. + bool failed; +}; + +// new_alloc synthesizes an Alloc when its offset and size is derived. +Alloc new_alloc(uint offset, uint size) { + Alloc a; + a.offset = offset; +#ifdef MEM_DEBUG + a.size = size; +#endif + return a; +} + +// malloc allocates size bytes of memory. +MallocResult malloc(uint size) { + MallocResult r; + r.failed = false; + uint offset = atomicAdd(mem_offset, size); + r.alloc = new_alloc(offset, size); + if (offset + size > memory.length() * 4) { + r.failed = true; + atomicMax(mem_error, ERR_MALLOC_FAILED); + return r; + } +#ifdef MEM_DEBUG + if ((size & 3) != 0) { + r.failed = true; + atomicMax(mem_error, ERR_UNALIGNED_ACCESS); + return r; + } +#endif + return r; +} + +// touch_mem checks whether access to the memory word at offset is valid. +// If MEM_DEBUG is defined, touch_mem returns false if offset is out of bounds. +// Note that offset is in words. +bool touch_mem(Alloc alloc, uint offset) { +#ifdef MEM_DEBUG + if (offset < alloc.offset/4 || offset >= (alloc.offset + alloc.size)/4) { + atomicMax(mem_error, ERR_OUT_OF_BOUNDS); + return false; + } +#endif + return true; +} + +// write_mem writes val to memory at offset. +// Note that offset is in words. +void write_mem(Alloc alloc, uint offset, uint val) { + if (!touch_mem(alloc, offset)) { + return; + } + memory[offset] = val; +} + +// read_mem reads the value from memory at offset. +// Note that offset is in words. +uint read_mem(Alloc alloc, uint offset) { + if (!touch_mem(alloc, offset)) { + return 0; + } + uint v = memory[offset]; + return v; +} + +// slice_mem returns a sub-allocation inside another. Note that offset and size +// are in bytes, relative to a.offset. +Alloc slice_mem(Alloc a, uint offset, uint size) { +#ifdef MEM_DEBUG + if ((offset & 3) != 0 || (size & 3) != 0) { + atomicMax(mem_error, ERR_UNALIGNED_ACCESS); + return Alloc(0, 0); + } + if (offset + size > a.size) { + // slice_mem is sometimes used for slices outside bounds, + // but never written. + return Alloc(0, 0); + } +#endif + return new_alloc(a.offset + offset, size); +} diff --git a/gpu/shaders/path_coarse.comp b/gpu/shaders/path_coarse.comp new file mode 100644 index 00000000..4f77ff9b --- /dev/null +++ b/gpu/shaders/path_coarse.comp @@ -0,0 +1,282 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +// Coarse rasterization of path segments. + +// Allocation and initialization of tiles for paths. + +#version 450 +#extension GL_GOOGLE_include_directive : enable + +#include "mem.h" +#include "setup.h" + +#define LG_COARSE_WG 5 +#define COARSE_WG (1 << LG_COARSE_WG) + +layout(local_size_x = COARSE_WG, local_size_y = 1) in; + +layout(set = 0, binding = 1) readonly buffer ConfigBuf { + Config conf; +}; + +#include "pathseg.h" +#include "tile.h" + +// scale factors useful for converting coordinates to tiles +#define SX (1.0 / float(TILE_WIDTH_PX)) +#define SY (1.0 / float(TILE_HEIGHT_PX)) + +#define ACCURACY 0.25 +#define Q_ACCURACY (ACCURACY * 0.1) +#define REM_ACCURACY (ACCURACY - Q_ACCURACY) +#define MAX_HYPOT2 (432.0 * Q_ACCURACY * Q_ACCURACY) + +vec2 eval_quad(vec2 p0, vec2 p1, vec2 p2, float t) { + float mt = 1.0 - t; + return p0 * (mt * mt) + (p1 * (mt * 2.0) + p2 * t) * t; +} + +vec2 eval_cubic(vec2 p0, vec2 p1, vec2 p2, vec2 p3, float t) { + float mt = 1.0 - t; + return p0 * (mt * mt * mt) + (p1 * (mt * mt * 3.0) + (p2 * (mt * 3.0) + p3 * t) * t) * t; +} + +struct SubdivResult { + float val; + float a0; + float a2; +}; + +/// An approximation to $\int (1 + 4x^2) ^ -0.25 dx$ +/// +/// This is used for flattening curves. +#define D 0.67 +float approx_parabola_integral(float x) { + return x * inversesqrt(sqrt(1.0 - D + (D * D * D * D + 0.25 * x * x))); +} + +/// An approximation to the inverse parabola integral. +#define B 0.39 +float approx_parabola_inv_integral(float x) { + return x * sqrt(1.0 - B + (B * B + 0.25 * x * x)); +} + +SubdivResult estimate_subdiv(vec2 p0, vec2 p1, vec2 p2, float sqrt_tol) { + vec2 d01 = p1 - p0; + vec2 d12 = p2 - p1; + vec2 dd = d01 - d12; + float cross = (p2.x - p0.x) * dd.y - (p2.y - p0.y) * dd.x; + float x0 = (d01.x * dd.x + d01.y * dd.y) / cross; + float x2 = (d12.x * dd.x + d12.y * dd.y) / cross; + float scale = abs(cross / (length(dd) * (x2 - x0))); + + float a0 = approx_parabola_integral(x0); + float a2 = approx_parabola_integral(x2); + float val = 0.0; + if (scale < 1e9) { + float da = abs(a2 - a0); + float sqrt_scale = sqrt(scale); + if (sign(x0) == sign(x2)) { + val = da * sqrt_scale; + } else { + float xmin = sqrt_tol / sqrt_scale; + val = sqrt_tol * da / approx_parabola_integral(xmin); + } + } + return SubdivResult(val, a0, a2); +} + +void main() { + if (mem_error != NO_ERROR) { + return; + } + + uint element_ix = gl_GlobalInvocationID.x; + PathSegRef ref = PathSegRef(conf.pathseg_alloc.offset + element_ix * PathSeg_size); + + uint tag = PathSeg_Nop; + if (element_ix < conf.n_pathseg) { + tag = PathSeg_tag(conf.pathseg_alloc, ref); + } + switch (tag) { + case PathSeg_FillCubic: + case PathSeg_StrokeCubic: + PathStrokeCubic cubic = PathSeg_StrokeCubic_read(conf.pathseg_alloc, ref); + vec2 err_v = 3.0 * (cubic.p2 - cubic.p1) + cubic.p0 - cubic.p3; + float err = err_v.x * err_v.x + err_v.y * err_v.y; + // The number of quadratics. + uint n_quads = max(uint(ceil(pow(err * (1.0 / MAX_HYPOT2), 1.0 / 6.0))), 1); + // Iterate over quadratics and tote up the estimated number of segments. + float val = 0.0; + vec2 qp0 = cubic.p0; + float step = 1.0 / float(n_quads); + for (uint i = 0; i < n_quads; i++) { + float t = float(i + 1) * step; + vec2 qp2 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t); + vec2 qp1 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t - 0.5 * step); + qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2); + SubdivResult params = estimate_subdiv(qp0, qp1, qp2, sqrt(REM_ACCURACY)); + val += params.val; + + qp0 = qp2; + } + uint n = max(uint(ceil(val * 0.5 / sqrt(REM_ACCURACY))), 1); + + uint path_ix = cubic.path_ix; + Path path = Path_read(conf.tile_alloc, PathRef(conf.tile_alloc.offset + path_ix * Path_size)); + Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size); + ivec4 bbox = ivec4(path.bbox); + vec2 p0 = cubic.p0; + qp0 = cubic.p0; + float v_step = val / float(n); + int n_out = 1; + float val_sum = 0.0; + for (uint i = 0; i < n_quads; i++) { + float t = float(i + 1) * step; + vec2 qp2 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t); + vec2 qp1 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t - 0.5 * step); + qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2); + SubdivResult params = estimate_subdiv(qp0, qp1, qp2, sqrt(REM_ACCURACY)); + float u0 = approx_parabola_inv_integral(params.a0); + float u2 = approx_parabola_inv_integral(params.a2); + float uscale = 1.0 / (u2 - u0); + float target = float(n_out) * v_step; + while (n_out == n || target < val_sum + params.val) { + vec2 p1; + if (n_out == n) { + p1 = cubic.p3; + } else { + float u = (target - val_sum) / params.val; + float a = mix(params.a0, params.a2, u); + float au = approx_parabola_inv_integral(a); + float t = (au - u0) * uscale; + p1 = eval_quad(qp0, qp1, qp2, t); + } + + // Output line segment + + // Bounding box of element in pixel coordinates. + float xmin = min(p0.x, p1.x) - cubic.stroke.x; + float xmax = max(p0.x, p1.x) + cubic.stroke.x; + float ymin = min(p0.y, p1.y) - cubic.stroke.y; + float ymax = max(p0.y, p1.y) + cubic.stroke.y; + float dx = p1.x - p0.x; + float dy = p1.y - p0.y; + // Set up for per-scanline coverage formula, below. + float invslope = abs(dy) < 1e-9 ? 1e9 : dx / dy; + float c = (cubic.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + cubic.stroke.y)) * SX; + float b = invslope; // Note: assumes square tiles, otherwise scale. + float a = (p0.x - (p0.y - 0.5 * float(TILE_HEIGHT_PX)) * b) * SX; + + int x0 = int(floor(xmin * SX)); + int x1 = int(floor(xmax * SX) + 1); + int y0 = int(floor(ymin * SY)); + int y1 = int(floor(ymax * SY) + 1); + + x0 = clamp(x0, bbox.x, bbox.z); + y0 = clamp(y0, bbox.y, bbox.w); + x1 = clamp(x1, bbox.x, bbox.z); + y1 = clamp(y1, bbox.y, bbox.w); + float xc = a + b * float(y0); + int stride = bbox.z - bbox.x; + int base = (y0 - bbox.y) * stride - bbox.x; + // TODO: can be tighter, use c to bound width + uint n_tile_alloc = uint((x1 - x0) * (y1 - y0)); + // Consider using subgroups to aggregate atomic add. + MallocResult tile_alloc = malloc(n_tile_alloc * TileSeg_size); + if (tile_alloc.failed) { + return; + } + uint tile_offset = tile_alloc.alloc.offset; + + TileSeg tile_seg; + + int xray = int(floor(p0.x*SX)); + int last_xray = int(floor(p1.x*SX)); + if (p0.y > p1.y) { + int tmp = xray; + xray = last_xray; + last_xray = tmp; + } + for (int y = y0; y < y1; y++) { + float tile_y0 = float(y * TILE_HEIGHT_PX); + int xbackdrop = max(xray + 1, bbox.x); + if (tag == PathSeg_FillCubic && min(p0.y, p1.y) < tile_y0 && xbackdrop < bbox.z) { + int backdrop = p1.y < p0.y ? 1 : -1; + TileRef tile_ref = Tile_index(path.tiles, uint(base + xbackdrop)); + uint tile_el = tile_ref.offset >> 2; + if (touch_mem(path_alloc, tile_el + 1)) { + atomicAdd(memory[tile_el + 1], backdrop); + } + } + + // next_xray is the xray for the next scanline; the line segment intersects + // all tiles between xray and next_xray. + int next_xray = last_xray; + if (y < y1 - 1) { + float tile_y1 = float((y + 1) * TILE_HEIGHT_PX); + float x_edge = mix(p0.x, p1.x, (tile_y1 - p0.y) / dy); + next_xray = int(floor(x_edge*SX)); + } + + int min_xray = min(xray, next_xray); + int max_xray = max(xray, next_xray); + int xx0 = min(int(floor(xc - c)), min_xray); + int xx1 = max(int(ceil(xc + c)), max_xray + 1); + xx0 = clamp(xx0, x0, x1); + xx1 = clamp(xx1, x0, x1); + + for (int x = xx0; x < xx1; x++) { + float tile_x0 = float(x * TILE_WIDTH_PX); + TileRef tile_ref = Tile_index(TileRef(path.tiles.offset), uint(base + x)); + uint tile_el = tile_ref.offset >> 2; + uint old = 0; + if (touch_mem(path_alloc, tile_el)) { + old = atomicExchange(memory[tile_el], tile_offset); + } + tile_seg.origin = p0; + tile_seg.vector = p1 - p0; + float y_edge = 0.0; + if (tag == PathSeg_FillCubic) { + y_edge = mix(p0.y, p1.y, (tile_x0 - p0.x) / dx); + if (min(p0.x, p1.x) < tile_x0) { + vec2 p = vec2(tile_x0, y_edge); + if (p0.x > p1.x) { + tile_seg.vector = p - p0; + } else { + tile_seg.origin = p; + tile_seg.vector = p1 - p; + } + // kernel4 uses sign(vector.x) for the sign of the intersection backdrop. + // Nudge zeroes towards the intended sign. + if (tile_seg.vector.x == 0) { + tile_seg.vector.x = sign(p1.x - p0.x)*1e-9; + } + } + if (x <= min_xray || max_xray < x) { + // Reject inconsistent intersections. + y_edge = 1e9; + } + } + tile_seg.y_edge = y_edge; + tile_seg.next.offset = old; + TileSeg_write(tile_alloc.alloc, TileSegRef(tile_offset), tile_seg); + tile_offset += TileSeg_size; + } + xc += b; + base += stride; + xray = next_xray; + } + + n_out += 1; + target += v_step; + p0 = p1; + } + val_sum += params.val; + + qp0 = qp2; + } + + break; + } +} diff --git a/gpu/shaders/pathseg.h b/gpu/shaders/pathseg.h new file mode 100644 index 00000000..00509fbf --- /dev/null +++ b/gpu/shaders/pathseg.h @@ -0,0 +1,255 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +// Code auto-generated by piet-gpu-derive + +struct PathFillLineRef { + uint offset; +}; + +struct PathStrokeLineRef { + uint offset; +}; + +struct PathFillCubicRef { + uint offset; +}; + +struct PathStrokeCubicRef { + uint offset; +}; + +struct PathSegRef { + uint offset; +}; + +struct PathFillLine { + vec2 p0; + vec2 p1; + uint path_ix; +}; + +#define PathFillLine_size 20 + +PathFillLineRef PathFillLine_index(PathFillLineRef ref, uint index) { + return PathFillLineRef(ref.offset + index * PathFillLine_size); +} + +struct PathStrokeLine { + vec2 p0; + vec2 p1; + uint path_ix; + vec2 stroke; +}; + +#define PathStrokeLine_size 28 + +PathStrokeLineRef PathStrokeLine_index(PathStrokeLineRef ref, uint index) { + return PathStrokeLineRef(ref.offset + index * PathStrokeLine_size); +} + +struct PathFillCubic { + vec2 p0; + vec2 p1; + vec2 p2; + vec2 p3; + uint path_ix; +}; + +#define PathFillCubic_size 36 + +PathFillCubicRef PathFillCubic_index(PathFillCubicRef ref, uint index) { + return PathFillCubicRef(ref.offset + index * PathFillCubic_size); +} + +struct PathStrokeCubic { + vec2 p0; + vec2 p1; + vec2 p2; + vec2 p3; + uint path_ix; + vec2 stroke; +}; + +#define PathStrokeCubic_size 44 + +PathStrokeCubicRef PathStrokeCubic_index(PathStrokeCubicRef ref, uint index) { + return PathStrokeCubicRef(ref.offset + index * PathStrokeCubic_size); +} + +#define PathSeg_Nop 0 +#define PathSeg_FillLine 1 +#define PathSeg_StrokeLine 2 +#define PathSeg_FillCubic 3 +#define PathSeg_StrokeCubic 4 +#define PathSeg_size 48 + +PathSegRef PathSeg_index(PathSegRef ref, uint index) { + return PathSegRef(ref.offset + index * PathSeg_size); +} + +PathFillLine PathFillLine_read(Alloc a, PathFillLineRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = read_mem(a, ix + 0); + uint raw1 = read_mem(a, ix + 1); + uint raw2 = read_mem(a, ix + 2); + uint raw3 = read_mem(a, ix + 3); + uint raw4 = read_mem(a, ix + 4); + PathFillLine s; + s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); + s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + s.path_ix = raw4; + return s; +} + +void PathFillLine_write(Alloc a, PathFillLineRef ref, PathFillLine s) { + uint ix = ref.offset >> 2; + write_mem(a, ix + 0, floatBitsToUint(s.p0.x)); + write_mem(a, ix + 1, floatBitsToUint(s.p0.y)); + write_mem(a, ix + 2, floatBitsToUint(s.p1.x)); + write_mem(a, ix + 3, floatBitsToUint(s.p1.y)); + write_mem(a, ix + 4, s.path_ix); +} + +PathStrokeLine PathStrokeLine_read(Alloc a, PathStrokeLineRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = read_mem(a, ix + 0); + uint raw1 = read_mem(a, ix + 1); + uint raw2 = read_mem(a, ix + 2); + uint raw3 = read_mem(a, ix + 3); + uint raw4 = read_mem(a, ix + 4); + uint raw5 = read_mem(a, ix + 5); + uint raw6 = read_mem(a, ix + 6); + PathStrokeLine s; + s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); + s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + s.path_ix = raw4; + s.stroke = vec2(uintBitsToFloat(raw5), uintBitsToFloat(raw6)); + return s; +} + +void PathStrokeLine_write(Alloc a, PathStrokeLineRef ref, PathStrokeLine s) { + uint ix = ref.offset >> 2; + write_mem(a, ix + 0, floatBitsToUint(s.p0.x)); + write_mem(a, ix + 1, floatBitsToUint(s.p0.y)); + write_mem(a, ix + 2, floatBitsToUint(s.p1.x)); + write_mem(a, ix + 3, floatBitsToUint(s.p1.y)); + write_mem(a, ix + 4, s.path_ix); + write_mem(a, ix + 5, floatBitsToUint(s.stroke.x)); + write_mem(a, ix + 6, floatBitsToUint(s.stroke.y)); +} + +PathFillCubic PathFillCubic_read(Alloc a, PathFillCubicRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = read_mem(a, ix + 0); + uint raw1 = read_mem(a, ix + 1); + uint raw2 = read_mem(a, ix + 2); + uint raw3 = read_mem(a, ix + 3); + uint raw4 = read_mem(a, ix + 4); + uint raw5 = read_mem(a, ix + 5); + uint raw6 = read_mem(a, ix + 6); + uint raw7 = read_mem(a, ix + 7); + uint raw8 = read_mem(a, ix + 8); + PathFillCubic s; + s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); + s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + s.p2 = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5)); + s.p3 = vec2(uintBitsToFloat(raw6), uintBitsToFloat(raw7)); + s.path_ix = raw8; + return s; +} + +void PathFillCubic_write(Alloc a, PathFillCubicRef ref, PathFillCubic s) { + uint ix = ref.offset >> 2; + write_mem(a, ix + 0, floatBitsToUint(s.p0.x)); + write_mem(a, ix + 1, floatBitsToUint(s.p0.y)); + write_mem(a, ix + 2, floatBitsToUint(s.p1.x)); + write_mem(a, ix + 3, floatBitsToUint(s.p1.y)); + write_mem(a, ix + 4, floatBitsToUint(s.p2.x)); + write_mem(a, ix + 5, floatBitsToUint(s.p2.y)); + write_mem(a, ix + 6, floatBitsToUint(s.p3.x)); + write_mem(a, ix + 7, floatBitsToUint(s.p3.y)); + write_mem(a, ix + 8, s.path_ix); +} + +PathStrokeCubic PathStrokeCubic_read(Alloc a, PathStrokeCubicRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = read_mem(a, ix + 0); + uint raw1 = read_mem(a, ix + 1); + uint raw2 = read_mem(a, ix + 2); + uint raw3 = read_mem(a, ix + 3); + uint raw4 = read_mem(a, ix + 4); + uint raw5 = read_mem(a, ix + 5); + uint raw6 = read_mem(a, ix + 6); + uint raw7 = read_mem(a, ix + 7); + uint raw8 = read_mem(a, ix + 8); + uint raw9 = read_mem(a, ix + 9); + uint raw10 = read_mem(a, ix + 10); + PathStrokeCubic s; + s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); + s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + s.p2 = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5)); + s.p3 = vec2(uintBitsToFloat(raw6), uintBitsToFloat(raw7)); + s.path_ix = raw8; + s.stroke = vec2(uintBitsToFloat(raw9), uintBitsToFloat(raw10)); + return s; +} + +void PathStrokeCubic_write(Alloc a, PathStrokeCubicRef ref, PathStrokeCubic s) { + uint ix = ref.offset >> 2; + write_mem(a, ix + 0, floatBitsToUint(s.p0.x)); + write_mem(a, ix + 1, floatBitsToUint(s.p0.y)); + write_mem(a, ix + 2, floatBitsToUint(s.p1.x)); + write_mem(a, ix + 3, floatBitsToUint(s.p1.y)); + write_mem(a, ix + 4, floatBitsToUint(s.p2.x)); + write_mem(a, ix + 5, floatBitsToUint(s.p2.y)); + write_mem(a, ix + 6, floatBitsToUint(s.p3.x)); + write_mem(a, ix + 7, floatBitsToUint(s.p3.y)); + write_mem(a, ix + 8, s.path_ix); + write_mem(a, ix + 9, floatBitsToUint(s.stroke.x)); + write_mem(a, ix + 10, floatBitsToUint(s.stroke.y)); +} + +uint PathSeg_tag(Alloc a, PathSegRef ref) { + return read_mem(a, ref.offset >> 2); +} + +PathFillLine PathSeg_FillLine_read(Alloc a, PathSegRef ref) { + return PathFillLine_read(a, PathFillLineRef(ref.offset + 4)); +} + +PathStrokeLine PathSeg_StrokeLine_read(Alloc a, PathSegRef ref) { + return PathStrokeLine_read(a, PathStrokeLineRef(ref.offset + 4)); +} + +PathFillCubic PathSeg_FillCubic_read(Alloc a, PathSegRef ref) { + return PathFillCubic_read(a, PathFillCubicRef(ref.offset + 4)); +} + +PathStrokeCubic PathSeg_StrokeCubic_read(Alloc a, PathSegRef ref) { + return PathStrokeCubic_read(a, PathStrokeCubicRef(ref.offset + 4)); +} + +void PathSeg_Nop_write(Alloc a, PathSegRef ref) { + write_mem(a, ref.offset >> 2, PathSeg_Nop); +} + +void PathSeg_FillLine_write(Alloc a, PathSegRef ref, PathFillLine s) { + write_mem(a, ref.offset >> 2, PathSeg_FillLine); + PathFillLine_write(a, PathFillLineRef(ref.offset + 4), s); +} + +void PathSeg_StrokeLine_write(Alloc a, PathSegRef ref, PathStrokeLine s) { + write_mem(a, ref.offset >> 2, PathSeg_StrokeLine); + PathStrokeLine_write(a, PathStrokeLineRef(ref.offset + 4), s); +} + +void PathSeg_FillCubic_write(Alloc a, PathSegRef ref, PathFillCubic s) { + write_mem(a, ref.offset >> 2, PathSeg_FillCubic); + PathFillCubic_write(a, PathFillCubicRef(ref.offset + 4), s); +} + +void PathSeg_StrokeCubic_write(Alloc a, PathSegRef ref, PathStrokeCubic s) { + write_mem(a, ref.offset >> 2, PathSeg_StrokeCubic); + PathStrokeCubic_write(a, PathStrokeCubicRef(ref.offset + 4), s); +} + diff --git a/gpu/shaders/ptcl.h b/gpu/shaders/ptcl.h new file mode 100644 index 00000000..28a6d0ad --- /dev/null +++ b/gpu/shaders/ptcl.h @@ -0,0 +1,549 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +// Code auto-generated by piet-gpu-derive + +struct CmdCircleRef { + uint offset; +}; + +struct CmdLineRef { + uint offset; +}; + +struct CmdStrokeRef { + uint offset; +}; + +struct CmdFillRef { + uint offset; +}; + +struct CmdFillTextureRef { + uint offset; +}; + +struct CmdBeginClipRef { + uint offset; +}; + +struct CmdBeginSolidClipRef { + uint offset; +}; + +struct CmdEndClipRef { + uint offset; +}; + +struct CmdSolidRef { + uint offset; +}; + +struct CmdSolidTextureRef { + uint offset; +}; + +struct CmdSolidMaskRef { + uint offset; +}; + +struct CmdJumpRef { + uint offset; +}; + +struct CmdRef { + uint offset; +}; + +struct CmdCircle { + vec2 center; + float radius; + uint rgba_color; +}; + +#define CmdCircle_size 16 + +CmdCircleRef CmdCircle_index(CmdCircleRef ref, uint index) { + return CmdCircleRef(ref.offset + index * CmdCircle_size); +} + +struct CmdLine { + vec2 start; + vec2 end; +}; + +#define CmdLine_size 16 + +CmdLineRef CmdLine_index(CmdLineRef ref, uint index) { + return CmdLineRef(ref.offset + index * CmdLine_size); +} + +struct CmdStroke { + uint tile_ref; + float half_width; + uint rgba_color; +}; + +#define CmdStroke_size 12 + +CmdStrokeRef CmdStroke_index(CmdStrokeRef ref, uint index) { + return CmdStrokeRef(ref.offset + index * CmdStroke_size); +} + +struct CmdFill { + uint tile_ref; + int backdrop; + uint rgba_color; +}; + +#define CmdFill_size 12 + +CmdFillRef CmdFill_index(CmdFillRef ref, uint index) { + return CmdFillRef(ref.offset + index * CmdFill_size); +} + +struct CmdFillTexture { + uint tile_ref; + int backdrop; + vec4 mat; + vec2 translate; + uvec2 uv_bounds; +}; + +#define CmdFillTexture_size 40 + +CmdFillTextureRef CmdFillTexture_index(CmdFillTextureRef ref, uint index) { + return CmdFillTextureRef(ref.offset + index * CmdFillTexture_size); +} + +struct CmdBeginClip { + uint tile_ref; + int backdrop; +}; + +#define CmdBeginClip_size 8 + +CmdBeginClipRef CmdBeginClip_index(CmdBeginClipRef ref, uint index) { + return CmdBeginClipRef(ref.offset + index * CmdBeginClip_size); +} + +struct CmdBeginSolidClip { + float alpha; +}; + +#define CmdBeginSolidClip_size 4 + +CmdBeginSolidClipRef CmdBeginSolidClip_index(CmdBeginSolidClipRef ref, uint index) { + return CmdBeginSolidClipRef(ref.offset + index * CmdBeginSolidClip_size); +} + +struct CmdEndClip { + float alpha; +}; + +#define CmdEndClip_size 4 + +CmdEndClipRef CmdEndClip_index(CmdEndClipRef ref, uint index) { + return CmdEndClipRef(ref.offset + index * CmdEndClip_size); +} + +struct CmdSolid { + uint rgba_color; +}; + +#define CmdSolid_size 4 + +CmdSolidRef CmdSolid_index(CmdSolidRef ref, uint index) { + return CmdSolidRef(ref.offset + index * CmdSolid_size); +} + +struct CmdSolidTexture { + vec4 mat; + vec2 translate; + uvec2 uv_bounds; +}; + +#define CmdSolidTexture_size 32 + +CmdSolidTextureRef CmdSolidTexture_index(CmdSolidTextureRef ref, uint index) { + return CmdSolidTextureRef(ref.offset + index * CmdSolidTexture_size); +} + +struct CmdSolidMask { + float mask; +}; + +#define CmdSolidMask_size 4 + +CmdSolidMaskRef CmdSolidMask_index(CmdSolidMaskRef ref, uint index) { + return CmdSolidMaskRef(ref.offset + index * CmdSolidMask_size); +} + +struct CmdJump { + uint new_ref; +}; + +#define CmdJump_size 4 + +CmdJumpRef CmdJump_index(CmdJumpRef ref, uint index) { + return CmdJumpRef(ref.offset + index * CmdJump_size); +} + +#define Cmd_End 0 +#define Cmd_Circle 1 +#define Cmd_Line 2 +#define Cmd_Fill 3 +#define Cmd_FillTexture 4 +#define Cmd_BeginClip 5 +#define Cmd_BeginSolidClip 6 +#define Cmd_EndClip 7 +#define Cmd_Stroke 8 +#define Cmd_Solid 9 +#define Cmd_SolidMask 10 +#define Cmd_SolidTexture 11 +#define Cmd_Jump 12 +#define Cmd_size 44 + +CmdRef Cmd_index(CmdRef ref, uint index) { + return CmdRef(ref.offset + index * Cmd_size); +} + +CmdCircle CmdCircle_read(Alloc a, CmdCircleRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = read_mem(a, ix + 0); + uint raw1 = read_mem(a, ix + 1); + uint raw2 = read_mem(a, ix + 2); + uint raw3 = read_mem(a, ix + 3); + CmdCircle s; + s.center = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); + s.radius = uintBitsToFloat(raw2); + s.rgba_color = raw3; + return s; +} + +void CmdCircle_write(Alloc a, CmdCircleRef ref, CmdCircle s) { + uint ix = ref.offset >> 2; + write_mem(a, ix + 0, floatBitsToUint(s.center.x)); + write_mem(a, ix + 1, floatBitsToUint(s.center.y)); + write_mem(a, ix + 2, floatBitsToUint(s.radius)); + write_mem(a, ix + 3, s.rgba_color); +} + +CmdLine CmdLine_read(Alloc a, CmdLineRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = read_mem(a, ix + 0); + uint raw1 = read_mem(a, ix + 1); + uint raw2 = read_mem(a, ix + 2); + uint raw3 = read_mem(a, ix + 3); + CmdLine s; + s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); + s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + return s; +} + +void CmdLine_write(Alloc a, CmdLineRef ref, CmdLine s) { + uint ix = ref.offset >> 2; + write_mem(a, ix + 0, floatBitsToUint(s.start.x)); + write_mem(a, ix + 1, floatBitsToUint(s.start.y)); + write_mem(a, ix + 2, floatBitsToUint(s.end.x)); + write_mem(a, ix + 3, floatBitsToUint(s.end.y)); +} + +CmdStroke CmdStroke_read(Alloc a, CmdStrokeRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = read_mem(a, ix + 0); + uint raw1 = read_mem(a, ix + 1); + uint raw2 = read_mem(a, ix + 2); + CmdStroke s; + s.tile_ref = raw0; + s.half_width = uintBitsToFloat(raw1); + s.rgba_color = raw2; + return s; +} + +void CmdStroke_write(Alloc a, CmdStrokeRef ref, CmdStroke s) { + uint ix = ref.offset >> 2; + write_mem(a, ix + 0, s.tile_ref); + write_mem(a, ix + 1, floatBitsToUint(s.half_width)); + write_mem(a, ix + 2, s.rgba_color); +} + +CmdFill CmdFill_read(Alloc a, CmdFillRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = read_mem(a, ix + 0); + uint raw1 = read_mem(a, ix + 1); + uint raw2 = read_mem(a, ix + 2); + CmdFill s; + s.tile_ref = raw0; + s.backdrop = int(raw1); + s.rgba_color = raw2; + return s; +} + +void CmdFill_write(Alloc a, CmdFillRef ref, CmdFill s) { + uint ix = ref.offset >> 2; + write_mem(a, ix + 0, s.tile_ref); + write_mem(a, ix + 1, uint(s.backdrop)); + write_mem(a, ix + 2, s.rgba_color); +} + +CmdFillTexture CmdFillTexture_read(Alloc a, CmdFillTextureRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = read_mem(a, ix + 0); + uint raw1 = read_mem(a, ix + 1); + uint raw2 = read_mem(a, ix + 2); + uint raw3 = read_mem(a, ix + 3); + uint raw4 = read_mem(a, ix + 4); + uint raw5 = read_mem(a, ix + 5); + uint raw6 = read_mem(a, ix + 6); + uint raw7 = read_mem(a, ix + 7); + uint raw8 = read_mem(a, ix + 8); + uint raw9 = read_mem(a, ix + 9); + CmdFillTexture s; + s.tile_ref = raw0; + s.backdrop = int(raw1); + s.mat = vec4(uintBitsToFloat(raw2), uintBitsToFloat(raw3), uintBitsToFloat(raw4), uintBitsToFloat(raw5)); + s.translate = vec2(uintBitsToFloat(raw6), uintBitsToFloat(raw7)); + s.uv_bounds = uvec2(raw8, raw9); + return s; +} + +void CmdFillTexture_write(Alloc a, CmdFillTextureRef ref, CmdFillTexture s) { + uint ix = ref.offset >> 2; + write_mem(a, ix + 0, s.tile_ref); + write_mem(a, ix + 1, uint(s.backdrop)); + write_mem(a, ix + 2, floatBitsToUint(s.mat.x)); + write_mem(a, ix + 3, floatBitsToUint(s.mat.y)); + write_mem(a, ix + 4, floatBitsToUint(s.mat.z)); + write_mem(a, ix + 5, floatBitsToUint(s.mat.w)); + write_mem(a, ix + 6, floatBitsToUint(s.translate.x)); + write_mem(a, ix + 7, floatBitsToUint(s.translate.y)); + write_mem(a, ix + 8, s.uv_bounds.x); + write_mem(a, ix + 9, s.uv_bounds.y); +} + +CmdBeginClip CmdBeginClip_read(Alloc a, CmdBeginClipRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = read_mem(a, ix + 0); + uint raw1 = read_mem(a, ix + 1); + CmdBeginClip s; + s.tile_ref = raw0; + s.backdrop = int(raw1); + return s; +} + +void CmdBeginClip_write(Alloc a, CmdBeginClipRef ref, CmdBeginClip s) { + uint ix = ref.offset >> 2; + write_mem(a, ix + 0, s.tile_ref); + write_mem(a, ix + 1, uint(s.backdrop)); +} + +CmdBeginSolidClip CmdBeginSolidClip_read(Alloc a, CmdBeginSolidClipRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = read_mem(a, ix + 0); + CmdBeginSolidClip s; + s.alpha = uintBitsToFloat(raw0); + return s; +} + +void CmdBeginSolidClip_write(Alloc a, CmdBeginSolidClipRef ref, CmdBeginSolidClip s) { + uint ix = ref.offset >> 2; + write_mem(a, ix + 0, floatBitsToUint(s.alpha)); +} + +CmdEndClip CmdEndClip_read(Alloc a, CmdEndClipRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = read_mem(a, ix + 0); + CmdEndClip s; + s.alpha = uintBitsToFloat(raw0); + return s; +} + +void CmdEndClip_write(Alloc a, CmdEndClipRef ref, CmdEndClip s) { + uint ix = ref.offset >> 2; + write_mem(a, ix + 0, floatBitsToUint(s.alpha)); +} + +CmdSolid CmdSolid_read(Alloc a, CmdSolidRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = read_mem(a, ix + 0); + CmdSolid s; + s.rgba_color = raw0; + return s; +} + +void CmdSolid_write(Alloc a, CmdSolidRef ref, CmdSolid s) { + uint ix = ref.offset >> 2; + write_mem(a, ix + 0, s.rgba_color); +} + +CmdSolidTexture CmdSolidTexture_read(Alloc a, CmdSolidTextureRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = read_mem(a, ix + 0); + uint raw1 = read_mem(a, ix + 1); + uint raw2 = read_mem(a, ix + 2); + uint raw3 = read_mem(a, ix + 3); + uint raw4 = read_mem(a, ix + 4); + uint raw5 = read_mem(a, ix + 5); + uint raw6 = read_mem(a, ix + 6); + uint raw7 = read_mem(a, ix + 7); + CmdSolidTexture s; + s.mat = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + s.translate = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5)); + s.uv_bounds = uvec2(raw6, raw7); + return s; +} + +void CmdSolidTexture_write(Alloc a, CmdSolidTextureRef ref, CmdSolidTexture s) { + uint ix = ref.offset >> 2; + write_mem(a, ix + 0, floatBitsToUint(s.mat.x)); + write_mem(a, ix + 1, floatBitsToUint(s.mat.y)); + write_mem(a, ix + 2, floatBitsToUint(s.mat.z)); + write_mem(a, ix + 3, floatBitsToUint(s.mat.w)); + write_mem(a, ix + 4, floatBitsToUint(s.translate.x)); + write_mem(a, ix + 5, floatBitsToUint(s.translate.y)); + write_mem(a, ix + 6, s.uv_bounds.x); + write_mem(a, ix + 7, s.uv_bounds.y); +} + +CmdSolidMask CmdSolidMask_read(Alloc a, CmdSolidMaskRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = read_mem(a, ix + 0); + CmdSolidMask s; + s.mask = uintBitsToFloat(raw0); + return s; +} + +void CmdSolidMask_write(Alloc a, CmdSolidMaskRef ref, CmdSolidMask s) { + uint ix = ref.offset >> 2; + write_mem(a, ix + 0, floatBitsToUint(s.mask)); +} + +CmdJump CmdJump_read(Alloc a, CmdJumpRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = read_mem(a, ix + 0); + CmdJump s; + s.new_ref = raw0; + return s; +} + +void CmdJump_write(Alloc a, CmdJumpRef ref, CmdJump s) { + uint ix = ref.offset >> 2; + write_mem(a, ix + 0, s.new_ref); +} + +uint Cmd_tag(Alloc a, CmdRef ref) { + return read_mem(a, ref.offset >> 2); +} + +CmdCircle Cmd_Circle_read(Alloc a, CmdRef ref) { + return CmdCircle_read(a, CmdCircleRef(ref.offset + 4)); +} + +CmdLine Cmd_Line_read(Alloc a, CmdRef ref) { + return CmdLine_read(a, CmdLineRef(ref.offset + 4)); +} + +CmdFill Cmd_Fill_read(Alloc a, CmdRef ref) { + return CmdFill_read(a, CmdFillRef(ref.offset + 4)); +} + +CmdFillTexture Cmd_FillTexture_read(Alloc a, CmdRef ref) { + return CmdFillTexture_read(a, CmdFillTextureRef(ref.offset + 4)); +} + +CmdBeginClip Cmd_BeginClip_read(Alloc a, CmdRef ref) { + return CmdBeginClip_read(a, CmdBeginClipRef(ref.offset + 4)); +} + +CmdBeginSolidClip Cmd_BeginSolidClip_read(Alloc a, CmdRef ref) { + return CmdBeginSolidClip_read(a, CmdBeginSolidClipRef(ref.offset + 4)); +} + +CmdEndClip Cmd_EndClip_read(Alloc a, CmdRef ref) { + return CmdEndClip_read(a, CmdEndClipRef(ref.offset + 4)); +} + +CmdStroke Cmd_Stroke_read(Alloc a, CmdRef ref) { + return CmdStroke_read(a, CmdStrokeRef(ref.offset + 4)); +} + +CmdSolid Cmd_Solid_read(Alloc a, CmdRef ref) { + return CmdSolid_read(a, CmdSolidRef(ref.offset + 4)); +} + +CmdSolidMask Cmd_SolidMask_read(Alloc a, CmdRef ref) { + return CmdSolidMask_read(a, CmdSolidMaskRef(ref.offset + 4)); +} + +CmdSolidTexture Cmd_SolidTexture_read(Alloc a, CmdRef ref) { + return CmdSolidTexture_read(a, CmdSolidTextureRef(ref.offset + 4)); +} + +CmdJump Cmd_Jump_read(Alloc a, CmdRef ref) { + return CmdJump_read(a, CmdJumpRef(ref.offset + 4)); +} + +void Cmd_End_write(Alloc a, CmdRef ref) { + write_mem(a, ref.offset >> 2, Cmd_End); +} + +void Cmd_Circle_write(Alloc a, CmdRef ref, CmdCircle s) { + write_mem(a, ref.offset >> 2, Cmd_Circle); + CmdCircle_write(a, CmdCircleRef(ref.offset + 4), s); +} + +void Cmd_Line_write(Alloc a, CmdRef ref, CmdLine s) { + write_mem(a, ref.offset >> 2, Cmd_Line); + CmdLine_write(a, CmdLineRef(ref.offset + 4), s); +} + +void Cmd_Fill_write(Alloc a, CmdRef ref, CmdFill s) { + write_mem(a, ref.offset >> 2, Cmd_Fill); + CmdFill_write(a, CmdFillRef(ref.offset + 4), s); +} + +void Cmd_FillTexture_write(Alloc a, CmdRef ref, CmdFillTexture s) { + write_mem(a, ref.offset >> 2, Cmd_FillTexture); + CmdFillTexture_write(a, CmdFillTextureRef(ref.offset + 4), s); +} + +void Cmd_BeginClip_write(Alloc a, CmdRef ref, CmdBeginClip s) { + write_mem(a, ref.offset >> 2, Cmd_BeginClip); + CmdBeginClip_write(a, CmdBeginClipRef(ref.offset + 4), s); +} + +void Cmd_BeginSolidClip_write(Alloc a, CmdRef ref, CmdBeginSolidClip s) { + write_mem(a, ref.offset >> 2, Cmd_BeginSolidClip); + CmdBeginSolidClip_write(a, CmdBeginSolidClipRef(ref.offset + 4), s); +} + +void Cmd_EndClip_write(Alloc a, CmdRef ref, CmdEndClip s) { + write_mem(a, ref.offset >> 2, Cmd_EndClip); + CmdEndClip_write(a, CmdEndClipRef(ref.offset + 4), s); +} + +void Cmd_Stroke_write(Alloc a, CmdRef ref, CmdStroke s) { + write_mem(a, ref.offset >> 2, Cmd_Stroke); + CmdStroke_write(a, CmdStrokeRef(ref.offset + 4), s); +} + +void Cmd_Solid_write(Alloc a, CmdRef ref, CmdSolid s) { + write_mem(a, ref.offset >> 2, Cmd_Solid); + CmdSolid_write(a, CmdSolidRef(ref.offset + 4), s); +} + +void Cmd_SolidMask_write(Alloc a, CmdRef ref, CmdSolidMask s) { + write_mem(a, ref.offset >> 2, Cmd_SolidMask); + CmdSolidMask_write(a, CmdSolidMaskRef(ref.offset + 4), s); +} + +void Cmd_SolidTexture_write(Alloc a, CmdRef ref, CmdSolidTexture s) { + write_mem(a, ref.offset >> 2, Cmd_SolidTexture); + CmdSolidTexture_write(a, CmdSolidTextureRef(ref.offset + 4), s); +} + +void Cmd_Jump_write(Alloc a, CmdRef ref, CmdJump s) { + write_mem(a, ref.offset >> 2, Cmd_Jump); + CmdJump_write(a, CmdJumpRef(ref.offset + 4), s); +} + diff --git a/gpu/shaders/scene.h b/gpu/shaders/scene.h new file mode 100644 index 00000000..2ecb6e5c --- /dev/null +++ b/gpu/shaders/scene.h @@ -0,0 +1,320 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +// Code auto-generated by piet-gpu-derive + +struct LineSegRef { + uint offset; +}; + +struct QuadSegRef { + uint offset; +}; + +struct CubicSegRef { + uint offset; +}; + +struct FillRef { + uint offset; +}; + +struct FillTextureRef { + uint offset; +}; + +struct StrokeRef { + uint offset; +}; + +struct SetLineWidthRef { + uint offset; +}; + +struct TransformRef { + uint offset; +}; + +struct ClipRef { + uint offset; +}; + +struct ElementRef { + uint offset; +}; + +struct LineSeg { + vec2 p0; + vec2 p1; +}; + +#define LineSeg_size 16 + +LineSegRef LineSeg_index(LineSegRef ref, uint index) { + return LineSegRef(ref.offset + index * LineSeg_size); +} + +struct QuadSeg { + vec2 p0; + vec2 p1; + vec2 p2; +}; + +#define QuadSeg_size 24 + +QuadSegRef QuadSeg_index(QuadSegRef ref, uint index) { + return QuadSegRef(ref.offset + index * QuadSeg_size); +} + +struct CubicSeg { + vec2 p0; + vec2 p1; + vec2 p2; + vec2 p3; +}; + +#define CubicSeg_size 32 + +CubicSegRef CubicSeg_index(CubicSegRef ref, uint index) { + return CubicSegRef(ref.offset + index * CubicSeg_size); +} + +struct Fill { + uint rgba_color; +}; + +#define Fill_size 4 + +FillRef Fill_index(FillRef ref, uint index) { + return FillRef(ref.offset + index * Fill_size); +} + +struct FillTexture { + uvec2 uv_bounds; +}; + +#define FillTexture_size 8 + +FillTextureRef FillTexture_index(FillTextureRef ref, uint index) { + return FillTextureRef(ref.offset + index * FillTexture_size); +} + +struct Stroke { + uint rgba_color; +}; + +#define Stroke_size 4 + +StrokeRef Stroke_index(StrokeRef ref, uint index) { + return StrokeRef(ref.offset + index * Stroke_size); +} + +struct SetLineWidth { + float width; +}; + +#define SetLineWidth_size 4 + +SetLineWidthRef SetLineWidth_index(SetLineWidthRef ref, uint index) { + return SetLineWidthRef(ref.offset + index * SetLineWidth_size); +} + +struct Transform { + vec4 mat; + vec2 translate; +}; + +#define Transform_size 24 + +TransformRef Transform_index(TransformRef ref, uint index) { + return TransformRef(ref.offset + index * Transform_size); +} + +struct Clip { + vec4 bbox; +}; + +#define Clip_size 16 + +ClipRef Clip_index(ClipRef ref, uint index) { + return ClipRef(ref.offset + index * Clip_size); +} + +#define Element_Nop 0 +#define Element_StrokeLine 1 +#define Element_FillLine 2 +#define Element_StrokeQuad 3 +#define Element_FillQuad 4 +#define Element_StrokeCubic 5 +#define Element_FillCubic 6 +#define Element_Stroke 7 +#define Element_Fill 8 +#define Element_SetLineWidth 9 +#define Element_Transform 10 +#define Element_BeginClip 11 +#define Element_EndClip 12 +#define Element_FillTexture 13 +#define Element_size 36 + +ElementRef Element_index(ElementRef ref, uint index) { + return ElementRef(ref.offset + index * Element_size); +} + +LineSeg LineSeg_read(LineSegRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = scene[ix + 0]; + uint raw1 = scene[ix + 1]; + uint raw2 = scene[ix + 2]; + uint raw3 = scene[ix + 3]; + LineSeg s; + s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); + s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + return s; +} + +QuadSeg QuadSeg_read(QuadSegRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = scene[ix + 0]; + uint raw1 = scene[ix + 1]; + uint raw2 = scene[ix + 2]; + uint raw3 = scene[ix + 3]; + uint raw4 = scene[ix + 4]; + uint raw5 = scene[ix + 5]; + QuadSeg s; + s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); + s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + s.p2 = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5)); + return s; +} + +CubicSeg CubicSeg_read(CubicSegRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = scene[ix + 0]; + uint raw1 = scene[ix + 1]; + uint raw2 = scene[ix + 2]; + uint raw3 = scene[ix + 3]; + uint raw4 = scene[ix + 4]; + uint raw5 = scene[ix + 5]; + uint raw6 = scene[ix + 6]; + uint raw7 = scene[ix + 7]; + CubicSeg s; + s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); + s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + s.p2 = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5)); + s.p3 = vec2(uintBitsToFloat(raw6), uintBitsToFloat(raw7)); + return s; +} + +Fill Fill_read(FillRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = scene[ix + 0]; + Fill s; + s.rgba_color = raw0; + return s; +} + +FillTexture FillTexture_read(FillTextureRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = scene[ix + 0]; + uint raw1 = scene[ix + 1]; + FillTexture s; + s.uv_bounds = uvec2(raw0, raw1); + return s; +} + +Stroke Stroke_read(StrokeRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = scene[ix + 0]; + Stroke s; + s.rgba_color = raw0; + return s; +} + +SetLineWidth SetLineWidth_read(SetLineWidthRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = scene[ix + 0]; + SetLineWidth s; + s.width = uintBitsToFloat(raw0); + return s; +} + +Transform Transform_read(TransformRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = scene[ix + 0]; + uint raw1 = scene[ix + 1]; + uint raw2 = scene[ix + 2]; + uint raw3 = scene[ix + 3]; + uint raw4 = scene[ix + 4]; + uint raw5 = scene[ix + 5]; + Transform s; + s.mat = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + s.translate = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5)); + return s; +} + +Clip Clip_read(ClipRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = scene[ix + 0]; + uint raw1 = scene[ix + 1]; + uint raw2 = scene[ix + 2]; + uint raw3 = scene[ix + 3]; + Clip s; + s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + return s; +} + +uint Element_tag(ElementRef ref) { + return scene[ref.offset >> 2]; +} + +LineSeg Element_StrokeLine_read(ElementRef ref) { + return LineSeg_read(LineSegRef(ref.offset + 4)); +} + +LineSeg Element_FillLine_read(ElementRef ref) { + return LineSeg_read(LineSegRef(ref.offset + 4)); +} + +QuadSeg Element_StrokeQuad_read(ElementRef ref) { + return QuadSeg_read(QuadSegRef(ref.offset + 4)); +} + +QuadSeg Element_FillQuad_read(ElementRef ref) { + return QuadSeg_read(QuadSegRef(ref.offset + 4)); +} + +CubicSeg Element_StrokeCubic_read(ElementRef ref) { + return CubicSeg_read(CubicSegRef(ref.offset + 4)); +} + +CubicSeg Element_FillCubic_read(ElementRef ref) { + return CubicSeg_read(CubicSegRef(ref.offset + 4)); +} + +Stroke Element_Stroke_read(ElementRef ref) { + return Stroke_read(StrokeRef(ref.offset + 4)); +} + +Fill Element_Fill_read(ElementRef ref) { + return Fill_read(FillRef(ref.offset + 4)); +} + +SetLineWidth Element_SetLineWidth_read(ElementRef ref) { + return SetLineWidth_read(SetLineWidthRef(ref.offset + 4)); +} + +Transform Element_Transform_read(ElementRef ref) { + return Transform_read(TransformRef(ref.offset + 4)); +} + +Clip Element_BeginClip_read(ElementRef ref) { + return Clip_read(ClipRef(ref.offset + 4)); +} + +Clip Element_EndClip_read(ElementRef ref) { + return Clip_read(ClipRef(ref.offset + 4)); +} + +FillTexture Element_FillTexture_read(ElementRef ref) { + return FillTexture_read(FillTextureRef(ref.offset + 4)); +} + diff --git a/gpu/shaders/setup.h b/gpu/shaders/setup.h new file mode 100644 index 00000000..dc32c40b --- /dev/null +++ b/gpu/shaders/setup.h @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +// Various constants for the sizes of groups and tiles. + +// Much of this will be made dynamic in various ways, but for now it's easiest +// to hardcode and keep all in one place. + +// A LG_WG_FACTOR of n scales workgroup sizes by 2^n. Use 0 for a +// maximum workgroup size of 128, or 1 for a maximum size of 256. +#define LG_WG_FACTOR 0 +#define WG_FACTOR (1<> 2; + uint raw0 = state[ix + 0]; + uint raw1 = state[ix + 1]; + uint raw2 = state[ix + 2]; + uint raw3 = state[ix + 3]; + uint raw4 = state[ix + 4]; + uint raw5 = state[ix + 5]; + uint raw6 = state[ix + 6]; + uint raw7 = state[ix + 7]; + uint raw8 = state[ix + 8]; + uint raw9 = state[ix + 9]; + uint raw10 = state[ix + 10]; + uint raw11 = state[ix + 11]; + uint raw12 = state[ix + 12]; + uint raw13 = state[ix + 13]; + State s; + s.mat = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + s.translate = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5)); + s.bbox = vec4(uintBitsToFloat(raw6), uintBitsToFloat(raw7), uintBitsToFloat(raw8), uintBitsToFloat(raw9)); + s.linewidth = uintBitsToFloat(raw10); + s.flags = raw11; + s.path_count = raw12; + s.pathseg_count = raw13; + return s; +} + +void State_write(StateRef ref, State s) { + uint ix = ref.offset >> 2; + state[ix + 0] = floatBitsToUint(s.mat.x); + state[ix + 1] = floatBitsToUint(s.mat.y); + state[ix + 2] = floatBitsToUint(s.mat.z); + state[ix + 3] = floatBitsToUint(s.mat.w); + state[ix + 4] = floatBitsToUint(s.translate.x); + state[ix + 5] = floatBitsToUint(s.translate.y); + state[ix + 6] = floatBitsToUint(s.bbox.x); + state[ix + 7] = floatBitsToUint(s.bbox.y); + state[ix + 8] = floatBitsToUint(s.bbox.z); + state[ix + 9] = floatBitsToUint(s.bbox.w); + state[ix + 10] = floatBitsToUint(s.linewidth); + state[ix + 11] = s.flags; + state[ix + 12] = s.path_count; + state[ix + 13] = s.pathseg_count; +} + diff --git a/gpu/shaders/tile.h b/gpu/shaders/tile.h new file mode 100644 index 00000000..500277be --- /dev/null +++ b/gpu/shaders/tile.h @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +// Code auto-generated by piet-gpu-derive + +struct PathRef { + uint offset; +}; + +struct TileRef { + uint offset; +}; + +struct TileSegRef { + uint offset; +}; + +struct Path { + uvec4 bbox; + TileRef tiles; +}; + +#define Path_size 12 + +PathRef Path_index(PathRef ref, uint index) { + return PathRef(ref.offset + index * Path_size); +} + +struct Tile { + TileSegRef tile; + int backdrop; +}; + +#define Tile_size 8 + +TileRef Tile_index(TileRef ref, uint index) { + return TileRef(ref.offset + index * Tile_size); +} + +struct TileSeg { + vec2 origin; + vec2 vector; + float y_edge; + TileSegRef next; +}; + +#define TileSeg_size 24 + +TileSegRef TileSeg_index(TileSegRef ref, uint index) { + return TileSegRef(ref.offset + index * TileSeg_size); +} + +Path Path_read(Alloc a, PathRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = read_mem(a, ix + 0); + uint raw1 = read_mem(a, ix + 1); + uint raw2 = read_mem(a, ix + 2); + Path s; + s.bbox = uvec4(raw0 & 0xffff, raw0 >> 16, raw1 & 0xffff, raw1 >> 16); + s.tiles = TileRef(raw2); + return s; +} + +void Path_write(Alloc a, PathRef ref, Path s) { + uint ix = ref.offset >> 2; + write_mem(a, ix + 0, s.bbox.x | (s.bbox.y << 16)); + write_mem(a, ix + 1, s.bbox.z | (s.bbox.w << 16)); + write_mem(a, ix + 2, s.tiles.offset); +} + +Tile Tile_read(Alloc a, TileRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = read_mem(a, ix + 0); + uint raw1 = read_mem(a, ix + 1); + Tile s; + s.tile = TileSegRef(raw0); + s.backdrop = int(raw1); + return s; +} + +void Tile_write(Alloc a, TileRef ref, Tile s) { + uint ix = ref.offset >> 2; + write_mem(a, ix + 0, s.tile.offset); + write_mem(a, ix + 1, uint(s.backdrop)); +} + +TileSeg TileSeg_read(Alloc a, TileSegRef ref) { + uint ix = ref.offset >> 2; + uint raw0 = read_mem(a, ix + 0); + uint raw1 = read_mem(a, ix + 1); + uint raw2 = read_mem(a, ix + 2); + uint raw3 = read_mem(a, ix + 3); + uint raw4 = read_mem(a, ix + 4); + uint raw5 = read_mem(a, ix + 5); + TileSeg s; + s.origin = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); + s.vector = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); + s.y_edge = uintBitsToFloat(raw4); + s.next = TileSegRef(raw5); + return s; +} + +void TileSeg_write(Alloc a, TileSegRef ref, TileSeg s) { + uint ix = ref.offset >> 2; + write_mem(a, ix + 0, floatBitsToUint(s.origin.x)); + write_mem(a, ix + 1, floatBitsToUint(s.origin.y)); + write_mem(a, ix + 2, floatBitsToUint(s.vector.x)); + write_mem(a, ix + 3, floatBitsToUint(s.vector.y)); + write_mem(a, ix + 4, floatBitsToUint(s.y_edge)); + write_mem(a, ix + 5, s.next.offset); +} + diff --git a/gpu/shaders/tile_alloc.comp b/gpu/shaders/tile_alloc.comp new file mode 100644 index 00000000..ac72fb35 --- /dev/null +++ b/gpu/shaders/tile_alloc.comp @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +// Allocation and initialization of tiles for paths. + +#version 450 +#extension GL_GOOGLE_include_directive : enable + +#include "mem.h" +#include "setup.h" + +#define LG_TILE_ALLOC_WG (7 + LG_WG_FACTOR) +#define TILE_ALLOC_WG (1 << LG_TILE_ALLOC_WG) + +layout(local_size_x = TILE_ALLOC_WG, local_size_y = 1) in; + +layout(set = 0, binding = 1) readonly buffer ConfigBuf { + Config conf; +}; + +#include "annotated.h" +#include "tile.h" + +// scale factors useful for converting coordinates to tiles +#define SX (1.0 / float(TILE_WIDTH_PX)) +#define SY (1.0 / float(TILE_HEIGHT_PX)) + +shared uint sh_tile_count[TILE_ALLOC_WG]; +shared MallocResult sh_tile_alloc; + +void main() { + if (mem_error != NO_ERROR) { + return; + } + + uint th_ix = gl_LocalInvocationID.x; + uint element_ix = gl_GlobalInvocationID.x; + PathRef path_ref = PathRef(conf.tile_alloc.offset + element_ix * Path_size); + AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size); + + uint tag = Annotated_Nop; + if (element_ix < conf.n_elements) { + tag = Annotated_tag(conf.anno_alloc, ref); + } + int x0 = 0, y0 = 0, x1 = 0, y1 = 0; + switch (tag) { + case Annotated_Fill: + case Annotated_FillTexture: + case Annotated_Stroke: + case Annotated_BeginClip: + case Annotated_EndClip: + // Note: we take advantage of the fact that fills, strokes, and + // clips have compatible layout. + AnnoFill fill = Annotated_Fill_read(conf.anno_alloc, ref); + x0 = int(floor(fill.bbox.x * SX)); + y0 = int(floor(fill.bbox.y * SY)); + x1 = int(ceil(fill.bbox.z * SX)); + y1 = int(ceil(fill.bbox.w * SY)); + break; + } + x0 = clamp(x0, 0, int(conf.width_in_tiles)); + y0 = clamp(y0, 0, int(conf.height_in_tiles)); + x1 = clamp(x1, 0, int(conf.width_in_tiles)); + y1 = clamp(y1, 0, int(conf.height_in_tiles)); + + Path path; + path.bbox = uvec4(x0, y0, x1, y1); + uint tile_count = (x1 - x0) * (y1 - y0); + if (tag == Annotated_EndClip) { + // Don't actually allocate tiles for an end clip, but we do want + // the path structure (especially bbox) allocated for it. + tile_count = 0; + } + + sh_tile_count[th_ix] = tile_count; + uint total_tile_count = tile_count; + // Prefix sum of sh_tile_count + for (uint i = 0; i < LG_TILE_ALLOC_WG; i++) { + barrier(); + if (th_ix >= (1 << i)) { + total_tile_count += sh_tile_count[th_ix - (1 << i)]; + } + barrier(); + sh_tile_count[th_ix] = total_tile_count; + } + if (th_ix == TILE_ALLOC_WG - 1) { + sh_tile_alloc = malloc(total_tile_count * Tile_size); + } + barrier(); + MallocResult alloc_start = sh_tile_alloc; + if (alloc_start.failed) { + return; + } + + if (element_ix < conf.n_elements) { + uint tile_subix = th_ix > 0 ? sh_tile_count[th_ix - 1] : 0; + Alloc tiles_alloc = slice_mem(alloc_start.alloc, Tile_size * tile_subix, Tile_size * tile_count); + path.tiles = TileRef(tiles_alloc.offset); + Path_write(conf.tile_alloc, path_ref, path); + } + + // Zero out allocated tiles efficiently + uint total_count = sh_tile_count[TILE_ALLOC_WG - 1] * (Tile_size / 4); + uint start_ix = alloc_start.alloc.offset >> 2; + for (uint i = th_ix; i < total_count; i += TILE_ALLOC_WG) { + // Note: this interleaving is faster than using Tile_write + // by a significant amount. + write_mem(alloc_start.alloc, start_ix + i, 0); + } +}