diff --git a/gpu/shaders/annotated.h b/gpu/shaders/annotated.h
new file mode 100644
index 00000000..2a88ef35
--- /dev/null
+++ b/gpu/shaders/annotated.h
@@ -0,0 +1,239 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// Code auto-generated by piet-gpu-derive
+
+struct AnnoFillRef {
+    uint offset; // byte offset of an AnnoFill record; accessors use offset >> 2 as the word index
+};
+
+struct AnnoFillTextureRef {
+    uint offset; // byte offset of an AnnoFillTexture record; accessors use offset >> 2 as the word index
+};
+
+struct AnnoStrokeRef {
+    uint offset; // byte offset of an AnnoStroke record; accessors use offset >> 2 as the word index
+};
+
+struct AnnoClipRef {
+    uint offset; // byte offset of an AnnoClip record; accessors use offset >> 2 as the word index
+};
+
+struct AnnotatedRef {
+    uint offset; // byte offset of a tagged record: the tag word sits at offset, the payload at offset + 4
+};
+
+struct AnnoFill {
+    vec4 bbox; // serialized as words 0-3 of the record (float bit patterns)
+    uint rgba_color; // serialized as word 4 (raw uint)
+};
+
+#define AnnoFill_size 20
+
+AnnoFillRef AnnoFill_index(AnnoFillRef ref, uint index) {
+    return AnnoFillRef(ref.offset + index * AnnoFill_size); // ref advanced by `index` records of AnnoFill_size bytes
+}
+
+struct AnnoFillTexture {
+    vec4 bbox; // serialized as words 0-3 (float bit patterns)
+    vec4 mat; // serialized as words 4-7 (float bit patterns)
+    vec2 translate; // serialized as words 8-9 (float bit patterns)
+    uvec2 uv_bounds; // serialized as words 10-11 (raw uints)
+};
+
+#define AnnoFillTexture_size 48
+
+AnnoFillTextureRef AnnoFillTexture_index(AnnoFillTextureRef ref, uint index) {
+    return AnnoFillTextureRef(ref.offset + index * AnnoFillTexture_size); // ref advanced by `index` records of AnnoFillTexture_size bytes
+}
+
+struct AnnoStroke {
+    vec4 bbox; // serialized as words 0-3 (float bit patterns)
+    uint rgba_color; // serialized as word 4 (raw uint)
+    float linewidth; // serialized as word 5 (float bit pattern)
+};
+
+#define AnnoStroke_size 24
+
+AnnoStrokeRef AnnoStroke_index(AnnoStrokeRef ref, uint index) {
+    return AnnoStrokeRef(ref.offset + index * AnnoStroke_size); // ref advanced by `index` records of AnnoStroke_size bytes
+}
+
+struct AnnoClip {
+    vec4 bbox; // serialized as words 0-3 of the record (float bit patterns)
+};
+
+#define AnnoClip_size 16
+
+AnnoClipRef AnnoClip_index(AnnoClipRef ref, uint index) {
+    return AnnoClipRef(ref.offset + index * AnnoClip_size); // ref advanced by `index` records of AnnoClip_size bytes
+}
+
+#define Annotated_Nop 0
+#define Annotated_Stroke 1
+#define Annotated_Fill 2
+#define Annotated_FillTexture 3
+#define Annotated_BeginClip 4
+#define Annotated_EndClip 5
+#define Annotated_size 52
+
+AnnotatedRef Annotated_index(AnnotatedRef ref, uint index) {
+    return AnnotatedRef(ref.offset + index * Annotated_size); // ref advanced by `index` tagged records of Annotated_size bytes
+}
+
+// Deserialize an AnnoFill from allocation `a`: words 0-3 of the record hold the
+// bbox as float bit patterns, word 4 holds rgba_color as a raw uint.
+AnnoFill AnnoFill_read(Alloc a, AnnoFillRef ref) {
+    uint base = ref.offset >> 2;
+    AnnoFill fill;
+    fill.bbox.x = uintBitsToFloat(read_mem(a, base + 0));
+    fill.bbox.y = uintBitsToFloat(read_mem(a, base + 1));
+    fill.bbox.z = uintBitsToFloat(read_mem(a, base + 2));
+    fill.bbox.w = uintBitsToFloat(read_mem(a, base + 3));
+    fill.rgba_color = read_mem(a, base + 4);
+    return fill;
+}
+
+void AnnoFill_write(Alloc a, AnnoFillRef ref, AnnoFill s) {
+    uint base = ref.offset >> 2; // first word of the record; fields follow in declaration order
+    write_mem(a, base, floatBitsToUint(s.bbox[0]));
+    write_mem(a, base + 1, floatBitsToUint(s.bbox[1]));
+    write_mem(a, base + 2, floatBitsToUint(s.bbox[2]));
+    write_mem(a, base + 3, floatBitsToUint(s.bbox[3]));
+    write_mem(a, base + 4, s.rgba_color);
+}
+
+// Deserialize an AnnoFillTexture from `a`. Word layout relative to ref.offset >> 2:
+// 0-3 bbox, 4-7 mat, 8-9 translate (all float bits), 10-11 uv_bounds (raw uints).
+AnnoFillTexture AnnoFillTexture_read(Alloc a, AnnoFillTextureRef ref) {
+    uint base = ref.offset >> 2;
+    AnnoFillTexture tex;
+    // bbox, mat and translate are stored as float bit patterns.
+    tex.bbox.x = uintBitsToFloat(read_mem(a, base + 0));
+    tex.bbox.y = uintBitsToFloat(read_mem(a, base + 1));
+    tex.bbox.z = uintBitsToFloat(read_mem(a, base + 2));
+    tex.bbox.w = uintBitsToFloat(read_mem(a, base + 3));
+    tex.mat.x = uintBitsToFloat(read_mem(a, base + 4));
+    tex.mat.y = uintBitsToFloat(read_mem(a, base + 5));
+    tex.mat.z = uintBitsToFloat(read_mem(a, base + 6));
+    tex.mat.w = uintBitsToFloat(read_mem(a, base + 7));
+    tex.translate.x = uintBitsToFloat(read_mem(a, base + 8));
+    tex.translate.y = uintBitsToFloat(read_mem(a, base + 9));
+    // uv_bounds is stored as plain integers, so no bit reinterpretation is needed.
+    tex.uv_bounds.x = read_mem(a, base + 10);
+    tex.uv_bounds.y = read_mem(a, base + 11);
+    return tex;
+}
+
+void AnnoFillTexture_write(Alloc a, AnnoFillTextureRef ref, AnnoFillTexture s) {
+    uint base = ref.offset >> 2; // first word of the record; fields follow in declaration order
+    write_mem(a, base, floatBitsToUint(s.bbox[0]));
+    write_mem(a, base + 1, floatBitsToUint(s.bbox[1]));
+    write_mem(a, base + 2, floatBitsToUint(s.bbox[2]));
+    write_mem(a, base + 3, floatBitsToUint(s.bbox[3]));
+    write_mem(a, base + 4, floatBitsToUint(s.mat[0]));
+    write_mem(a, base + 5, floatBitsToUint(s.mat[1]));
+    write_mem(a, base + 6, floatBitsToUint(s.mat[2]));
+    write_mem(a, base + 7, floatBitsToUint(s.mat[3]));
+    write_mem(a, base + 8, floatBitsToUint(s.translate[0]));
+    write_mem(a, base + 9, floatBitsToUint(s.translate[1]));
+    write_mem(a, base + 10, s.uv_bounds[0]);
+    write_mem(a, base + 11, s.uv_bounds[1]);
+}
+
+// Deserialize an AnnoStroke from allocation `a`.
+// Word layout relative to ref.offset >> 2:
+// 0-3 bbox (float bits), 4 rgba_color (raw uint), 5 linewidth (float bits).
+AnnoStroke AnnoStroke_read(Alloc a, AnnoStrokeRef ref) {
+    uint base = ref.offset >> 2;
+    AnnoStroke stroke;
+    stroke.bbox.x = uintBitsToFloat(read_mem(a, base + 0));
+    stroke.bbox.y = uintBitsToFloat(read_mem(a, base + 1));
+    stroke.bbox.z = uintBitsToFloat(read_mem(a, base + 2));
+    stroke.bbox.w = uintBitsToFloat(read_mem(a, base + 3));
+    stroke.rgba_color = read_mem(a, base + 4);
+    stroke.linewidth = uintBitsToFloat(read_mem(a, base + 5));
+    return stroke;
+}
+
+void AnnoStroke_write(Alloc a, AnnoStrokeRef ref, AnnoStroke s) {
+    uint base = ref.offset >> 2; // first word of the record; fields follow in declaration order
+    write_mem(a, base, floatBitsToUint(s.bbox[0]));
+    write_mem(a, base + 1, floatBitsToUint(s.bbox[1]));
+    write_mem(a, base + 2, floatBitsToUint(s.bbox[2]));
+    write_mem(a, base + 3, floatBitsToUint(s.bbox[3]));
+    write_mem(a, base + 4, s.rgba_color);
+    write_mem(a, base + 5, floatBitsToUint(s.linewidth));
+}
+
+// Deserialize an AnnoClip from allocation `a`: four words of float bits forming bbox.
+AnnoClip AnnoClip_read(Alloc a, AnnoClipRef ref) {
+    uint base = ref.offset >> 2;
+    AnnoClip clip;
+    clip.bbox.x = uintBitsToFloat(read_mem(a, base + 0));
+    clip.bbox.y = uintBitsToFloat(read_mem(a, base + 1));
+    clip.bbox.z = uintBitsToFloat(read_mem(a, base + 2));
+    clip.bbox.w = uintBitsToFloat(read_mem(a, base + 3));
+    return clip;
+}
+
+void AnnoClip_write(Alloc a, AnnoClipRef ref, AnnoClip s) {
+    uint base = ref.offset >> 2; // first word of the record
+    write_mem(a, base, floatBitsToUint(s.bbox[0]));
+    write_mem(a, base + 1, floatBitsToUint(s.bbox[1]));
+    write_mem(a, base + 2, floatBitsToUint(s.bbox[2]));
+    write_mem(a, base + 3, floatBitsToUint(s.bbox[3]));
+}
+
+uint Annotated_tag(Alloc a, AnnotatedRef ref) {
+    return read_mem(a, ref.offset >> 2); // the tag (one of the Annotated_* constants) is the record's first word
+}
+
+AnnoStroke Annotated_Stroke_read(Alloc a, AnnotatedRef ref) {
+    return AnnoStroke_read(a, AnnoStrokeRef(ref.offset + 4)); // payload starts 4 bytes in, right after the tag word
+}
+
+AnnoFill Annotated_Fill_read(Alloc a, AnnotatedRef ref) {
+    return AnnoFill_read(a, AnnoFillRef(ref.offset + 4)); // payload starts 4 bytes in, right after the tag word
+}
+
+AnnoFillTexture Annotated_FillTexture_read(Alloc a, AnnotatedRef ref) {
+    return AnnoFillTexture_read(a, AnnoFillTextureRef(ref.offset + 4)); // payload starts 4 bytes in, right after the tag word
+}
+
+AnnoClip Annotated_BeginClip_read(Alloc a, AnnotatedRef ref) {
+    return AnnoClip_read(a, AnnoClipRef(ref.offset + 4)); // payload starts 4 bytes in, right after the tag word
+}
+
+AnnoClip Annotated_EndClip_read(Alloc a, AnnotatedRef ref) {
+    return AnnoClip_read(a, AnnoClipRef(ref.offset + 4)); // same AnnoClip payload layout as BeginClip, right after the tag word
+}
+
+void Annotated_Nop_write(Alloc a, AnnotatedRef ref) {
+    write_mem(a, ref.offset >> 2, Annotated_Nop); // only the tag word is written; a Nop has no payload
+}
+
+void Annotated_Stroke_write(Alloc a, AnnotatedRef ref, AnnoStroke s) {
+    write_mem(a, ref.offset >> 2, Annotated_Stroke); // tag word at the start of the record
+    AnnoStroke_write(a, AnnoStrokeRef(ref.offset + 4), s); // payload goes 4 bytes in, right after the tag
+}
+
+void Annotated_Fill_write(Alloc a, AnnotatedRef ref, AnnoFill s) {
+    write_mem(a, ref.offset >> 2, Annotated_Fill); // tag word at the start of the record
+    AnnoFill_write(a, AnnoFillRef(ref.offset + 4), s); // payload goes 4 bytes in, right after the tag
+}
+
+void Annotated_FillTexture_write(Alloc a, AnnotatedRef ref, AnnoFillTexture s) {
+    write_mem(a, ref.offset >> 2, Annotated_FillTexture); // tag word at the start of the record
+    AnnoFillTexture_write(a, AnnoFillTextureRef(ref.offset + 4), s); // payload goes 4 bytes in, right after the tag
+}
+
+void Annotated_BeginClip_write(Alloc a, AnnotatedRef ref, AnnoClip s) {
+    write_mem(a, ref.offset >> 2, Annotated_BeginClip); // tag word at the start of the record
+    AnnoClip_write(a, AnnoClipRef(ref.offset + 4), s); // payload goes 4 bytes in, right after the tag
+}
+
+void Annotated_EndClip_write(Alloc a, AnnotatedRef ref, AnnoClip s) {
+    write_mem(a, ref.offset >> 2, Annotated_EndClip); // tag word at the start of the record
+    AnnoClip_write(a, AnnoClipRef(ref.offset + 4), s); // payload goes 4 bytes in, right after the tag
+}
+
diff --git a/gpu/shaders/backdrop.comp b/gpu/shaders/backdrop.comp
new file mode 100644
index 00000000..04a99990
--- /dev/null
+++ b/gpu/shaders/backdrop.comp
@@ -0,0 +1,108 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// Propagation of tile backdrop for filling.
+//
+// Each thread reads one path element and calculates the number of spanned tiles
+// based on the bounding box.
+// In a further compaction step, the workgroup loops over the corresponding tile rows per element in parallel.
+// For each row the per tile backdrop will be read, as calculated in the previous coarse path segment kernel,
+// and propagated from the left to the right (prefix summed).
+//
+// Output state:
+//  - Each path element has an array of tiles covering the whole path based on its bounding box
+//  - Each tile per path element contains the 'backdrop' and a list of subdivided path segments
+
+#version 450
+#extension GL_GOOGLE_include_directive : enable
+
+#include "mem.h"
+#include "setup.h"
+
+#define LG_BACKDROP_WG (7 + LG_WG_FACTOR)
+#define BACKDROP_WG (1 << LG_BACKDROP_WG)
+
+layout(local_size_x = BACKDROP_WG, local_size_y = 1) in;
+
+layout(set = 0, binding = 1) readonly buffer ConfigBuf {
+    Config conf;
+};
+
+#include "annotated.h"
+#include "tile.h"
+
+shared uint sh_row_count[BACKDROP_WG]; // per-thread row count, turned into an inclusive prefix sum in main()
+shared Alloc sh_row_alloc[BACKDROP_WG]; // Alloc over each element's tile array (written only for Fill/FillTexture/BeginClip)
+shared uint sh_row_width[BACKDROP_WG]; // path.bbox.z - path.bbox.x per element (written only for Fill/FillTexture/BeginClip)
+
+void main() {
+    if (mem_error != NO_ERROR) { // do no work once a memory error has been flagged
+        return;
+    }
+
+    uint th_ix = gl_LocalInvocationID.x;
+    uint element_ix = gl_GlobalInvocationID.x;
+    AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
+
+    // Work assignment: 1 thread : 1 path element
+    uint row_count = 0; // stays 0 for out-of-range elements and for tags not handled below
+    if (element_ix < conf.n_elements) {
+        uint tag = Annotated_tag(conf.anno_alloc, ref);
+        switch (tag) {
+        case Annotated_Fill:
+        case Annotated_FillTexture:
+        case Annotated_BeginClip:
+            PathRef path_ref = PathRef(conf.tile_alloc.offset + element_ix * Path_size);
+            Path path = Path_read(conf.tile_alloc, path_ref);
+            sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
+            row_count = path.bbox.w - path.bbox.y;
+            // Paths that don't cross tile top edges don't have backdrops.
+            // Don't apply the optimization to paths that may cross the y = 0
+            // top edge, but clipped to 1 row.
+            if (row_count == 1 && path.bbox.y > 0) {
+                // Note: this can probably be expanded to width = 2 as
+                // long as it doesn't cross the left edge.
+                row_count = 0;
+            }
+            Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size);
+            sh_row_alloc[th_ix] = path_alloc;
+        }
+    }
+
+    sh_row_count[th_ix] = row_count;
+    // Prefix sum of sh_row_count
+    for (uint i = 0; i < LG_BACKDROP_WG; i++) {
+        barrier(); // make the previous round's writes visible before reading a neighbour
+        if (th_ix >= (1 << i)) {
+            row_count += sh_row_count[th_ix - (1 << i)];
+        }
+        barrier(); // every thread has read before anyone overwrites its slot
+        sh_row_count[th_ix] = row_count;
+    }
+    barrier();
+    // Work assignment: 1 thread : 1 path element row
+    uint total_rows = sh_row_count[BACKDROP_WG - 1]; // inclusive sum: the last slot holds the workgroup total
+    for (uint row = th_ix; row < total_rows; row += BACKDROP_WG) {
+        // Binary search to find element
+        uint el_ix = 0;
+        for (uint i = 0; i < LG_BACKDROP_WG; i++) {
+            uint probe = el_ix + ((BACKDROP_WG / 2) >> i);
+            if (row >= sh_row_count[probe - 1]) { // row lies past everything up to probe - 1, so the owner is at probe or later
+                el_ix = probe;
+            }
+        }
+        uint width = sh_row_width[el_ix];
+        if (width > 0) {
+            // Process one row sequentially
+            // Read backdrop value per tile and prefix sum it
+            Alloc tiles_alloc = sh_row_alloc[el_ix];
+            uint seq_ix = row - (el_ix > 0 ? sh_row_count[el_ix - 1] : 0); // row index within this element
+            uint tile_el_ix = (tiles_alloc.offset >> 2) + 1 + seq_ix * 2 * width; // tiles are 2 words each; + 1 picks the second word of the row's first tile
+            uint sum = read_mem(tiles_alloc, tile_el_ix);
+            for (uint x = 1; x < width; x++) { // the first tile keeps its value; later tiles receive the running sum
+                tile_el_ix += 2;
+                sum += read_mem(tiles_alloc, tile_el_ix);
+                write_mem(tiles_alloc, tile_el_ix, sum);
+            }
+        }
+    }
+}
diff --git a/gpu/shaders/binning.comp b/gpu/shaders/binning.comp
new file mode 100644
index 00000000..4c78cd24
--- /dev/null
+++ b/gpu/shaders/binning.comp
@@ -0,0 +1,152 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// The binning stage of the pipeline.
+//
+// Each workgroup processes N_TILE paths.
+// Each thread processes one path and calculates an N_TILE_X x N_TILE_Y coverage mask
+// based on the path bounding box to bin the paths.
+
+#version 450
+#extension GL_GOOGLE_include_directive : enable
+
+#include "mem.h"
+#include "setup.h"
+
+layout(local_size_x = N_TILE, local_size_y = 1) in;
+
+layout(set = 0, binding = 1) readonly buffer ConfigBuf {
+    Config conf;
+};
+
+#include "annotated.h"
+#include "bins.h"
+
+// scale factors useful for converting coordinates to bins
+#define SX (1.0 / float(N_TILE_X * TILE_WIDTH_PX))
+#define SY (1.0 / float(N_TILE_Y * TILE_HEIGHT_PX))
+
+// Constant not available in GLSL. Also consider uintBitsToFloat(0x7f800000)
+#define INFINITY (1.0 / 0.0)
+
+// Note: cudaraster has N_TILE + 1 to cut down on bank conflicts.
+// Bitmaps are sliced (256bit into 8 (N_SLICE) 32bit submaps)
+shared uint bitmaps[N_SLICE][N_TILE]; // [slice][bin]: bit (thread & 31) of slice (thread / 32) is set when that thread's element covers the bin
+shared uint count[N_SLICE][N_TILE]; // [slice][bin]: running bitCount of bitmaps[0..slice][bin]
+shared Alloc sh_chunk_alloc[N_TILE]; // per-bin output chunk obtained from malloc (written only for non-empty bins)
+shared bool sh_alloc_failed; // set when any thread's malloc reports failure
+
+void main() {
+    if (mem_error != NO_ERROR) { // do no work once a memory error has been flagged
+        return;
+    }
+
+    uint my_n_elements = conf.n_elements;
+    uint my_partition = gl_WorkGroupID.x;
+
+    for (uint i = 0; i < N_SLICE; i++) {
+        bitmaps[i][gl_LocalInvocationID.x] = 0; // each thread clears its own column across all slices
+    }
+    if (gl_LocalInvocationID.x == 0) {
+        sh_alloc_failed = false;
+    }
+    barrier();
+
+    // Read inputs and determine coverage of bins
+    uint element_ix = my_partition * N_TILE + gl_LocalInvocationID.x;
+    AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
+    uint tag = Annotated_Nop; // out-of-range threads behave like a Nop element
+    if (element_ix < my_n_elements) {
+        tag = Annotated_tag(conf.anno_alloc, ref);
+    }
+    int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
+    switch (tag) {
+    case Annotated_Fill:
+    case Annotated_FillTexture:
+    case Annotated_Stroke:
+    case Annotated_BeginClip:
+    case Annotated_EndClip:
+        // Note: we take advantage of the fact that these drawing elements
+        // have the bbox at the same place in their layout.
+        AnnoFill fill = Annotated_Fill_read(conf.anno_alloc, ref);
+        x0 = int(floor(fill.bbox.x * SX));
+        y0 = int(floor(fill.bbox.y * SY));
+        x1 = int(ceil(fill.bbox.z * SX));
+        y1 = int(ceil(fill.bbox.w * SY));
+        break;
+    }
+
+    // At this point, we run an iterator over the coverage area,
+    // trying to keep divergence low.
+    // Right now, it's just a bbox, but we'll get finer with
+    // segments.
+    uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1)/N_TILE_X;
+    uint height_in_bins = (conf.height_in_tiles + N_TILE_Y - 1)/N_TILE_Y;
+    x0 = clamp(x0, 0, int(width_in_bins));
+    x1 = clamp(x1, x0, int(width_in_bins));
+    y0 = clamp(y0, 0, int(height_in_bins));
+    y1 = clamp(y1, y0, int(height_in_bins));
+    if (x0 == x1) y1 = y0; // empty x range: make the y range empty too so the loops below do nothing
+    int x = x0, y = y0;
+    uint my_slice = gl_LocalInvocationID.x / 32; // which 32-bit word of a bin's bitmap holds this thread's bit
+    uint my_mask = 1 << (gl_LocalInvocationID.x & 31); // this thread's bit within that word
+    while (y < y1) {
+        atomicOr(bitmaps[my_slice][y * width_in_bins + x], my_mask);
+        x++;
+        if (x == x1) {
+            x = x0;
+            y++;
+        }
+    }
+
+    barrier();
+    // Allocate output segments.
+    uint element_count = 0; // from here until the next barrier, gl_LocalInvocationID.x is used as a bin index
+    for (uint i = 0; i < N_SLICE; i++) {
+        element_count += bitCount(bitmaps[i][gl_LocalInvocationID.x]);
+        count[i][gl_LocalInvocationID.x] = element_count;
+    }
+    // element_count is number of elements covering bin for this invocation.
+    Alloc chunk_alloc = new_alloc(0, 0);
+    if (element_count != 0) {
+        // TODO: aggregate atomic adds (subgroup is probably fastest)
+        MallocResult chunk = malloc(element_count * BinInstance_size);
+        chunk_alloc = chunk.alloc;
+        sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc;
+        if (chunk.failed) {
+            sh_alloc_failed = true;
+        }
+    }
+    // Note: it might be more efficient for reading to do this in the
+    // other order (each bin is a contiguous sequence of partitions)
+    uint out_ix = (conf.bin_alloc.offset >> 2) + (my_partition * N_TILE + gl_LocalInvocationID.x) * 2; // two words per (partition, bin)
+    write_mem(conf.bin_alloc, out_ix, element_count); // word 0: number of elements in the bin
+    write_mem(conf.bin_alloc, out_ix + 1, chunk_alloc.offset); // word 1: offset of the bin's chunk
+
+    barrier();
+    if (sh_alloc_failed) { // read after the barrier, so the whole workgroup takes the same branch
+        return;
+    }
+
+    // Use similar strategy as Laine & Karras paper; loop over bbox of bins
+    // touched by this element
+    x = x0;
+    y = y0;
+    while (y < y1) {
+        uint bin_ix = y * width_in_bins + x;
+        uint out_mask = bitmaps[my_slice][bin_ix];
+        if ((out_mask & my_mask) != 0) {
+            uint idx = bitCount(out_mask & (my_mask - 1)); // elements of lower-numbered threads in the same slice
+            if (my_slice > 0) {
+                idx += count[my_slice - 1][bin_ix]; // plus everything in the preceding slices
+            }
+            Alloc out_alloc = sh_chunk_alloc[bin_ix];
+            uint out_offset = out_alloc.offset + idx * BinInstance_size;
+            BinInstance_write(out_alloc, BinInstanceRef(out_offset), BinInstance(element_ix));
+        }
+        x++;
+        if (x == x1) {
+            x = x0;
+            y++;
+        }
+    }
+}
diff --git a/gpu/shaders/bins.h b/gpu/shaders/bins.h
new file mode 100644
index 00000000..853adabe
--- /dev/null
+++ b/gpu/shaders/bins.h
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// Code auto-generated by piet-gpu-derive
+
+struct BinInstanceRef {
+    uint offset; // byte offset of a BinInstance record; accessors use offset >> 2 as the word index
+};
+
+struct BinInstance {
+    uint element_ix; // serialized as the record's single word
+};
+
+#define BinInstance_size 4
+
+BinInstanceRef BinInstance_index(BinInstanceRef ref, uint index) {
+    return BinInstanceRef(ref.offset + index * BinInstance_size); // ref advanced by `index` records of BinInstance_size bytes
+}
+
+// Deserialize a BinInstance from allocation `a`: a single word holding element_ix.
+BinInstance BinInstance_read(Alloc a, BinInstanceRef ref) {
+    BinInstance inst;
+    // ref.offset is shifted down by 2 to obtain the word index passed to read_mem.
+    inst.element_ix = read_mem(a, (ref.offset >> 2) + 0);
+    return inst;
+}
+
+void BinInstance_write(Alloc a, BinInstanceRef ref, BinInstance s) {
+    // The record is one word wide, so element_ix goes straight to word ref.offset >> 2.
+    write_mem(a, (ref.offset >> 2) + 0, s.element_ix);
+}
+
diff --git a/gpu/shaders/coarse.comp b/gpu/shaders/coarse.comp
new file mode 100644
index 00000000..cbc69307
--- /dev/null
+++ b/gpu/shaders/coarse.comp
@@ -0,0 +1,430 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// The coarse rasterizer stage of the pipeline.
+//
+// As input we have the ordered partitions of paths from the binning phase and
+// the annotated tile list of segments and backdrop per path.
+//
+// Each workgroup operates on one bin, stream compacting
+// the elements corresponding to that bin.
+//
+// As output we have an ordered command stream per tile. Every tile from a path (backdrop + segment list) will be encoded.
+
+#version 450
+#extension GL_GOOGLE_include_directive : enable
+
+#include "mem.h"
+#include "setup.h"
+
+layout(local_size_x = N_TILE, local_size_y = 1) in;
+
+layout(set = 0, binding = 1) readonly buffer ConfigBuf {
+    Config conf;
+};
+
+#include "annotated.h"
+#include "bins.h"
+#include "tile.h"
+#include "ptcl.h"
+
+#define LG_N_PART_READ (7 + LG_WG_FACTOR)
+#define N_PART_READ (1 << LG_N_PART_READ)
+
+shared uint sh_elements[N_TILE];
+
+// Number of elements in the partition; prefix sum.
+shared uint sh_part_count[N_PART_READ];
+shared Alloc sh_part_elements[N_PART_READ];
+
+shared uint sh_bitmaps[N_SLICE][N_TILE];
+
+shared uint sh_tile_count[N_TILE];
+// The width of the tile rect for the element, intersected with this bin
+shared uint sh_tile_width[N_TILE];
+shared uint sh_tile_x0[N_TILE];
+shared uint sh_tile_y0[N_TILE];
+
+// These are set up so base + tile_y * stride + tile_x points to a Tile.
+shared uint sh_tile_base[N_TILE];
+shared uint sh_tile_stride[N_TILE];
+
+#ifdef MEM_DEBUG
+// Per-element tile Allocs are kept in shared memory only under MEM_DEBUG; other builds avoid that shared memory traffic.
+shared Alloc sh_tile_alloc[N_TILE];
+
+void write_tile_alloc(uint el_ix, Alloc a) { // MEM_DEBUG: remember the Alloc for element slot el_ix
+    sh_tile_alloc[el_ix] = a;
+}
+
+Alloc read_tile_alloc(uint el_ix) { // MEM_DEBUG: return the Alloc stored by write_tile_alloc for el_ix
+    return sh_tile_alloc[el_ix];
+}
+#else
+void write_tile_alloc(uint el_ix, Alloc a) {
+    // No-op: nothing is stored outside MEM_DEBUG builds.
+}
+
+Alloc read_tile_alloc(uint el_ix) {
+    // No stored Alloc: return one spanning all of `memory` (length() * 4; presumably 4-byte words -- confirm in mem.h).
+    return new_alloc(0, memory.length()*4);
+}
+#endif
+
+// Perhaps cmd_alloc should be a global? This is a style question.
+bool alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit) {
+    if (cmd_ref.offset >= cmd_limit) {
+        MallocResult chunk = malloc(PTCL_INITIAL_ALLOC);
+        if (chunk.failed) {
+            return false;
+        }
+        // Link the exhausted chunk to the fresh one before switching over to it.
+        Cmd_Jump_write(cmd_alloc, cmd_ref, CmdJump(chunk.alloc.offset));
+        cmd_alloc = chunk.alloc;
+        cmd_ref = CmdRef(cmd_alloc.offset);
+        // The limit stops 2 * Cmd_size short of the chunk end (presumably room for a last command plus the jump).
+        cmd_limit = cmd_alloc.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
+    }
+    return true;
+}
+
+void main() {
+    if (mem_error != NO_ERROR) {
+        return;
+    }
+
+    // Could use either linear or 2d layouts for both dispatch and
+    // invocations within the workgroup. We'll use variables to abstract.
+    uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1)/N_TILE_X;
+    uint bin_ix = width_in_bins * gl_WorkGroupID.y + gl_WorkGroupID.x;
+    uint partition_ix = 0;
+    uint n_partitions = (conf.n_elements + N_TILE - 1) / N_TILE;
+    uint th_ix = gl_LocalInvocationID.x;
+
+    // Coordinates of top left of bin, in tiles.
+    uint bin_tile_x = N_TILE_X * gl_WorkGroupID.x;
+    uint bin_tile_y = N_TILE_Y * gl_WorkGroupID.y;
+
+    // Per-tile state
+    uint tile_x = gl_LocalInvocationID.x % N_TILE_X;
+    uint tile_y = gl_LocalInvocationID.x / N_TILE_X;
+    uint this_tile_ix = (bin_tile_y + tile_y) * conf.width_in_tiles + bin_tile_x + tile_x;
+    Alloc cmd_alloc = slice_mem(conf.ptcl_alloc, this_tile_ix * PTCL_INITIAL_ALLOC, PTCL_INITIAL_ALLOC);
+    CmdRef cmd_ref = CmdRef(cmd_alloc.offset);
+    uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
+    // The nesting depth of the clip stack
+    uint clip_depth = 0;
+    // State for the "clip zero" optimization. If it's nonzero, then we are
+    // currently in a clip for which the entire tile has an alpha of zero, and
+    // the value is the depth after the "begin clip" of that element.
+    uint clip_zero_depth = 0;
+    // State for the "clip one" optimization. If bit `i` is set, then that means
+    // that the clip pushed at depth `i` has an alpha of all one.
+    uint clip_one_mask = 0;
+
+    // I'm sure we can figure out how to do this with at least one fewer register...
+    // Items up to rd_ix have been read from sh_elements
+    uint rd_ix = 0;
+    // Items up to wr_ix have been written into sh_elements
+    uint wr_ix = 0;
+    // Items between part_start_ix and ready_ix are ready to be transferred from sh_part_elements
+    uint part_start_ix = 0;
+    uint ready_ix = 0;
+
+    while (true) {
+        for (uint i = 0; i < N_SLICE; i++) {
+            sh_bitmaps[i][th_ix] = 0;
+        }
+
+        // parallel read of input partitions
+        do {
+            if (ready_ix == wr_ix && partition_ix < n_partitions) {
+                part_start_ix = ready_ix;
+                uint count = 0;
+                if (th_ix < N_PART_READ && partition_ix + th_ix < n_partitions) {
+                    uint in_ix = (conf.bin_alloc.offset >> 2) + ((partition_ix + th_ix) * N_TILE + bin_ix) * 2;
+                    count = read_mem(conf.bin_alloc, in_ix);
+                    uint offset = read_mem(conf.bin_alloc, in_ix + 1);
+                    sh_part_elements[th_ix] = new_alloc(offset, count*BinInstance_size);
+                }
+                // prefix sum of counts
+                for (uint i = 0; i < LG_N_PART_READ; i++) {
+                    if (th_ix < N_PART_READ) {
+                        sh_part_count[th_ix] = count;
+                    }
+                    barrier();
+                    if (th_ix < N_PART_READ) {
+                        if (th_ix >= (1 << i)) {
+                            count += sh_part_count[th_ix - (1 << i)];
+                        }
+                    }
+                    barrier();
+                }
+                if (th_ix < N_PART_READ) {
+                    sh_part_count[th_ix] = part_start_ix + count;
+                }
+                barrier();
+                ready_ix = sh_part_count[N_PART_READ - 1];
+                partition_ix += N_PART_READ;
+            }
+            // use binary search to find element to read
+            uint ix = rd_ix + th_ix;
+            if (ix >= wr_ix && ix < ready_ix) {
+                uint part_ix = 0;
+                for (uint i = 0; i < LG_N_PART_READ; i++) {
+                    uint probe = part_ix + ((N_PART_READ / 2) >> i);
+                    if (ix >= sh_part_count[probe - 1]) {
+                        part_ix = probe;
+                    }
+                }
+                ix -= part_ix > 0 ? sh_part_count[part_ix - 1] : part_start_ix;
+                Alloc bin_alloc = sh_part_elements[part_ix];
+                BinInstanceRef inst_ref = BinInstanceRef(bin_alloc.offset);
+                BinInstance inst = BinInstance_read(bin_alloc, BinInstance_index(inst_ref, ix));
+                sh_elements[th_ix] = inst.element_ix;
+            }
+            barrier();
+
+            wr_ix = min(rd_ix + N_TILE, ready_ix);
+        } while (wr_ix - rd_ix < N_TILE && (wr_ix < ready_ix || partition_ix < n_partitions));
+
+        // We've done the merge and filled the buffer.
+
+        // Read one element, compute coverage.
+        uint tag = Annotated_Nop;
+        uint element_ix;
+        AnnotatedRef ref;
+        if (th_ix + rd_ix < wr_ix) {
+            element_ix = sh_elements[th_ix];
+            ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
+            tag = Annotated_tag(conf.anno_alloc, ref);
+        }
+
+        // Bounding box of element in pixel coordinates.
+        uint tile_count;
+        switch (tag) {
+        case Annotated_Fill:
+        case Annotated_FillTexture:
+        case Annotated_Stroke:
+        case Annotated_BeginClip:
+        case Annotated_EndClip:
+            // We have one "path" for each element, even if the element isn't
+            // actually a path (currently EndClip, but images etc in the future).
+            uint path_ix = element_ix;
+            Path path = Path_read(conf.tile_alloc, PathRef(conf.tile_alloc.offset + path_ix * Path_size));
+            uint stride = path.bbox.z - path.bbox.x;
+            sh_tile_stride[th_ix] = stride;
+            int dx = int(path.bbox.x) - int(bin_tile_x);
+            int dy = int(path.bbox.y) - int(bin_tile_y);
+            int x0 = clamp(dx, 0, N_TILE_X);
+            int y0 = clamp(dy, 0, N_TILE_Y);
+            int x1 = clamp(int(path.bbox.z) - int(bin_tile_x), 0, N_TILE_X);
+            int y1 = clamp(int(path.bbox.w) - int(bin_tile_y), 0, N_TILE_Y);
+            sh_tile_width[th_ix] = uint(x1 - x0);
+            sh_tile_x0[th_ix] = x0;
+            sh_tile_y0[th_ix] = y0;
+            tile_count = uint(x1 - x0) * uint(y1 - y0);
+            // base relative to bin
+            uint base = path.tiles.offset - uint(dy * stride + dx) * Tile_size;
+            sh_tile_base[th_ix] = base;
+            Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size);
+            write_tile_alloc(th_ix, path_alloc);
+            break;
+        default:
+            tile_count = 0;
+            break;
+        }
+
+        // Prefix sum of sh_tile_count
+        sh_tile_count[th_ix] = tile_count;
+        for (uint i = 0; i < LG_N_TILE; i++) {
+            barrier();
+            if (th_ix >= (1 << i)) {
+                tile_count += sh_tile_count[th_ix - (1 << i)];
+            }
+            barrier();
+            sh_tile_count[th_ix] = tile_count;
+        }
+        barrier();
+        uint total_tile_count = sh_tile_count[N_TILE - 1];
+        for (uint ix = th_ix; ix < total_tile_count; ix += N_TILE) {
+            // Binary search to find element
+            uint el_ix = 0;
+            for (uint i = 0; i < LG_N_TILE; i++) {
+                uint probe = el_ix + ((N_TILE / 2) >> i);
+                if (ix >= sh_tile_count[probe - 1]) {
+                    el_ix = probe;
+                }
+            }
+            AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + sh_elements[el_ix] * Annotated_size);
+            uint tag = Annotated_tag(conf.anno_alloc, ref);
+            uint seq_ix = ix - (el_ix > 0 ? sh_tile_count[el_ix - 1] : 0);
+            uint width = sh_tile_width[el_ix];
+            uint x = sh_tile_x0[el_ix] + seq_ix % width;
+            uint y = sh_tile_y0[el_ix] + seq_ix / width;
+            bool include_tile;
+            if (tag == Annotated_BeginClip || tag == Annotated_EndClip) {
+                include_tile = true;
+            } else {
+                Tile tile = Tile_read(read_tile_alloc(el_ix), TileRef(sh_tile_base[el_ix] + (sh_tile_stride[el_ix] * y + x) * Tile_size));
+                // Include the path in the tile if
+                // - the tile contains at least a segment (tile offset non-zero)
+                // - the tile is completely covered (backdrop non-zero)
+                include_tile = tile.tile.offset != 0 || tile.backdrop != 0;
+            }
+            if (include_tile) {
+                uint el_slice = el_ix / 32;
+                uint el_mask = 1 << (el_ix & 31);
+                atomicOr(sh_bitmaps[el_slice][y * N_TILE_X + x], el_mask);
+            }
+        }
+
+        barrier();
+
+        // Output non-segment elements for this tile. The thread does a sequential walk
+        // through the non-segment elements.
+        uint slice_ix = 0;
+        uint bitmap = sh_bitmaps[0][th_ix];
+        while (true) {
+            if (bitmap == 0) {
+                slice_ix++;
+                if (slice_ix == N_SLICE) {
+                    break;
+                }
+                bitmap = sh_bitmaps[slice_ix][th_ix];
+                if (bitmap == 0) {
+                    continue;
+                }
+            }
+            uint element_ref_ix = slice_ix * 32 + findLSB(bitmap);
+            uint element_ix = sh_elements[element_ref_ix];
+
+            // Clear LSB
+            bitmap &= bitmap - 1;
+
+            // At this point, we read the element again from global memory.
+            // If that turns out to be expensive, maybe we can pack it into
+            // shared memory (or perhaps just the tag).
+            ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
+            tag = Annotated_tag(conf.anno_alloc, ref);
+
+            if (clip_zero_depth == 0) {
+                switch (tag) {
+                case Annotated_Fill:
+                    Tile tile = Tile_read(read_tile_alloc(element_ref_ix), TileRef(sh_tile_base[element_ref_ix]
+                        + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
+                    AnnoFill fill = Annotated_Fill_read(conf.anno_alloc, ref);
+                    if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
+                        break;
+                    }
+                    if (tile.tile.offset != 0) {
+                        CmdFill cmd_fill;
+                        cmd_fill.tile_ref = tile.tile.offset;
+                        cmd_fill.backdrop = tile.backdrop;
+                        cmd_fill.rgba_color = fill.rgba_color;
+                        Cmd_Fill_write(cmd_alloc, cmd_ref, cmd_fill);
+                    } else {
+                        Cmd_Solid_write(cmd_alloc, cmd_ref, CmdSolid(fill.rgba_color));
+                    }
+                    cmd_ref.offset += Cmd_size;
+                    break;
+                case Annotated_FillTexture:
+                    tile = Tile_read(read_tile_alloc(element_ref_ix), TileRef(sh_tile_base[element_ref_ix]
+                        + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
+                    AnnoFillTexture fill_tex = Annotated_FillTexture_read(conf.anno_alloc, ref);
+                    if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
+                        break;
+                    }
+                    if (tile.tile.offset != 0) {
+                        CmdFillTexture cmd_fill_tex;
+                        cmd_fill_tex.tile_ref = tile.tile.offset;
+                        cmd_fill_tex.backdrop = tile.backdrop;
+                        cmd_fill_tex.mat = fill_tex.mat;
+                        cmd_fill_tex.translate = fill_tex.translate;
+                        cmd_fill_tex.uv_bounds = fill_tex.uv_bounds;
+                        Cmd_FillTexture_write(cmd_alloc, cmd_ref, cmd_fill_tex);
+                    } else {
+                        CmdSolidTexture cmd_solid_tex;
+                        cmd_solid_tex.mat = fill_tex.mat;
+                        cmd_solid_tex.translate = fill_tex.translate;
+                        cmd_solid_tex.uv_bounds = fill_tex.uv_bounds;
+                        Cmd_SolidTexture_write(cmd_alloc, cmd_ref, cmd_solid_tex);
+                    }
+                    cmd_ref.offset += Cmd_size;
+                    break;
+                case Annotated_BeginClip:
+                    tile = Tile_read(read_tile_alloc(element_ref_ix), TileRef(sh_tile_base[element_ref_ix]
+                        + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
+                    if (tile.tile.offset == 0 && tile.backdrop == 0) {
+                        clip_zero_depth = clip_depth + 1;
+                    } else if (tile.tile.offset == 0 && clip_depth < 32) {
+                        clip_one_mask |= (1 << clip_depth);
+                    } else {
+                        if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
+                            break;
+                        }
+                        if (tile.tile.offset != 0) {
+                            CmdBeginClip cmd_begin_clip;
+                            cmd_begin_clip.tile_ref = tile.tile.offset;
+                            cmd_begin_clip.backdrop = tile.backdrop;
+                            Cmd_BeginClip_write(cmd_alloc, cmd_ref, cmd_begin_clip);
+                        } else {
+                            // TODO: here is where a bunch of optimization magic should happen
+                            float alpha = tile.backdrop == 0 ? 0.0 : 1.0;
+                            Cmd_BeginSolidClip_write(cmd_alloc, cmd_ref, CmdBeginSolidClip(alpha));
+                        }
+                        cmd_ref.offset += Cmd_size;
+                        if (clip_depth < 32) {
+                            clip_one_mask &= ~(1 << clip_depth);
+                        }
+                    }
+                    clip_depth++;
+                    break;
+                case Annotated_EndClip:
+                    clip_depth--;
+                    if (clip_depth >= 32 || (clip_one_mask & (1 << clip_depth)) == 0) {
+                        if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
+                            break;
+                        }
+                        Cmd_EndClip_write(cmd_alloc, cmd_ref, CmdEndClip(1.0));
+                        cmd_ref.offset += Cmd_size;
+                    }
+                    break;
+                case Annotated_Stroke:
+                    tile = Tile_read(read_tile_alloc(element_ref_ix), TileRef(sh_tile_base[element_ref_ix]
+                        + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
+                    AnnoStroke stroke = Annotated_Stroke_read(conf.anno_alloc, ref);
+                    CmdStroke cmd_stroke;
+                    cmd_stroke.tile_ref = tile.tile.offset;
+                    cmd_stroke.half_width = 0.5 * stroke.linewidth;
+                    cmd_stroke.rgba_color = stroke.rgba_color;
+                    if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
+                        break;
+                    }
+                    Cmd_Stroke_write(cmd_alloc, cmd_ref, cmd_stroke);
+                    cmd_ref.offset += Cmd_size;
+                    break;
+                }
+            } else {
+                // In "clip zero" state, suppress all drawing
+                switch (tag) {
+                case Annotated_BeginClip:
+                    clip_depth++;
+                    break;
+                case Annotated_EndClip:
+                    if (clip_depth == clip_zero_depth) {
+                        clip_zero_depth = 0;
+                    }
+                    clip_depth--;
+                    break;
+                }
+            }
+        }
+        barrier();
+
+        rd_ix += N_TILE;
+        if (rd_ix >= ready_ix && partition_ix >= n_partitions) break;
+    }
+    if (bin_tile_x + tile_x < conf.width_in_tiles && bin_tile_y + tile_y < conf.height_in_tiles) {
+        Cmd_End_write(cmd_alloc, cmd_ref);
+    }
+}
diff --git a/gpu/shaders/elements.comp b/gpu/shaders/elements.comp
new file mode 100644
index 00000000..a43c270f
--- /dev/null
+++ b/gpu/shaders/elements.comp
@@ -0,0 +1,441 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// The element processing stage, first in the pipeline.
+//
+// This stage is primarily about applying transforms and computing bounding
+// boxes. It is organized as a scan over the input elements, producing
+// annotated output elements.
+
+#version 450
+#extension GL_GOOGLE_include_directive : enable
+
+#include "mem.h"
+#include "setup.h"
+
+#define N_ROWS 4
+#define WG_SIZE 32
+#define LG_WG_SIZE 5
+#define PARTITION_SIZE (WG_SIZE * N_ROWS)
+
+layout(local_size_x = WG_SIZE, local_size_y = 1) in;
+
+layout(set = 0, binding = 1) readonly buffer ConfigBuf {
+    Config conf;
+};
+
+layout(set = 0, binding = 2) readonly buffer SceneBuf {
+    uint[] scene;
+};
+
+// It would be better to use the Vulkan memory model than
+// "volatile" but shooting for compatibility here rather
+// than doing things right.
+layout(set = 0, binding = 3) volatile buffer StateBuf {
+    uint part_counter; // atomically incremented in main() to hand out partition indices
+    uint[] state; // flag words are indexed via state_flag_index(); presumably also holds the State records addressed by state_*_ref() - confirm in state.h
+};
+
+#include "scene.h"
+#include "state.h"
+#include "annotated.h"
+#include "pathseg.h"
+
+#define StateBuf_stride (4 + 2 * State_size)
+
+StateRef state_aggregate_ref(uint partition_ix) { // aggregate State sits 4 bytes into the partition's slot
+    return StateRef(partition_ix * StateBuf_stride + 4);
+}
+
+StateRef state_prefix_ref(uint partition_ix) { // prefix State follows the aggregate State in the slot
+    return StateRef(partition_ix * StateBuf_stride + (4 + State_size));
+}
+
+uint state_flag_index(uint partition_ix) { // index into the uint array `state`, hence the / 4
+    return (StateBuf_stride / 4) * partition_ix;
+}
+
+// These correspond to X, A, P respectively in the prefix sum paper.
+#define FLAG_NOT_READY 0
+#define FLAG_AGGREGATE_READY 1
+#define FLAG_PREFIX_READY 2
+
+#define FLAG_SET_LINEWIDTH 1
+#define FLAG_SET_BBOX 2
+#define FLAG_RESET_BBOX 4
+
+// This is almost like a monoid (the interaction between transformation and
+// bounding boxes is approximate)
+State combine_state(State a, State b) {
+    // `a` comes first in scan order and `b` second. The flag tests below decide whether
+    // b's bbox (mapped through a's transform) replaces, extends, or yields to a's.
+    bool a_no_reset = (a.flags & FLAG_RESET_BBOX) == 0;
+    bool b_bbox_empty = b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y;
+    State c;
+    // Axis-aligned bounds of b.bbox after applying a's matrix and translation.
+    c.bbox.xy = min(a.mat.xy * b.bbox.x, a.mat.xy * b.bbox.z) + min(a.mat.zw * b.bbox.y, a.mat.zw * b.bbox.w) + a.translate;
+    c.bbox.zw = max(a.mat.xy * b.bbox.x, a.mat.xy * b.bbox.z) + max(a.mat.zw * b.bbox.y, a.mat.zw * b.bbox.w) + a.translate;
+    if (a_no_reset && b_bbox_empty) {
+        c.bbox = a.bbox;
+    } else if (a_no_reset && (b.flags & FLAG_SET_BBOX) == 0 &&
+        (a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y))
+    {
+        c.bbox.xy = min(a.bbox.xy, c.bbox.xy);
+        c.bbox.zw = max(a.bbox.zw, c.bbox.zw);
+    }
+    // Compose the 2x2 matrices (columns stored as xy and zw) and the translations.
+    c.mat.xy = a.mat.xy * b.mat.x + a.mat.zw * b.mat.y;
+    c.mat.zw = a.mat.xy * b.mat.z + a.mat.zw * b.mat.w;
+    c.translate = a.mat.xy * b.translate.x + a.mat.zw * b.translate.y + a.translate;
+    c.linewidth = (b.flags & FLAG_SET_LINEWIDTH) != 0 ? b.linewidth : a.linewidth;
+    c.flags = (a.flags & (FLAG_SET_LINEWIDTH | FLAG_SET_BBOX)) | b.flags;
+    c.flags |= (a.flags & FLAG_RESET_BBOX) >> 1; // a's RESET_BBOX (4) shows up as SET_BBOX (2) in c
+    c.path_count = a.path_count + b.path_count;
+    c.pathseg_count = a.pathseg_count + b.pathseg_count;
+    return c;
+}
+
+State map_element(ElementRef ref) {
+    // Map one scene element to the State it contributes to the scan.
+    // TODO: less divergent reads are *probably* faster, at the cost of wasted memory.
+    State st;
+    uint kind = Element_tag(ref);
+    st.mat = vec4(1.0, 0.0, 0.0, 1.0); // identity transform
+    st.translate = vec2(0.0);
+    st.bbox = vec4(0.0);
+    st.linewidth = 1.0; // TODO should be 0.0
+    st.flags = 0;
+    st.path_count = 0;
+    st.pathseg_count = 0;
+    switch (kind) {
+    case Element_FillLine:
+    case Element_StrokeLine:
+        LineSeg seg = Element_FillLine_read(ref);
+        st.bbox.xy = min(seg.p0, seg.p1);
+        st.bbox.zw = max(seg.p0, seg.p1);
+        st.pathseg_count = 1;
+        break;
+    case Element_FillQuad:
+    case Element_StrokeQuad:
+        QuadSeg q = Element_FillQuad_read(ref);
+        st.bbox.xy = min(min(q.p0, q.p1), q.p2);
+        st.bbox.zw = max(max(q.p0, q.p1), q.p2);
+        st.pathseg_count = 1;
+        break;
+    case Element_FillCubic:
+    case Element_StrokeCubic:
+        CubicSeg cu = Element_FillCubic_read(ref);
+        st.bbox.xy = min(min(cu.p0, cu.p1), min(cu.p2, cu.p3));
+        st.bbox.zw = max(max(cu.p0, cu.p1), max(cu.p2, cu.p3));
+        st.pathseg_count = 1;
+        break;
+    case Element_Fill:
+    case Element_FillTexture:
+    case Element_Stroke:
+    case Element_BeginClip:
+        st.flags = FLAG_RESET_BBOX; // these elements count as a path and reset the bbox
+        st.path_count = 1;
+        break;
+    case Element_EndClip:
+        st.path_count = 1; // counted as a path, but without the bbox reset
+        break;
+    case Element_SetLineWidth:
+        SetLineWidth width_elem = Element_SetLineWidth_read(ref);
+        st.linewidth = width_elem.width;
+        st.flags = FLAG_SET_LINEWIDTH;
+        break;
+    case Element_Transform:
+        Transform xform = Element_Transform_read(ref);
+        st.mat = xform.mat;
+        st.translate = xform.translate;
+        break;
+    }
+    return st;
+}
+
+// Get the bounding box of a circle transformed by the matrix into an ellipse.
+vec2 get_linewidth(State st) {
+    float half_width = 0.5 * st.linewidth; // see https://www.iquilezles.org/www/articles/ellipses/ellipses.htm
+    return half_width * vec2(length(st.mat.xz), length(st.mat.yw));
+}
+
+// We should be able to use an array of structs but the NV shader compiler
+// doesn't seem to like it :/
+//shared State sh_state[WG_SIZE];
+shared vec4 sh_mat[WG_SIZE]; // State.mat, one slot per invocation
+shared vec2 sh_translate[WG_SIZE]; // State.translate
+shared vec4 sh_bbox[WG_SIZE]; // State.bbox
+shared float sh_width[WG_SIZE]; // State.linewidth
+shared uint sh_flags[WG_SIZE]; // State.flags
+shared uint sh_path_count[WG_SIZE]; // State.path_count
+shared uint sh_pathseg_count[WG_SIZE]; // State.pathseg_count
+
+shared uint sh_part_ix; // partition index claimed by invocation 0 in main()
+shared State sh_prefix; // exclusive prefix written by the last invocation in main()
+
+void main() {
+    if (mem_error != NO_ERROR) {
+        return;
+    }
+
+    State th_state[N_ROWS]; // inclusive scan over this invocation's N_ROWS elements
+    // Determine partition to process by atomic counter (described in Section
+    // 4.4 of prefix sum paper).
+    if (gl_LocalInvocationID.x == 0) {
+        sh_part_ix = atomicAdd(part_counter, 1);
+    }
+    barrier();
+    uint part_ix = sh_part_ix;
+
+    uint ix = part_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS;
+    ElementRef ref = ElementRef(ix * Element_size);
+
+    th_state[0] = map_element(ref);
+    for (uint i = 1; i < N_ROWS; i++) {
+        // discussion question: would it be faster to load using more coherent patterns
+        // into thread memory? This is kinda strided.
+        th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i)));
+    }
+    State agg = th_state[N_ROWS - 1]; // this invocation's total, scanned across the workgroup below
+    sh_mat[gl_LocalInvocationID.x] = agg.mat;
+    sh_translate[gl_LocalInvocationID.x] = agg.translate;
+    sh_bbox[gl_LocalInvocationID.x] = agg.bbox;
+    sh_width[gl_LocalInvocationID.x] = agg.linewidth;
+    sh_flags[gl_LocalInvocationID.x] = agg.flags;
+    sh_path_count[gl_LocalInvocationID.x] = agg.path_count;
+    sh_pathseg_count[gl_LocalInvocationID.x] = agg.pathseg_count;
+    for (uint i = 0; i < LG_WG_SIZE; i++) {
+        barrier();
+        if (gl_LocalInvocationID.x >= (1 << i)) {
+            State other;
+            uint ix = gl_LocalInvocationID.x - (1 << i);
+            other.mat = sh_mat[ix];
+            other.translate = sh_translate[ix];
+            other.bbox = sh_bbox[ix];
+            other.linewidth = sh_width[ix];
+            other.flags = sh_flags[ix];
+            other.path_count = sh_path_count[ix];
+            other.pathseg_count = sh_pathseg_count[ix];
+            agg = combine_state(other, agg);
+        }
+        barrier();
+        sh_mat[gl_LocalInvocationID.x] = agg.mat;
+        sh_translate[gl_LocalInvocationID.x] = agg.translate;
+        sh_bbox[gl_LocalInvocationID.x] = agg.bbox;
+        sh_width[gl_LocalInvocationID.x] = agg.linewidth;
+        sh_flags[gl_LocalInvocationID.x] = agg.flags;
+        sh_path_count[gl_LocalInvocationID.x] = agg.path_count;
+        sh_pathseg_count[gl_LocalInvocationID.x] = agg.pathseg_count;
+    }
+
+    State exclusive;
+    exclusive.bbox = vec4(0.0, 0.0, 0.0, 0.0);
+    exclusive.mat = vec4(1.0, 0.0, 0.0, 1.0);
+    exclusive.translate = vec2(0.0, 0.0);
+    exclusive.linewidth = 1.0; //TODO should be 0.0
+    exclusive.flags = 0;
+    exclusive.path_count = 0;
+    exclusive.pathseg_count = 0;
+
+    // Publish aggregate for this partition
+    if (gl_LocalInvocationID.x == WG_SIZE - 1) {
+        // Note: with memory model, we'd want to generate the atomic store version of this.
+        State_write(state_aggregate_ref(part_ix), agg);
+        uint flag = FLAG_AGGREGATE_READY;
+        if (part_ix == 0) {
+            State_write(state_prefix_ref(part_ix), agg);
+            flag = FLAG_PREFIX_READY;
+        }
+        memoryBarrierBuffer(); // order every State write above before the flag store below
+        state[state_flag_index(part_ix)] = flag;
+        if (part_ix != 0) {
+            // step 4 of paper: decoupled lookback
+            uint look_back_ix = part_ix - 1;
+
+            State their_agg;
+            uint their_ix = 0;
+            while (true) {
+                flag = state[state_flag_index(look_back_ix)];
+                if (flag == FLAG_PREFIX_READY) {
+                    State their_prefix = State_read(state_prefix_ref(look_back_ix));
+                    exclusive = combine_state(their_prefix, exclusive);
+                    break;
+                } else if (flag == FLAG_AGGREGATE_READY) {
+                    their_agg = State_read(state_aggregate_ref(look_back_ix));
+                    exclusive = combine_state(their_agg, exclusive);
+                    look_back_ix--;
+                    their_ix = 0;
+                    continue;
+                }
+                // else spin
+
+                // Unfortunately there's no guarantee of forward progress of other
+                // workgroups, so compute a bit of the aggregate before trying again.
+                // In the worst case, spinning stops when the aggregate is complete.
+                ElementRef ref = ElementRef((look_back_ix * PARTITION_SIZE + their_ix) * Element_size);
+                State s = map_element(ref);
+                if (their_ix == 0) {
+                    their_agg = s;
+                } else {
+                    their_agg = combine_state(their_agg, s);
+                }
+                their_ix++;
+                if (their_ix == PARTITION_SIZE) {
+                    exclusive = combine_state(their_agg, exclusive);
+                    if (look_back_ix == 0) {
+                        break;
+                    }
+                    look_back_ix--;
+                    their_ix = 0;
+                }
+            }
+
+            // step 5 of paper: compute inclusive prefix
+            State inclusive_prefix = combine_state(exclusive, agg);
+            sh_prefix = exclusive;
+            State_write(state_prefix_ref(part_ix), inclusive_prefix);
+            memoryBarrierBuffer();
+            flag = FLAG_PREFIX_READY;
+            state[state_flag_index(part_ix)] = flag;
+        }
+    }
+    barrier();
+    if (part_ix != 0) {
+        exclusive = sh_prefix;
+    }
+
+    State row = exclusive; // state in effect just before this invocation's first element
+    if (gl_LocalInvocationID.x > 0) {
+        uint ix = gl_LocalInvocationID.x - 1;
+        State other;
+        other.mat = sh_mat[ix];
+        other.translate = sh_translate[ix];
+        other.bbox = sh_bbox[ix];
+        other.linewidth = sh_width[ix];
+        other.flags = sh_flags[ix];
+        other.path_count = sh_path_count[ix];
+        other.pathseg_count = sh_pathseg_count[ix];
+        row = combine_state(row, other);
+    }
+    for (uint i = 0; i < N_ROWS; i++) {
+        State st = combine_state(row, th_state[i]);
+
+        // Here we read again from the original scene. There may be
+        // gains to be had from stashing in shared memory or possibly
+        // registers (though register pressure is an issue).
+        ElementRef this_ref = Element_index(ref, i);
+        uint tag = Element_tag(this_ref);
+        switch (tag) {
+        case Element_FillLine:
+        case Element_StrokeLine:
+            LineSeg line = Element_StrokeLine_read(this_ref);
+            vec2 p0 = st.mat.xy * line.p0.x + st.mat.zw * line.p0.y + st.translate;
+            vec2 p1 = st.mat.xy * line.p1.x + st.mat.zw * line.p1.y + st.translate;
+            PathStrokeCubic path_cubic;
+            path_cubic.p0 = p0;
+            path_cubic.p1 = mix(p0, p1, 1.0 / 3.0);
+            path_cubic.p2 = mix(p1, p0, 1.0 / 3.0);
+            path_cubic.p3 = p1;
+            path_cubic.path_ix = st.path_count;
+            if (tag == Element_StrokeLine) {
+                path_cubic.stroke = get_linewidth(st);
+            } else {
+                path_cubic.stroke = vec2(0.0);
+            }
+            // We do encoding a bit by hand to minimize divergence. Another approach
+            // would be to have a fill/stroke bool.
+            PathSegRef path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size);
+            uint out_tag = tag == Element_FillLine ? PathSeg_FillCubic : PathSeg_StrokeCubic;
+            write_mem(conf.pathseg_alloc, path_out_ref.offset >> 2, out_tag);
+            PathStrokeCubic_write(conf.pathseg_alloc, PathStrokeCubicRef(path_out_ref.offset + 4), path_cubic);
+            break;
+        case Element_FillQuad:
+        case Element_StrokeQuad:
+            QuadSeg quad = Element_StrokeQuad_read(this_ref);
+            p0 = st.mat.xy * quad.p0.x + st.mat.zw * quad.p0.y + st.translate;
+            p1 = st.mat.xy * quad.p1.x + st.mat.zw * quad.p1.y + st.translate;
+            vec2 p2 = st.mat.xy * quad.p2.x + st.mat.zw * quad.p2.y + st.translate;
+            // Elevate the quadratic to a cubic: inner control points lie 2/3 of the way toward p1.
+            path_cubic.p0 = p0;
+            path_cubic.p1 = mix(p1, p0, 1.0 / 3.0);
+            path_cubic.p2 = mix(p1, p2, 1.0 / 3.0);
+            path_cubic.p3 = p2;
+            path_cubic.path_ix = st.path_count;
+            if (tag == Element_StrokeQuad) {
+                path_cubic.stroke = get_linewidth(st);
+            } else {
+                path_cubic.stroke = vec2(0.0);
+            }
+            // We do encoding a bit by hand to minimize divergence. Another approach
+            // would be to have a fill/stroke bool.
+            path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size);
+            out_tag = tag == Element_FillQuad ? PathSeg_FillCubic : PathSeg_StrokeCubic;
+            write_mem(conf.pathseg_alloc, path_out_ref.offset >> 2, out_tag);
+            PathStrokeCubic_write(conf.pathseg_alloc, PathStrokeCubicRef(path_out_ref.offset + 4), path_cubic);
+            break;
+        case Element_FillCubic:
+        case Element_StrokeCubic:
+            CubicSeg cubic = Element_StrokeCubic_read(this_ref);
+            // Cubics are written out directly; only the four control points are transformed.
+            path_cubic.p0 = st.mat.xy * cubic.p0.x + st.mat.zw * cubic.p0.y + st.translate;
+            path_cubic.p1 = st.mat.xy * cubic.p1.x + st.mat.zw * cubic.p1.y + st.translate;
+            path_cubic.p2 = st.mat.xy * cubic.p2.x + st.mat.zw * cubic.p2.y + st.translate;
+            path_cubic.p3 = st.mat.xy * cubic.p3.x + st.mat.zw * cubic.p3.y + st.translate;
+            path_cubic.path_ix = st.path_count;
+            if (tag == Element_StrokeCubic) {
+                path_cubic.stroke = get_linewidth(st);
+            } else {
+                path_cubic.stroke = vec2(0.0);
+            }
+            // We do encoding a bit by hand to minimize divergence. Another approach
+            // would be to have a fill/stroke bool.
+            path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size);
+            out_tag = tag == Element_FillCubic ? PathSeg_FillCubic : PathSeg_StrokeCubic;
+            write_mem(conf.pathseg_alloc, path_out_ref.offset >> 2, out_tag);
+            PathStrokeCubic_write(conf.pathseg_alloc, PathStrokeCubicRef(path_out_ref.offset + 4), path_cubic);
+            break;
+        case Element_Stroke:
+            Stroke stroke = Element_Stroke_read(this_ref);
+            AnnoStroke anno_stroke;
+            anno_stroke.rgba_color = stroke.rgba_color;
+            vec2 lw = get_linewidth(st);
+            anno_stroke.bbox = st.bbox + vec4(-lw, lw);
+            anno_stroke.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z));
+            AnnotatedRef out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
+            Annotated_Stroke_write(conf.anno_alloc, out_ref, anno_stroke);
+            break;
+        case Element_Fill:
+            Fill fill = Element_Fill_read(this_ref);
+            AnnoFill anno_fill;
+            anno_fill.rgba_color = fill.rgba_color;
+            anno_fill.bbox = st.bbox;
+            out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
+            Annotated_Fill_write(conf.anno_alloc, out_ref, anno_fill);
+            break;
+        case Element_FillTexture:
+            FillTexture fill_tex = Element_FillTexture_read(this_ref);
+            AnnoFillTexture anno_fill_tex;
+            anno_fill_tex.uv_bounds = fill_tex.uv_bounds;
+            anno_fill_tex.bbox = st.bbox;
+            anno_fill_tex.mat = st.mat;
+            anno_fill_tex.translate = st.translate;
+            out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
+            Annotated_FillTexture_write(conf.anno_alloc, out_ref, anno_fill_tex);
+            break;
+        case Element_BeginClip:
+            Clip begin_clip = Element_BeginClip_read(this_ref);
+            // This is the absolute bbox, it's been transformed during encoding, so it
+            // is copied through without applying the current transform.
+            AnnoClip anno_begin_clip = AnnoClip(begin_clip.bbox);
+            out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
+            Annotated_BeginClip_write(conf.anno_alloc, out_ref, anno_begin_clip);
+            break;
+        case Element_EndClip:
+            Clip end_clip = Element_EndClip_read(this_ref);
+            // This bbox is expected to be the same as the begin one.
+            AnnoClip anno_end_clip = AnnoClip(end_clip.bbox);
+            out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
+            Annotated_EndClip_write(conf.anno_alloc, out_ref, anno_end_clip);
+            break;
+        }
+    }
+}
diff --git a/gpu/shaders/kernel4.comp b/gpu/shaders/kernel4.comp
new file mode 100644
index 00000000..d5b44d1f
--- /dev/null
+++ b/gpu/shaders/kernel4.comp
@@ -0,0 +1,302 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// This is "kernel 4" in a 4-kernel pipeline. It renders the commands
+// in the per-tile command list to an image.
+
+// Right now, this kernel stores the image in a buffer, but a better
+// plan is to use a texture. This is because of limited support.
+
+#version 450
+#extension GL_GOOGLE_include_directive : enable
+#ifdef VULKAN
+#extension GL_EXT_nonuniform_qualifier : enable
+#endif
+
+#include "mem.h"
+#include "setup.h"
+
+#define CHUNK 8
+#define CHUNK_DY (TILE_HEIGHT_PX / CHUNK)
+layout(local_size_x = TILE_WIDTH_PX, local_size_y = CHUNK_DY) in;
+
+layout(set = 0, binding = 1) readonly buffer ConfigBuf {
+    Config conf;
+};
+
+layout(rgba8, set = 0, binding = 2) uniform writeonly image2D image;
+
+#ifdef VULKAN
+layout(set = 0, binding = 3) uniform sampler2D textures[];
+#else
+layout(set = 0, binding = 3) uniform sampler2D atlas;
+#endif
+
+#include "ptcl.h"
+#include "tile.h"
+
+#define BLEND_STACK_SIZE 4
+
+// Layout of a clip scratch frame:
+// Each frame is WIDTH * HEIGHT 32-bit words, then a link reference.
+
+// Link offset and frame size in 32-bit words.
+#define CLIP_LINK_OFFSET (TILE_WIDTH_PX * TILE_HEIGHT_PX)
+#define CLIP_BUF_SIZE (CLIP_LINK_OFFSET + 1)
+
+shared MallocResult sh_clip_alloc;
+
+// Allocate a scratch buffer for clipping.
+MallocResult alloc_clip_buf(uint link) {
+    if (gl_LocalInvocationID.xy == uvec2(0)) { // a single invocation allocates for the workgroup
+        MallocResult res = malloc(CLIP_BUF_SIZE * 4); // frame size is in 32-bit words
+        if (!res.failed) {
+            write_mem(res.alloc, (res.alloc.offset >> 2) + CLIP_LINK_OFFSET, link);
+        }
+        sh_clip_alloc = res;
+    }
+    barrier(); // the shared result is read by every invocation only after this point
+    return sh_clip_alloc;
+}
+
+// Calculate coverage based on backdrop + coverage of each line segment
+float[CHUNK] computeArea(vec2 xy, int backdrop, uint tile_ref) {
+    // Every row starts from the backdrop; per-segment contributions are added below.
+    float coverage[CHUNK];
+    for (uint row = 0; row < CHUNK; row++) coverage[row] = float(backdrop);
+    TileSegRef seg_ref = TileSegRef(tile_ref);
+    do {
+        TileSeg seg = TileSeg_read(new_alloc(seg_ref.offset, TileSeg_size), seg_ref);
+        for (uint row = 0; row < CHUNK; row++) {
+            vec2 pixel = vec2(xy.x, xy.y + float(row * CHUNK_DY));
+            vec2 p_start = seg.origin - pixel;
+            vec2 p_end = p_start + seg.vector;
+            vec2 y_span = clamp(vec2(p_start.y, p_end.y), 0.0, 1.0);
+            if (y_span.x != y_span.y) {
+                vec2 t = (y_span - p_start.y) / seg.vector.y;
+                vec2 xs = vec2(mix(p_start.x, p_end.x, t.x), mix(p_start.x, p_end.x, t.y));
+                float x_lo = min(min(xs.x, xs.y), 1.0) - 1e-6;
+                float x_hi = max(xs.x, xs.y);
+                float hi_capped = min(x_hi, 1.0);
+                float hi_clamped = max(hi_capped, 0.0);
+                float lo_clamped = max(x_lo, 0.0);
+                float frac = (hi_capped + 0.5 * (lo_clamped * lo_clamped - hi_clamped * hi_clamped) - x_lo) / (x_hi - x_lo);
+                coverage[row] += frac * (y_span.x - y_span.y);
+            }
+            coverage[row] += sign(seg.vector.x) * clamp(pixel.y - seg.y_edge + 1.0, 0.0, 1.0);
+        }
+        seg_ref = seg.next;
+    } while (seg_ref.offset != 0);
+    for (uint row = 0; row < CHUNK; row++) {
+        coverage[row] = min(abs(coverage[row]), 1.0);
+    }
+    return coverage;
+}
+
+vec4[CHUNK] fillTexture(vec2 xy, CmdSolidTexture cmd_tex) {
+    // uv_bounds.x / .y each unpack to a vec2 used as the lower / upper UV clamp.
+    vec2 uv_lo = unpackUnorm2x16(cmd_tex.uv_bounds.x);
+    vec2 uv_hi = unpackUnorm2x16(cmd_tex.uv_bounds.y);
+    vec4 texels[CHUNK];
+    for (uint row = 0; row < CHUNK; row++) {
+        // Pixel center of this row, mapped through the affine transform into UV space.
+        vec2 center = vec2(xy.x, xy.y + float(row * CHUNK_DY)) + vec2(0.5);
+        vec2 uv = cmd_tex.mat.xy * center.x + cmd_tex.mat.zw * center.y + cmd_tex.translate;
+        uv = clamp(uv, uv_lo, uv_hi);
+#ifdef VULKAN
+        texels[row] = textureGrad(textures[0], uv, cmd_tex.mat.xy, cmd_tex.mat.zw);
+#else
+        texels[row] = textureGrad(atlas, uv, cmd_tex.mat.xy, cmd_tex.mat.zw);
+#endif
+    }
+    return texels;
+}
+
+vec3 tosRGB(vec3 rgb) {
+    vec3 gamma_part = 1.055 * pow(rgb, vec3(0.41666)) - 0.055; // chosen at or above the cutoff
+    vec3 linear_part = 12.92 * rgb; // chosen below the cutoff
+    bvec3 use_gamma = greaterThanEqual(rgb, vec3(0.0031308));
+    return mix(linear_part, gamma_part, use_gamma);
+}
+
+// unpacksRGB unpacks a color in the sRGB color space to a vec4 in the linear color
+// space.
+vec4 unpacksRGB(uint srgba) {
+    // The .wzyx swizzle reverses the component order produced by unpackUnorm4x8.
+    vec4 encoded = unpackUnorm4x8(srgba).wzyx;
+    // Formula from EXT_sRGB; alpha passes through unchanged.
+    bvec3 use_gamma = greaterThanEqual(encoded.rgb, vec3(0.04045));
+    vec3 linear_part = encoded.rgb / 12.92;
+    vec3 gamma_part = pow((encoded.rgb + 0.055) / 1.055, vec3(2.4));
+    vec3 decoded = mix(linear_part, gamma_part, use_gamma);
+    return vec4(decoded, encoded.a);
+}
+
+// packsRGB packs a color in the linear color space into its 8-bit sRGB equivalent.
+uint packsRGB(vec4 rgba) {
+    vec4 encoded = vec4(tosRGB(rgba.rgb), rgba.a); // only the color channels are sRGB-encoded
+    return packUnorm4x8(encoded.wzyx);
+}
+
+void main() {
+    if (mem_error != NO_ERROR) {
+        return;
+    }
+
+    uint tile_ix = gl_WorkGroupID.y * conf.width_in_tiles + gl_WorkGroupID.x;
+    Alloc cmd_alloc = slice_mem(conf.ptcl_alloc, tile_ix * PTCL_INITIAL_ALLOC, PTCL_INITIAL_ALLOC);
+    CmdRef cmd_ref = CmdRef(cmd_alloc.offset);
+
+    uvec2 xy_uint = uvec2(gl_GlobalInvocationID.x, gl_LocalInvocationID.y + TILE_HEIGHT_PX * gl_WorkGroupID.y);
+    vec2 xy = vec2(xy_uint);
+    vec3 rgb[CHUNK];
+    float mask[CHUNK];
+    uint blend_stack[BLEND_STACK_SIZE][CHUNK];
+    uint blend_spill = 0;
+    uint blend_sp = 0;
+    Alloc clip_tos = new_alloc(0, 0);
+    for (uint i = 0; i < CHUNK; i++) {
+        rgb[i] = vec3(0.5);
+#ifdef VULKAN
+        if (xy_uint.x < 1024 && xy_uint.y < 1024) {
+            rgb[i] = texture(textures[gl_WorkGroupID.x / 64], vec2(xy_uint.x, xy_uint.y + CHUNK_DY * i) / 1024.0).rgb;
+        }
+#endif
+        mask[i] = 1.0;
+    }
+
+    while (true) {
+        uint tag = Cmd_tag(cmd_alloc, cmd_ref);
+        if (tag == Cmd_End) {
+            break;
+        }
+        switch (tag) {
+        case Cmd_Circle:
+            CmdCircle circle = Cmd_Circle_read(cmd_alloc, cmd_ref);
+            vec4 fg_rgba = unpacksRGB(circle.rgba_color);
+            for (uint i = 0; i < CHUNK; i++) {
+                float dy = float(i * CHUNK_DY);
+                float r = length(vec2(xy.x, xy.y + dy) + vec2(0.5, 0.5) - circle.center.xy);
+                float alpha = clamp(0.5 + circle.radius - r, 0.0, 1.0);
+                rgb[i] = mix(rgb[i], fg_rgba.rgb, mask[i] * alpha * fg_rgba.a);
+            }
+            break;
+        case Cmd_Stroke:
+            // Calculate distance field from all the line segments in this tile.
+            CmdStroke stroke = Cmd_Stroke_read(cmd_alloc, cmd_ref);
+            float df[CHUNK];
+            for (uint k = 0; k < CHUNK; k++) df[k] = 1e9;
+            TileSegRef tile_seg_ref = TileSegRef(stroke.tile_ref);
+            do {
+                TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size), tile_seg_ref);
+                vec2 line_vec = seg.vector;
+                for (uint k = 0; k < CHUNK; k++) {
+                    vec2 dpos = xy + vec2(0.5, 0.5) - seg.origin;
+                    dpos.y += float(k * CHUNK_DY);
+                    float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0);
+                    df[k] = min(df[k], length(line_vec * t - dpos));
+                }
+                tile_seg_ref = seg.next;
+            } while (tile_seg_ref.offset != 0);
+            fg_rgba = unpacksRGB(stroke.rgba_color);
+            for (uint k = 0; k < CHUNK; k++) {
+                float alpha = clamp(stroke.half_width + 0.5 - df[k], 0.0, 1.0);
+                rgb[k] = mix(rgb[k], fg_rgba.rgb, mask[k] * alpha * fg_rgba.a);
+            }
+            break;
+        case Cmd_Fill:
+            CmdFill fill = Cmd_Fill_read(cmd_alloc, cmd_ref);
+            float area[CHUNK];
+            area = computeArea(xy, fill.backdrop, fill.tile_ref);
+            fg_rgba = unpacksRGB(fill.rgba_color);
+            for (uint k = 0; k < CHUNK; k++) {
+                rgb[k] = mix(rgb[k], fg_rgba.rgb, mask[k] * area[k] * fg_rgba.a);
+            }
+            break;
+        case Cmd_FillTexture:
+            CmdFillTexture fill_tex = Cmd_FillTexture_read(cmd_alloc, cmd_ref);
+            area = computeArea(xy, fill_tex.backdrop, fill_tex.tile_ref);
+            vec4 rgba[CHUNK] = fillTexture(xy, CmdSolidTexture(fill_tex.mat, fill_tex.translate, fill_tex.uv_bounds));
+            for (uint k = 0; k < CHUNK; k++) {
+                rgb[k] = mix(rgb[k], rgba[k].rgb, mask[k] * area[k] * rgba[k].a);
+            }
+            break;
+        case Cmd_BeginClip:
+        case Cmd_BeginSolidClip:
+            uint blend_slot = blend_sp % BLEND_STACK_SIZE;
+            if (blend_sp == blend_spill + BLEND_STACK_SIZE) {
+                // spill to scratch buffer
+                MallocResult m = alloc_clip_buf(clip_tos.offset);
+                if (m.failed) {
+                    return;
+                }
+                clip_tos = m.alloc;
+                uint base_ix = (clip_tos.offset >> 2) + gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y;
+                for (uint k = 0; k < CHUNK; k++) {
+                    write_mem(clip_tos, base_ix + k * TILE_WIDTH_PX * CHUNK_DY, blend_stack[blend_slot][k]);
+                }
+                blend_spill++;
+            }
+            if (tag == Cmd_BeginClip) {
+                CmdBeginClip begin_clip = Cmd_BeginClip_read(cmd_alloc, cmd_ref);
+                area = computeArea(xy, begin_clip.backdrop, begin_clip.tile_ref);
+                for (uint k = 0; k < CHUNK; k++) {
+                    blend_stack[blend_slot][k] = packsRGB(vec4(rgb[k], clamp(abs(area[k]), 0.0, 1.0)));
+                }
+            } else {
+                CmdBeginSolidClip begin_solid_clip = Cmd_BeginSolidClip_read(cmd_alloc, cmd_ref);
+                float solid_alpha = begin_solid_clip.alpha;
+                for (uint k = 0; k < CHUNK; k++) {
+                    blend_stack[blend_slot][k] = packsRGB(vec4(rgb[k], solid_alpha));
+                }
+            }
+            blend_sp++;
+            break;
+        case Cmd_EndClip:
+            CmdEndClip end_clip = Cmd_EndClip_read(cmd_alloc, cmd_ref);
+            blend_slot = (blend_sp - 1) % BLEND_STACK_SIZE;
+            if (blend_sp == blend_spill) {
+                uint base_ix = (clip_tos.offset >> 2) + gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y;
+                for (uint k = 0; k < CHUNK; k++) {
+                    blend_stack[blend_slot][k] = read_mem(clip_tos, base_ix + k * TILE_WIDTH_PX * CHUNK_DY);
+                }
+                clip_tos.offset = read_mem(clip_tos, (clip_tos.offset >> 2) + CLIP_LINK_OFFSET);
+                blend_spill--;
+            }
+            blend_sp--;
+            for (uint k = 0; k < CHUNK; k++) {
+                vec4 rgba = unpacksRGB(blend_stack[blend_slot][k]);
+                rgb[k] = mix(rgba.rgb, rgb[k], end_clip.alpha * rgba.a);
+            }
+            break;
+        case Cmd_Solid:
+            CmdSolid solid = Cmd_Solid_read(cmd_alloc, cmd_ref);
+            fg_rgba = unpacksRGB(solid.rgba_color);
+            for (uint k = 0; k < CHUNK; k++) {
+                rgb[k] = mix(rgb[k], fg_rgba.rgb, mask[k] * fg_rgba.a);
+            }
+            break;
+        case Cmd_SolidTexture:
+            CmdSolidTexture solid_tex = Cmd_SolidTexture_read(cmd_alloc, cmd_ref);
+            rgba = fillTexture(xy, solid_tex);
+            for (uint k = 0; k < CHUNK; k++) {
+                rgb[k] = mix(rgb[k], rgba[k].rgb, mask[k] * rgba[k].a);
+            }
+            break;
+        case Cmd_SolidMask:
+            CmdSolidMask solid_mask = Cmd_SolidMask_read(cmd_alloc, cmd_ref);
+            for (uint k = 0; k < CHUNK; k++) {
+                mask[k] = solid_mask.mask;
+            }
+            break;
+        case Cmd_Jump:
+            cmd_ref = CmdRef(Cmd_Jump_read(cmd_alloc, cmd_ref).new_ref);
+            cmd_alloc.offset = cmd_ref.offset;
+            continue;
+        }
+        cmd_ref.offset += Cmd_size;
+    }
+
+    for (uint i = 0; i < CHUNK; i++) {
+        imageStore(image, ivec2(xy_uint.x, xy_uint.y + CHUNK_DY * i), vec4(tosRGB(rgb[i]), 1.0));
+    }
+}
diff --git a/gpu/shaders/mem.h b/gpu/shaders/mem.h
new file mode 100644
index 00000000..bc851906
--- /dev/null
+++ b/gpu/shaders/mem.h
@@ -0,0 +1,120 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+layout(set = 0, binding = 0) buffer Memory { // shared heap used by malloc/read_mem/write_mem
+    // Byte offset into memory of the next allocation, initialized by the user.
+    uint mem_offset;
+    // mem_error tracks the status of memory accesses, initialized to NO_ERROR
+    // by the user. ERR_MALLOC_FAILED is reported for insufficient memory.
+    // If MEM_DEBUG is defined the following errors are also reported:
+    // - ERR_OUT_OF_BOUNDS for out of bounds reads and writes (see touch_mem).
+    // - ERR_UNALIGNED_ACCESS for offsets or sizes not a multiple of 4 bytes.
+    uint mem_error;
+    uint[] memory; // heap storage, addressed in 32-bit words
+};
+
+// Uncomment the line below to add the size field to Alloc and enable memory checks.
+// Note that the Config struct in setup.h grows size fields as well.
+//#define MEM_DEBUG
+
+#define NO_ERROR 0 // initial mem_error; errors are merged with atomicMax, so the highest code wins
+#define ERR_MALLOC_FAILED 1 // malloc ran past the end of memory
+#define ERR_OUT_OF_BOUNDS 2 // MEM_DEBUG only: access outside an Alloc
+#define ERR_UNALIGNED_ACCESS 3 // MEM_DEBUG only: offset or size not a multiple of 4
+
+#define Alloc_size 8 // NOTE(review): 8 bytes matches offset + size (the MEM_DEBUG layout) - confirm for the non-debug layout
+
+// Alloc represents a memory allocation inside the Memory buffer.
+struct Alloc {
+    // offset in bytes into memory; the bounds check in touch_mem uses offset/4 as a word index.
+    uint offset;
+#ifdef MEM_DEBUG
+    // size in bytes of the allocation; only present for the MEM_DEBUG checks.
+    uint size;
+#endif
+};
+
+struct MallocResult { // value returned by malloc
+    Alloc alloc; // the reserved range; filled in even when failed is true
+    // failed is true if the allocation overflowed memory (or, with MEM_DEBUG, if size is not a multiple of 4).
+    bool failed;
+};
+
+// new_alloc builds an Alloc from an offset and size that the caller derived
+// itself; size is only recorded when MEM_DEBUG is defined.
+Alloc new_alloc(uint offset, uint size) {
+#ifdef MEM_DEBUG
+    return Alloc(offset, size);
+#else
+    return Alloc(offset);
+#endif
+}
+
+// malloc reserves size bytes by atomically bumping mem_offset.
+// The result is marked failed (and mem_error updated) when the reservation
+// runs past the end of memory or, with MEM_DEBUG, when size is unaligned.
+MallocResult malloc(uint size) {
+    uint start = atomicAdd(mem_offset, size);
+    MallocResult res = MallocResult(new_alloc(start, size), false);
+    // memory.length() counts 32-bit words; offsets and sizes are bytes.
+    bool overflow = start + size > memory.length() * 4;
+    if (overflow) {
+        atomicMax(mem_error, ERR_MALLOC_FAILED);
+        res.failed = true;
+    }
+#ifdef MEM_DEBUG
+    else if ((size & 3) != 0) {
+        atomicMax(mem_error, ERR_UNALIGNED_ACCESS);
+        res.failed = true;
+    }
+#endif
+    return res;
+}
+
+// touch_mem reports whether the memory word at offset (a word index) may be accessed.
+// Without MEM_DEBUG it always returns true; with it, misses record ERR_OUT_OF_BOUNDS.
+bool touch_mem(Alloc alloc, uint offset) {
+#ifdef MEM_DEBUG
+    bool inside = offset >= alloc.offset/4 && offset < (alloc.offset + alloc.size)/4;
+    if (!inside) {
+        atomicMax(mem_error, ERR_OUT_OF_BOUNDS);
+        return false;
+    }
+#endif
+    return true;
+}
+
+// write_mem stores val in the memory word at offset.
+// Note that offset is a word index, not a byte offset; the store is
+// skipped when touch_mem rejects the access.
+void write_mem(Alloc alloc, uint offset, uint val) {
+    if (touch_mem(alloc, offset)) {
+        memory[offset] = val;
+    }
+}
+
+// read_mem loads the memory word at offset (a word index, not a byte
+// offset); it yields 0 when touch_mem rejects the access.
+uint read_mem(Alloc alloc, uint offset) {
+    uint word = 0;
+    if (touch_mem(alloc, offset)) {
+        word = memory[offset];
+    }
+    return word;
+}
+
+// slice_mem returns a sub-allocation inside another. Note that offset and size
+// are in bytes, with offset relative to a.offset.
+Alloc slice_mem(Alloc a, uint offset, uint size) {
+#ifdef MEM_DEBUG
+    if (((offset | size) & 3) != 0) {
+        atomicMax(mem_error, ERR_UNALIGNED_ACCESS);
+        return Alloc(0, 0);
+    }
+    // Slices reaching past the parent are sometimes requested but never
+    // written, so hand back an empty Alloc without flagging an error.
+    if (a.size < offset + size) {
+        return Alloc(0, 0);
+    }
+#endif
+    return new_alloc(a.offset + offset, size);
+}
diff --git a/gpu/shaders/path_coarse.comp b/gpu/shaders/path_coarse.comp
new file mode 100644
index 00000000..4f77ff9b
--- /dev/null
+++ b/gpu/shaders/path_coarse.comp
@@ -0,0 +1,282 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// Coarse rasterization of path segments.
+
+// Allocation and initialization of tiles for paths.
+
+#version 450
+#extension GL_GOOGLE_include_directive : enable
+
+#include "mem.h"
+#include "setup.h"
+
+#define LG_COARSE_WG 5 // log2 of the workgroup size
+#define COARSE_WG (1 << LG_COARSE_WG) // 32 invocations per workgroup
+
+layout(local_size_x = COARSE_WG, local_size_y = 1) in;
+
+layout(set = 0, binding = 1) readonly buffer ConfigBuf { // read-only configuration consulted by main
+    Config conf;
+};
+
+#include "pathseg.h"
+#include "tile.h"
+
+// scale factors useful for converting pixel coordinates to tile coordinates
+#define SX (1.0 / float(TILE_WIDTH_PX))
+#define SY (1.0 / float(TILE_HEIGHT_PX))
+
+#define ACCURACY 0.25 // total flattening tolerance (presumably in pixels - confirm)
+#define Q_ACCURACY (ACCURACY * 0.1) // share used when choosing n_quads (via MAX_HYPOT2)
+#define REM_ACCURACY (ACCURACY - Q_ACCURACY) // remainder; main passes sqrt(REM_ACCURACY) to estimate_subdiv
+#define MAX_HYPOT2 (432.0 * Q_ACCURACY * Q_ACCURACY) // main divides the squared cubic error term by this to get n_quads
+
+vec2 eval_quad(vec2 p0, vec2 p1, vec2 p2, float t) { // point at parameter t on the quadratic Bezier p0, p1, p2
+    float u = 1.0 - t;
+    return (u * u) * p0 + t * ((u * 2.0) * p1 + t * p2);
+}
+
+vec2 eval_cubic(vec2 p0, vec2 p1, vec2 p2, vec2 p3, float t) { // point at parameter t on the cubic Bezier p0..p3
+    float u = 1.0 - t;
+    return (u * u * u) * p0 + t * ((u * u * 3.0) * p1 + t * ((u * 3.0) * p2 + t * p3));
+}
+
+struct SubdivResult { // value returned by estimate_subdiv
+    float val; // subdivision estimate; main sums it over the quads to size the segment count
+    float a0; // approx_parabola_integral(x0)
+    float a2; // approx_parabola_integral(x2)
+};
+
+/// An approximation to $\int (1 + 4x^2) ^ -0.25 dx$.
+/// This is used for flattening curves.
+#define D 0.67
+float approx_parabola_integral(float x) {
+    float radicand = 1.0 - D + (D * D * D * D + 0.25 * x * x);
+    return x * inversesqrt(sqrt(radicand));
+}
+
+/// Approximate inverse of the parabola integral, used to map it back to a parameter.
+#define B 0.39
+float approx_parabola_inv_integral(float x) {
+    return sqrt(1.0 - B + (B * B + 0.25 * x * x)) * x;
+}
+
+SubdivResult estimate_subdiv(vec2 p0, vec2 p1, vec2 p2, float sqrt_tol) { // subdivision estimate for the quadratic p0, p1, p2
+    vec2 d01 = p1 - p0;
+    vec2 d12 = p2 - p1;
+    vec2 dd = d01 - d12; // difference of the two control-polygon legs
+    float cross = (p2.x - p0.x) * dd.y - (p2.y - p0.y) * dd.x; // 2D cross product of (p2 - p0) and dd
+    float x0 = (d01.x * dd.x + d01.y * dd.y) / cross; // dot(d01, dd) / cross
+    float x2 = (d12.x * dd.x + d12.y * dd.y) / cross; // dot(d12, dd) / cross
+    float scale = abs(cross / (length(dd) * (x2 - x0)));
+
+    float a0 = approx_parabola_integral(x0);
+    float a2 = approx_parabola_integral(x2);
+    float val = 0.0;
+    if (scale < 1e9) { // also false for inf/NaN (e.g. cross == 0), leaving val at 0
+        float da = abs(a2 - a0);
+        float sqrt_scale = sqrt(scale);
+        if (sign(x0) == sign(x2)) {
+            val = da * sqrt_scale;
+        } else {
+            float xmin = sqrt_tol / sqrt_scale; // x0 and x2 straddle zero: rescale using the tolerance
+            val = sqrt_tol * da / approx_parabola_integral(xmin);
+        }
+    }
+    return SubdivResult(val, a0, a2);
+}
+
+void main() { // one invocation per path segment: flatten a cubic into lines and bin them into tiles
+    if (mem_error != NO_ERROR) { // skip all work once any error has been recorded
+        return;
+    }
+
+    uint element_ix = gl_GlobalInvocationID.x;
+    PathSegRef ref = PathSegRef(conf.pathseg_alloc.offset + element_ix * PathSeg_size);
+
+    uint tag = PathSeg_Nop; // invocations past n_pathseg keep Nop and fall through the switch
+    if (element_ix < conf.n_pathseg) {
+        tag = PathSeg_tag(conf.pathseg_alloc, ref);
+    }
+    switch (tag) {
+    case PathSeg_FillCubic:
+    case PathSeg_StrokeCubic:
+        PathStrokeCubic cubic = PathSeg_StrokeCubic_read(conf.pathseg_alloc, ref); // NOTE(review): also used for FillCubic, so cubic.stroke comes from the two words after path_ix - presumably zero for fills; confirm
+        vec2 err_v = 3.0 * (cubic.p2 - cubic.p1) + cubic.p0 - cubic.p3;
+        float err = err_v.x * err_v.x + err_v.y * err_v.y; // squared length of err_v
+        // The number of quadratics.
+        uint n_quads = max(uint(ceil(pow(err * (1.0 / MAX_HYPOT2), 1.0 / 6.0))), 1);
+        // Iterate over quadratics and tote up the estimated number of segments.
+        float val = 0.0;
+        vec2 qp0 = cubic.p0;
+        float step = 1.0 / float(n_quads);
+        for (uint i = 0; i < n_quads; i++) {
+            float t = float(i + 1) * step;
+            vec2 qp2 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t);
+            vec2 qp1 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t - 0.5 * step);
+            qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2); // control point derived from the cubic's value at the interval midpoint
+            SubdivResult params = estimate_subdiv(qp0, qp1, qp2, sqrt(REM_ACCURACY));
+            val += params.val;
+
+            qp0 = qp2;
+        }
+        uint n = max(uint(ceil(val * 0.5 / sqrt(REM_ACCURACY))), 1); // total number of line segments to emit
+
+        uint path_ix = cubic.path_ix;
+        Path path = Path_read(conf.tile_alloc, PathRef(conf.tile_alloc.offset + path_ix * Path_size));
+        Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size); // covers the path's bbox-sized tile array
+        ivec4 bbox = ivec4(path.bbox);
+        vec2 p0 = cubic.p0;
+        qp0 = cubic.p0;
+        float v_step = val / float(n);
+        int n_out = 1; // 1-based index of the next line segment to emit
+        float val_sum = 0.0;
+        for (uint i = 0; i < n_quads; i++) { // second pass: same quads as above, now emitting segments
+            float t = float(i + 1) * step;
+            vec2 qp2 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t);
+            vec2 qp1 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t - 0.5 * step);
+            qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2);
+            SubdivResult params = estimate_subdiv(qp0, qp1, qp2, sqrt(REM_ACCURACY));
+            float u0 = approx_parabola_inv_integral(params.a0);
+            float u2 = approx_parabola_inv_integral(params.a2);
+            float uscale = 1.0 / (u2 - u0);
+            float target = float(n_out) * v_step;
+            while (n_out == n || target < val_sum + params.val) { // n_out == n forces the last segment, which ends exactly at cubic.p3
+                vec2 p1;
+                if (n_out == n) {
+                    p1 = cubic.p3;
+                } else {
+                    float u = (target - val_sum) / params.val;
+                    float a = mix(params.a0, params.a2, u);
+                    float au = approx_parabola_inv_integral(a);
+                    float t = (au - u0) * uscale;
+                    p1 = eval_quad(qp0, qp1, qp2, t);
+                }
+
+                // Output line segment from p0 to p1.
+
+                // Bounding box of element in pixel coordinates, grown by the stroke extent.
+                float xmin = min(p0.x, p1.x) - cubic.stroke.x;
+                float xmax = max(p0.x, p1.x) + cubic.stroke.x;
+                float ymin = min(p0.y, p1.y) - cubic.stroke.y;
+                float ymax = max(p0.y, p1.y) + cubic.stroke.y;
+                float dx = p1.x - p0.x;
+                float dy = p1.y - p0.y;
+                // Set up for per-scanline coverage formula, below.
+                float invslope = abs(dy) < 1e-9 ? 1e9 : dx / dy;
+                float c = (cubic.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + cubic.stroke.y)) * SX;
+                float b = invslope; // Note: assumes square tiles, otherwise scale.
+                float a = (p0.x - (p0.y - 0.5 * float(TILE_HEIGHT_PX)) * b) * SX;
+
+                int x0 = int(floor(xmin * SX));
+                int x1 = int(floor(xmax * SX) + 1);
+                int y0 = int(floor(ymin * SY));
+                int y1 = int(floor(ymax * SY) + 1);
+
+                x0 = clamp(x0, bbox.x, bbox.z); // restrict the tile range to the path's bbox
+                y0 = clamp(y0, bbox.y, bbox.w);
+                x1 = clamp(x1, bbox.x, bbox.z);
+                y1 = clamp(y1, bbox.y, bbox.w);
+                float xc = a + b * float(y0);
+                int stride = bbox.z - bbox.x;
+                int base = (y0 - bbox.y) * stride - bbox.x; // base + x is the index of tile (x, y0) within the path's tile array
+                // TODO: can be tighter, use c to bound width
+                uint n_tile_alloc = uint((x1 - x0) * (y1 - y0));
+                // Consider using subgroups to aggregate atomic add.
+                MallocResult tile_alloc = malloc(n_tile_alloc * TileSeg_size); // one TileSeg slot per tile in the clamped bounding box
+                if (tile_alloc.failed) {
+                    return;
+                }
+                uint tile_offset = tile_alloc.alloc.offset;
+
+                TileSeg tile_seg;
+
+                int xray = int(floor(p0.x*SX));
+                int last_xray = int(floor(p1.x*SX));
+                if (p0.y > p1.y) { // make xray belong to the endpoint with the smaller y
+                    int tmp = xray;
+                    xray = last_xray;
+                    last_xray = tmp;
+                }
+                for (int y = y0; y < y1; y++) {
+                    float tile_y0 = float(y * TILE_HEIGHT_PX);
+                    int xbackdrop = max(xray + 1, bbox.x);
+                    if (tag == PathSeg_FillCubic && min(p0.y, p1.y) < tile_y0 && xbackdrop < bbox.z) {
+                        int backdrop = p1.y < p0.y ? 1 : -1;
+                        TileRef tile_ref = Tile_index(path.tiles, uint(base + xbackdrop));
+                        uint tile_el = tile_ref.offset >> 2;
+                        if (touch_mem(path_alloc, tile_el + 1)) {
+                            atomicAdd(memory[tile_el + 1], backdrop); // accumulate into the second word of the Tile record
+                        }
+                    }
+
+                    // next_xray is the xray for the next scanline; the line segment intersects
+                    // all tiles between xray and next_xray.
+                    int next_xray = last_xray;
+                    if (y < y1 - 1) {
+                        float tile_y1 = float((y + 1) * TILE_HEIGHT_PX);
+                        float x_edge = mix(p0.x, p1.x, (tile_y1 - p0.y) / dy);
+                        next_xray = int(floor(x_edge*SX));
+                    }
+
+                    int min_xray = min(xray, next_xray);
+                    int max_xray = max(xray, next_xray);
+                    int xx0 = min(int(floor(xc - c)), min_xray);
+                    int xx1 = max(int(ceil(xc + c)), max_xray + 1);
+                    xx0 = clamp(xx0, x0, x1);
+                    xx1 = clamp(xx1, x0, x1);
+
+                    for (int x = xx0; x < xx1; x++) {
+                        float tile_x0 = float(x * TILE_WIDTH_PX);
+                        TileRef tile_ref = Tile_index(TileRef(path.tiles.offset), uint(base + x));
+                        uint tile_el = tile_ref.offset >> 2;
+                        uint old = 0;
+                        if (touch_mem(path_alloc, tile_el)) {
+                            old = atomicExchange(memory[tile_el], tile_offset); // push: the Tile's first word becomes the new list head
+                        }
+                        tile_seg.origin = p0;
+                        tile_seg.vector = p1 - p0;
+                        float y_edge = 0.0;
+                        if (tag == PathSeg_FillCubic) {
+                            y_edge = mix(p0.y, p1.y, (tile_x0 - p0.x) / dx);
+                            if (min(p0.x, p1.x) < tile_x0) { // clip the segment at the tile's left edge
+                                vec2 p = vec2(tile_x0, y_edge);
+                                if (p0.x > p1.x) {
+                                    tile_seg.vector = p - p0;
+                                } else {
+                                    tile_seg.origin = p;
+                                    tile_seg.vector = p1 - p;
+                                }
+                                // kernel4 uses sign(vector.x) for the sign of the intersection backdrop.
+                                // Nudge zeroes towards the intended sign.
+                                if (tile_seg.vector.x == 0) {
+                                    tile_seg.vector.x = sign(p1.x - p0.x)*1e-9;
+                                }
+                            }
+                            if (x <= min_xray || max_xray < x) {
+                                // Reject inconsistent intersections.
+                                y_edge = 1e9;
+                            }
+                        }
+                        tile_seg.y_edge = y_edge;
+                        tile_seg.next.offset = old; // link to the previous list head
+                        TileSeg_write(tile_alloc.alloc, TileSegRef(tile_offset), tile_seg);
+                        tile_offset += TileSeg_size;
+                    }
+                    xc += b;
+                    base += stride;
+                    xray = next_xray;
+                }
+
+                n_out += 1;
+                target += v_step;
+                p0 = p1; // the next segment starts where this one ended
+            }
+            val_sum += params.val;
+
+            qp0 = qp2;
+        }
+
+        break;
+    }
+}
diff --git a/gpu/shaders/pathseg.h b/gpu/shaders/pathseg.h
new file mode 100644
index 00000000..00509fbf
--- /dev/null
+++ b/gpu/shaders/pathseg.h
@@ -0,0 +1,255 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// Code auto-generated by piet-gpu-derive
+
+struct PathFillLineRef { // typed reference to an encoded PathFillLine
+    uint offset; // byte offset into memory; readers use offset >> 2 as the word index
+};
+
+struct PathStrokeLineRef { // typed reference to an encoded PathStrokeLine
+    uint offset; // byte offset into memory
+};
+
+struct PathFillCubicRef { // typed reference to an encoded PathFillCubic
+    uint offset; // byte offset into memory
+};
+
+struct PathStrokeCubicRef { // typed reference to an encoded PathStrokeCubic
+    uint offset; // byte offset into memory
+};
+
+struct PathSegRef { // typed reference to a tagged PathSeg element
+    uint offset; // byte offset of the tag word; the payload follows at offset + 4
+};
+
+struct PathFillLine {
+    vec2 p0; // encoded in words 0-1
+    vec2 p1; // encoded in words 2-3
+    uint path_ix; // encoded in word 4
+};
+
+#define PathFillLine_size 20 // encoded size in bytes (5 words)
+
+PathFillLineRef PathFillLine_index(PathFillLineRef ref, uint index) { // reference to element index of an array starting at ref
+    return PathFillLineRef(ref.offset + index * PathFillLine_size);
+}
+
+struct PathStrokeLine {
+    vec2 p0; // encoded in words 0-1
+    vec2 p1; // encoded in words 2-3
+    uint path_ix; // encoded in word 4
+    vec2 stroke; // encoded in words 5-6
+};
+
+#define PathStrokeLine_size 28 // encoded size in bytes (7 words)
+
+PathStrokeLineRef PathStrokeLine_index(PathStrokeLineRef ref, uint index) { // reference to element index of an array starting at ref
+    return PathStrokeLineRef(ref.offset + index * PathStrokeLine_size);
+}
+
+struct PathFillCubic {
+    vec2 p0; // encoded in words 0-1
+    vec2 p1; // encoded in words 2-3
+    vec2 p2; // encoded in words 4-5
+    vec2 p3; // encoded in words 6-7
+    uint path_ix; // encoded in word 8
+};
+
+#define PathFillCubic_size 36 // encoded size in bytes (9 words)
+
+PathFillCubicRef PathFillCubic_index(PathFillCubicRef ref, uint index) { // reference to element index of an array starting at ref
+    return PathFillCubicRef(ref.offset + index * PathFillCubic_size);
+}
+
+struct PathStrokeCubic {
+    vec2 p0; // encoded in words 0-1; first point passed to eval_cubic in path_coarse
+    vec2 p1; // encoded in words 2-3
+    vec2 p2; // encoded in words 4-5
+    vec2 p3; // encoded in words 6-7; end point of the last flattened segment
+    uint path_ix; // encoded in word 8; index of the Path record read by path_coarse
+    vec2 stroke; // encoded in words 9-10; grows the segment bounding box in x and y
+};
+
+#define PathStrokeCubic_size 44 // encoded size in bytes (11 words)
+
+PathStrokeCubicRef PathStrokeCubic_index(PathStrokeCubicRef ref, uint index) { // reference to element index of an array starting at ref
+    return PathStrokeCubicRef(ref.offset + index * PathStrokeCubic_size);
+}
+
+#define PathSeg_Nop 0 // tag values stored in the first word of a PathSeg
+#define PathSeg_FillLine 1
+#define PathSeg_StrokeLine 2
+#define PathSeg_FillCubic 3
+#define PathSeg_StrokeCubic 4
+#define PathSeg_size 48 // 4-byte tag plus the largest payload (PathStrokeCubic, 44 bytes)
+
+PathSegRef PathSeg_index(PathSegRef ref, uint index) { // reference to element index of an array starting at ref
+    return PathSegRef(ref.offset + index * PathSeg_size);
+}
+
+PathFillLine PathFillLine_read(Alloc a, PathFillLineRef ref) {
+    uint base = ref.offset >> 2; // byte offset -> word index
+    PathFillLine s;
+    // p0 occupies words 0-1, stored as float bit patterns.
+    s.p0.x = uintBitsToFloat(read_mem(a, base + 0));
+    s.p0.y = uintBitsToFloat(read_mem(a, base + 1));
+    // p1 occupies words 2-3.
+    s.p1.x = uintBitsToFloat(read_mem(a, base + 2));
+    s.p1.y = uintBitsToFloat(read_mem(a, base + 3));
+    // path_ix is a plain uint in word 4.
+    s.path_ix = read_mem(a, base + 4);
+    return s;
+}
+
+void PathFillLine_write(Alloc a, PathFillLineRef ref, PathFillLine s) {
+    uint base = ref.offset >> 2; // byte offset -> word index
+    write_mem(a, base + 0, floatBitsToUint(s.p0.x)); // p0: words 0-1
+    write_mem(a, base + 1, floatBitsToUint(s.p0.y));
+    write_mem(a, base + 2, floatBitsToUint(s.p1.x)); // p1: words 2-3
+    write_mem(a, base + 3, floatBitsToUint(s.p1.y));
+    write_mem(a, base + 4, s.path_ix); // path_ix: word 4
+}
+
+PathStrokeLine PathStrokeLine_read(Alloc a, PathStrokeLineRef ref) {
+    uint base = ref.offset >> 2; // byte offset -> word index
+    PathStrokeLine s;
+    // p0 occupies words 0-1, stored as float bit patterns.
+    s.p0.x = uintBitsToFloat(read_mem(a, base + 0));
+    s.p0.y = uintBitsToFloat(read_mem(a, base + 1));
+    // p1 occupies words 2-3.
+    s.p1.x = uintBitsToFloat(read_mem(a, base + 2));
+    s.p1.y = uintBitsToFloat(read_mem(a, base + 3));
+    // path_ix is a plain uint in word 4.
+    s.path_ix = read_mem(a, base + 4);
+    // stroke occupies words 5-6.
+    s.stroke.x = uintBitsToFloat(read_mem(a, base + 5));
+    s.stroke.y = uintBitsToFloat(read_mem(a, base + 6));
+    return s;
+}
+
+void PathStrokeLine_write(Alloc a, PathStrokeLineRef ref, PathStrokeLine s) {
+    uint base = ref.offset >> 2; // byte offset -> word index
+    write_mem(a, base + 0, floatBitsToUint(s.p0.x)); // p0: words 0-1
+    write_mem(a, base + 1, floatBitsToUint(s.p0.y));
+    write_mem(a, base + 2, floatBitsToUint(s.p1.x)); // p1: words 2-3
+    write_mem(a, base + 3, floatBitsToUint(s.p1.y));
+    write_mem(a, base + 4, s.path_ix); // path_ix: word 4
+    write_mem(a, base + 5, floatBitsToUint(s.stroke.x)); // stroke: words 5-6
+    write_mem(a, base + 6, floatBitsToUint(s.stroke.y));
+}
+
+PathFillCubic PathFillCubic_read(Alloc a, PathFillCubicRef ref) {
+    uint base = ref.offset >> 2; // byte offset -> word index
+    PathFillCubic s;
+    // p0 occupies words 0-1, stored as float bit patterns.
+    s.p0.x = uintBitsToFloat(read_mem(a, base + 0));
+    s.p0.y = uintBitsToFloat(read_mem(a, base + 1));
+    // p1 occupies words 2-3.
+    s.p1.x = uintBitsToFloat(read_mem(a, base + 2));
+    s.p1.y = uintBitsToFloat(read_mem(a, base + 3));
+    // p2 occupies words 4-5.
+    s.p2.x = uintBitsToFloat(read_mem(a, base + 4));
+    s.p2.y = uintBitsToFloat(read_mem(a, base + 5));
+    // p3 occupies words 6-7.
+    s.p3.x = uintBitsToFloat(read_mem(a, base + 6));
+    s.p3.y = uintBitsToFloat(read_mem(a, base + 7));
+    // path_ix is a plain uint in word 8.
+    s.path_ix = read_mem(a, base + 8);
+    return s;
+}
+
+void PathFillCubic_write(Alloc a, PathFillCubicRef ref, PathFillCubic s) {
+    uint base = ref.offset >> 2; // byte offset -> word index
+    write_mem(a, base + 0, floatBitsToUint(s.p0.x)); // p0: words 0-1
+    write_mem(a, base + 1, floatBitsToUint(s.p0.y));
+    write_mem(a, base + 2, floatBitsToUint(s.p1.x)); // p1: words 2-3
+    write_mem(a, base + 3, floatBitsToUint(s.p1.y));
+    write_mem(a, base + 4, floatBitsToUint(s.p2.x)); // p2: words 4-5
+    write_mem(a, base + 5, floatBitsToUint(s.p2.y));
+    write_mem(a, base + 6, floatBitsToUint(s.p3.x)); // p3: words 6-7
+    write_mem(a, base + 7, floatBitsToUint(s.p3.y));
+    write_mem(a, base + 8, s.path_ix); // path_ix: word 8
+}
+
+PathStrokeCubic PathStrokeCubic_read(Alloc a, PathStrokeCubicRef ref) {
+    uint base = ref.offset >> 2; // byte offset -> word index
+    PathStrokeCubic s;
+    // p0 occupies words 0-1, stored as float bit patterns.
+    s.p0.x = uintBitsToFloat(read_mem(a, base + 0));
+    s.p0.y = uintBitsToFloat(read_mem(a, base + 1));
+    // p1 occupies words 2-3.
+    s.p1.x = uintBitsToFloat(read_mem(a, base + 2));
+    s.p1.y = uintBitsToFloat(read_mem(a, base + 3));
+    // p2 occupies words 4-5.
+    s.p2.x = uintBitsToFloat(read_mem(a, base + 4));
+    s.p2.y = uintBitsToFloat(read_mem(a, base + 5));
+    // p3 occupies words 6-7.
+    s.p3.x = uintBitsToFloat(read_mem(a, base + 6));
+    s.p3.y = uintBitsToFloat(read_mem(a, base + 7));
+    // path_ix is a plain uint in word 8.
+    s.path_ix = read_mem(a, base + 8);
+    // stroke occupies words 9-10.
+    s.stroke.x = uintBitsToFloat(read_mem(a, base + 9));
+    s.stroke.y = uintBitsToFloat(read_mem(a, base + 10));
+    return s;
+}
+
+void PathStrokeCubic_write(Alloc a, PathStrokeCubicRef ref, PathStrokeCubic s) {
+    uint base = ref.offset >> 2; // byte offset -> word index
+    write_mem(a, base + 0, floatBitsToUint(s.p0.x)); // p0: words 0-1
+    write_mem(a, base + 1, floatBitsToUint(s.p0.y));
+    write_mem(a, base + 2, floatBitsToUint(s.p1.x)); // p1: words 2-3
+    write_mem(a, base + 3, floatBitsToUint(s.p1.y));
+    write_mem(a, base + 4, floatBitsToUint(s.p2.x)); // p2: words 4-5
+    write_mem(a, base + 5, floatBitsToUint(s.p2.y));
+    write_mem(a, base + 6, floatBitsToUint(s.p3.x)); // p3: words 6-7
+    write_mem(a, base + 7, floatBitsToUint(s.p3.y));
+    write_mem(a, base + 8, s.path_ix); // path_ix: word 8
+    write_mem(a, base + 9, floatBitsToUint(s.stroke.x)); // stroke: words 9-10
+    write_mem(a, base + 10, floatBitsToUint(s.stroke.y));
+}
+
+uint PathSeg_tag(Alloc a, PathSegRef ref) { // returns the tag word (one of the PathSeg_* values) at ref
+    return read_mem(a, ref.offset >> 2);
+}
+
+PathFillLine PathSeg_FillLine_read(Alloc a, PathSegRef ref) { // decodes the payload that follows the 4-byte tag
+    return PathFillLine_read(a, PathFillLineRef(ref.offset + 4));
+}
+
+PathStrokeLine PathSeg_StrokeLine_read(Alloc a, PathSegRef ref) { // decodes the payload that follows the 4-byte tag
+    return PathStrokeLine_read(a, PathStrokeLineRef(ref.offset + 4));
+}
+
+PathFillCubic PathSeg_FillCubic_read(Alloc a, PathSegRef ref) { // decodes the payload that follows the 4-byte tag
+    return PathFillCubic_read(a, PathFillCubicRef(ref.offset + 4));
+}
+
+PathStrokeCubic PathSeg_StrokeCubic_read(Alloc a, PathSegRef ref) { // decodes the payload that follows the 4-byte tag; does not check the tag
+    return PathStrokeCubic_read(a, PathStrokeCubicRef(ref.offset + 4));
+}
+
+void PathSeg_Nop_write(Alloc a, PathSegRef ref) { // writes only the tag word; the payload is left untouched
+    write_mem(a, ref.offset >> 2, PathSeg_Nop);
+}
+
+void PathSeg_FillLine_write(Alloc a, PathSegRef ref, PathFillLine s) { // tag word first, then the payload at offset + 4
+    write_mem(a, ref.offset >> 2, PathSeg_FillLine);
+    PathFillLine_write(a, PathFillLineRef(ref.offset + 4), s);
+}
+
+void PathSeg_StrokeLine_write(Alloc a, PathSegRef ref, PathStrokeLine s) { // tag word first, then the payload at offset + 4
+    write_mem(a, ref.offset >> 2, PathSeg_StrokeLine);
+    PathStrokeLine_write(a, PathStrokeLineRef(ref.offset + 4), s);
+}
+
+void PathSeg_FillCubic_write(Alloc a, PathSegRef ref, PathFillCubic s) { // tag word first, then the payload at offset + 4
+    write_mem(a, ref.offset >> 2, PathSeg_FillCubic);
+    PathFillCubic_write(a, PathFillCubicRef(ref.offset + 4), s);
+}
+
+void PathSeg_StrokeCubic_write(Alloc a, PathSegRef ref, PathStrokeCubic s) { // tag word first, then the payload at offset + 4
+    write_mem(a, ref.offset >> 2, PathSeg_StrokeCubic);
+    PathStrokeCubic_write(a, PathStrokeCubicRef(ref.offset + 4), s);
+}
+
diff --git a/gpu/shaders/ptcl.h b/gpu/shaders/ptcl.h
new file mode 100644
index 00000000..28a6d0ad
--- /dev/null
+++ b/gpu/shaders/ptcl.h
@@ -0,0 +1,549 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// Code auto-generated by piet-gpu-derive
+
+struct CmdCircleRef { // typed reference to a serialized CmdCircle
+    uint offset; // byte offset; the accessors below use offset >> 2 as the word index
+};
+
+struct CmdLineRef { // typed reference to a serialized CmdLine
+    uint offset;
+};
+
+struct CmdStrokeRef { // typed reference to a serialized CmdStroke
+    uint offset;
+};
+
+struct CmdFillRef { // typed reference to a serialized CmdFill
+    uint offset;
+};
+
+struct CmdFillTextureRef { // typed reference to a serialized CmdFillTexture
+    uint offset;
+};
+
+struct CmdBeginClipRef { // typed reference to a serialized CmdBeginClip
+    uint offset;
+};
+
+struct CmdBeginSolidClipRef { // typed reference to a serialized CmdBeginSolidClip
+    uint offset;
+};
+
+struct CmdEndClipRef { // typed reference to a serialized CmdEndClip
+    uint offset;
+};
+
+struct CmdSolidRef { // typed reference to a serialized CmdSolid
+    uint offset;
+};
+
+struct CmdSolidTextureRef { // typed reference to a serialized CmdSolidTexture
+    uint offset;
+};
+
+struct CmdSolidMaskRef { // typed reference to a serialized CmdSolidMask
+    uint offset;
+};
+
+struct CmdJumpRef { // typed reference to a serialized CmdJump
+    uint offset;
+};
+
+struct CmdRef { // typed reference to a tagged Cmd (tag word followed by a payload)
+    uint offset;
+};
+
+struct CmdCircle { // unpacked form; CmdCircle_read/_write map it to 4 words
+    vec2 center;
+    float radius;
+    uint rgba_color;
+};
+
+#define CmdCircle_size 16 // serialized size in bytes (4 words)
+
+CmdCircleRef CmdCircle_index(CmdCircleRef ref, uint index) { // ref advanced by `index` elements
+    return CmdCircleRef(index * CmdCircle_size + ref.offset);
+}
+
+struct CmdLine { // unpacked form; CmdLine_read/_write map it to 4 words
+    vec2 start;
+    vec2 end;
+};
+
+#define CmdLine_size 16 // serialized size in bytes (4 words)
+
+CmdLineRef CmdLine_index(CmdLineRef ref, uint index) { // ref advanced by `index` elements
+    return CmdLineRef(index * CmdLine_size + ref.offset);
+}
+
+struct CmdStroke { // unpacked form; CmdStroke_read/_write map it to 3 words
+    uint tile_ref;
+    float half_width;
+    uint rgba_color;
+};
+
+#define CmdStroke_size 12 // serialized size in bytes (3 words)
+
+CmdStrokeRef CmdStroke_index(CmdStrokeRef ref, uint index) { // ref advanced by `index` elements
+    return CmdStrokeRef(index * CmdStroke_size + ref.offset);
+}
+
+struct CmdFill { // unpacked form; CmdFill_read/_write map it to 3 words
+    uint tile_ref;
+    int backdrop; // stored as the uint bit pattern of this int
+    uint rgba_color;
+};
+
+#define CmdFill_size 12 // serialized size in bytes (3 words)
+
+CmdFillRef CmdFill_index(CmdFillRef ref, uint index) { // ref advanced by `index` elements
+    return CmdFillRef(index * CmdFill_size + ref.offset);
+}
+
+struct CmdFillTexture { // unpacked form; CmdFillTexture_read/_write map it to 10 words
+    uint tile_ref;
+    int backdrop; // stored as the uint bit pattern of this int
+    vec4 mat;
+    vec2 translate;
+    uvec2 uv_bounds;
+};
+
+#define CmdFillTexture_size 40 // serialized size in bytes (10 words)
+
+CmdFillTextureRef CmdFillTexture_index(CmdFillTextureRef ref, uint index) { // ref advanced by `index` elements
+    return CmdFillTextureRef(index * CmdFillTexture_size + ref.offset);
+}
+
+struct CmdBeginClip { // unpacked form; CmdBeginClip_read/_write map it to 2 words
+    uint tile_ref;
+    int backdrop; // stored as the uint bit pattern of this int
+};
+
+#define CmdBeginClip_size 8 // serialized size in bytes (2 words)
+
+CmdBeginClipRef CmdBeginClip_index(CmdBeginClipRef ref, uint index) { // ref advanced by `index` elements
+    return CmdBeginClipRef(index * CmdBeginClip_size + ref.offset);
+}
+
+struct CmdBeginSolidClip { // unpacked form; serialized as a single float word
+    float alpha;
+};
+
+#define CmdBeginSolidClip_size 4 // serialized size in bytes (1 word)
+
+CmdBeginSolidClipRef CmdBeginSolidClip_index(CmdBeginSolidClipRef ref, uint index) { // ref advanced by `index` elements
+    return CmdBeginSolidClipRef(index * CmdBeginSolidClip_size + ref.offset);
+}
+
+struct CmdEndClip { // unpacked form; serialized as a single float word
+    float alpha;
+};
+
+#define CmdEndClip_size 4 // serialized size in bytes (1 word)
+
+CmdEndClipRef CmdEndClip_index(CmdEndClipRef ref, uint index) { // ref advanced by `index` elements
+    return CmdEndClipRef(index * CmdEndClip_size + ref.offset);
+}
+
+struct CmdSolid { // unpacked form; serialized as a single uint word
+    uint rgba_color;
+};
+
+#define CmdSolid_size 4 // serialized size in bytes (1 word)
+
+CmdSolidRef CmdSolid_index(CmdSolidRef ref, uint index) { // ref advanced by `index` elements
+    return CmdSolidRef(index * CmdSolid_size + ref.offset);
+}
+
+struct CmdSolidTexture { // unpacked form; CmdSolidTexture_read/_write map it to 8 words
+    vec4 mat;
+    vec2 translate;
+    uvec2 uv_bounds;
+};
+
+#define CmdSolidTexture_size 32 // serialized size in bytes (8 words)
+
+CmdSolidTextureRef CmdSolidTexture_index(CmdSolidTextureRef ref, uint index) { // ref advanced by `index` elements
+    return CmdSolidTextureRef(index * CmdSolidTexture_size + ref.offset);
+}
+
+struct CmdSolidMask { // unpacked form; serialized as a single float word
+    float mask;
+};
+
+#define CmdSolidMask_size 4 // serialized size in bytes (1 word)
+
+CmdSolidMaskRef CmdSolidMask_index(CmdSolidMaskRef ref, uint index) { // ref advanced by `index` elements
+    return CmdSolidMaskRef(index * CmdSolidMask_size + ref.offset);
+}
+
+struct CmdJump { // unpacked form; serialized as a single uint word
+    uint new_ref;
+};
+
+#define CmdJump_size 4 // serialized size in bytes (1 word)
+
+CmdJumpRef CmdJump_index(CmdJumpRef ref, uint index) { // ref advanced by `index` elements
+    return CmdJumpRef(index * CmdJump_size + ref.offset);
+}
+
+#define Cmd_End 0 // Cmd_* values are the tags stored in a Cmd's first word
+#define Cmd_Circle 1
+#define Cmd_Line 2
+#define Cmd_Fill 3
+#define Cmd_FillTexture 4
+#define Cmd_BeginClip 5
+#define Cmd_BeginSolidClip 6
+#define Cmd_EndClip 7
+#define Cmd_Stroke 8
+#define Cmd_Solid 9
+#define Cmd_SolidMask 10
+#define Cmd_SolidTexture 11
+#define Cmd_Jump 12
+#define Cmd_size 44 // bytes: 4-byte tag + largest payload (CmdFillTexture, 40 bytes)
+
+CmdRef Cmd_index(CmdRef ref, uint index) { // ref advanced by `index` fixed-size Cmd slots
+    return CmdRef(index * Cmd_size + ref.offset);
+}
+
+CmdCircle CmdCircle_read(Alloc a, CmdCircleRef ref) { // unpack 4 words starting at byte offset ref.offset
+    uint base = ref.offset / 4u;
+    uint w0 = read_mem(a, base);
+    uint w1 = read_mem(a, base + 1);
+    uint w2 = read_mem(a, base + 2);
+    uint w3 = read_mem(a, base + 3);
+    CmdCircle v;
+    v.center = vec2(uintBitsToFloat(w0), uintBitsToFloat(w1));
+    v.radius = uintBitsToFloat(w2);
+    v.rgba_color = w3;
+    return v;
+}
+
+void CmdCircle_write(Alloc a, CmdCircleRef ref, CmdCircle s) { // pack s into 4 words; floats stored bit-exact
+    uint base = ref.offset / 4u;
+    write_mem(a, base, floatBitsToUint(s.center.x));
+    write_mem(a, base + 1, floatBitsToUint(s.center.y));
+    write_mem(a, base + 2, floatBitsToUint(s.radius));
+    write_mem(a, base + 3, s.rgba_color);
+}
+
+CmdLine CmdLine_read(Alloc a, CmdLineRef ref) { // unpack 4 words starting at byte offset ref.offset
+    uint base = ref.offset / 4u;
+    uint w0 = read_mem(a, base);
+    uint w1 = read_mem(a, base + 1);
+    uint w2 = read_mem(a, base + 2);
+    uint w3 = read_mem(a, base + 3);
+    CmdLine v;
+    v.start = vec2(uintBitsToFloat(w0), uintBitsToFloat(w1));
+    v.end = vec2(uintBitsToFloat(w2), uintBitsToFloat(w3));
+    return v;
+}
+
+void CmdLine_write(Alloc a, CmdLineRef ref, CmdLine s) { // pack s into 4 words; floats stored bit-exact
+    uint base = ref.offset / 4u;
+    write_mem(a, base, floatBitsToUint(s.start.x));
+    write_mem(a, base + 1, floatBitsToUint(s.start.y));
+    write_mem(a, base + 2, floatBitsToUint(s.end.x));
+    write_mem(a, base + 3, floatBitsToUint(s.end.y));
+}
+
+CmdStroke CmdStroke_read(Alloc a, CmdStrokeRef ref) { // unpack 3 words starting at byte offset ref.offset
+    uint base = ref.offset / 4u;
+    uint w0 = read_mem(a, base);
+    uint w1 = read_mem(a, base + 1);
+    uint w2 = read_mem(a, base + 2);
+    CmdStroke v;
+    v.tile_ref = w0;
+    v.half_width = uintBitsToFloat(w1);
+    v.rgba_color = w2;
+    return v;
+}
+
+void CmdStroke_write(Alloc a, CmdStrokeRef ref, CmdStroke s) { // pack s into 3 words
+    uint base = ref.offset / 4u;
+    write_mem(a, base, s.tile_ref);
+    write_mem(a, base + 1, floatBitsToUint(s.half_width));
+    write_mem(a, base + 2, s.rgba_color);
+}
+
+CmdFill CmdFill_read(Alloc a, CmdFillRef ref) { // unpack 3 words starting at byte offset ref.offset
+    uint base = ref.offset / 4u;
+    uint w0 = read_mem(a, base);
+    uint w1 = read_mem(a, base + 1);
+    uint w2 = read_mem(a, base + 2);
+    CmdFill v;
+    v.tile_ref = w0;
+    v.backdrop = int(w1);
+    v.rgba_color = w2;
+    return v;
+}
+
+void CmdFill_write(Alloc a, CmdFillRef ref, CmdFill s) { // pack s into 3 words
+    uint base = ref.offset / 4u;
+    write_mem(a, base, s.tile_ref);
+    write_mem(a, base + 1, uint(s.backdrop));
+    write_mem(a, base + 2, s.rgba_color);
+}
+
+CmdFillTexture CmdFillTexture_read(Alloc a, CmdFillTextureRef ref) { // unpack 10 words starting at byte offset ref.offset
+    uint base = ref.offset / 4u;
+    uint w0 = read_mem(a, base);
+    uint w1 = read_mem(a, base + 1);
+    uint w2 = read_mem(a, base + 2);
+    uint w3 = read_mem(a, base + 3);
+    uint w4 = read_mem(a, base + 4);
+    uint w5 = read_mem(a, base + 5);
+    uint w6 = read_mem(a, base + 6);
+    uint w7 = read_mem(a, base + 7);
+    uint w8 = read_mem(a, base + 8);
+    uint w9 = read_mem(a, base + 9);
+    CmdFillTexture v;
+    v.tile_ref = w0;
+    v.backdrop = int(w1);
+    v.mat = vec4(uintBitsToFloat(w2), uintBitsToFloat(w3), uintBitsToFloat(w4), uintBitsToFloat(w5));
+    v.translate = vec2(uintBitsToFloat(w6), uintBitsToFloat(w7));
+    v.uv_bounds = uvec2(w8, w9);
+    return v;
+}
+
+void CmdFillTexture_write(Alloc a, CmdFillTextureRef ref, CmdFillTexture s) { // pack s into 10 words
+    uint base = ref.offset / 4u;
+    write_mem(a, base, s.tile_ref);
+    write_mem(a, base + 1, uint(s.backdrop));
+    write_mem(a, base + 2, floatBitsToUint(s.mat.x));
+    write_mem(a, base + 3, floatBitsToUint(s.mat.y));
+    write_mem(a, base + 4, floatBitsToUint(s.mat.z));
+    write_mem(a, base + 5, floatBitsToUint(s.mat.w));
+    write_mem(a, base + 6, floatBitsToUint(s.translate.x));
+    write_mem(a, base + 7, floatBitsToUint(s.translate.y));
+    write_mem(a, base + 8, s.uv_bounds.x);
+    write_mem(a, base + 9, s.uv_bounds.y);
+}
+
+CmdBeginClip CmdBeginClip_read(Alloc a, CmdBeginClipRef ref) { // unpack 2 words starting at byte offset ref.offset
+    uint base = ref.offset / 4u;
+    uint w0 = read_mem(a, base);
+    uint w1 = read_mem(a, base + 1);
+    CmdBeginClip v;
+    v.tile_ref = w0;
+    v.backdrop = int(w1);
+    return v;
+}
+
+void CmdBeginClip_write(Alloc a, CmdBeginClipRef ref, CmdBeginClip s) { // pack s into 2 words
+    uint base = ref.offset / 4u;
+    write_mem(a, base, s.tile_ref);
+    write_mem(a, base + 1, uint(s.backdrop));
+}
+
+CmdBeginSolidClip CmdBeginSolidClip_read(Alloc a, CmdBeginSolidClipRef ref) { // unpack the single float word
+    uint base = ref.offset / 4u;
+    uint w0 = read_mem(a, base);
+    CmdBeginSolidClip v;
+    v.alpha = uintBitsToFloat(w0);
+    return v;
+}
+
+void CmdBeginSolidClip_write(Alloc a, CmdBeginSolidClipRef ref, CmdBeginSolidClip s) { // store alpha bit-exact in one word
+    uint base = ref.offset / 4u;
+    write_mem(a, base, floatBitsToUint(s.alpha));
+}
+
+CmdEndClip CmdEndClip_read(Alloc a, CmdEndClipRef ref) { // unpack the single float word
+    uint base = ref.offset / 4u;
+    uint w0 = read_mem(a, base);
+    CmdEndClip v;
+    v.alpha = uintBitsToFloat(w0);
+    return v;
+}
+
+void CmdEndClip_write(Alloc a, CmdEndClipRef ref, CmdEndClip s) { // store alpha bit-exact in one word
+    uint base = ref.offset / 4u;
+    write_mem(a, base, floatBitsToUint(s.alpha));
+}
+
+CmdSolid CmdSolid_read(Alloc a, CmdSolidRef ref) { // unpack the single uint word
+    uint base = ref.offset / 4u;
+    uint w0 = read_mem(a, base);
+    CmdSolid v;
+    v.rgba_color = w0;
+    return v;
+}
+
+void CmdSolid_write(Alloc a, CmdSolidRef ref, CmdSolid s) { // store rgba_color in one word
+    uint base = ref.offset / 4u;
+    write_mem(a, base, s.rgba_color);
+}
+
+CmdSolidTexture CmdSolidTexture_read(Alloc a, CmdSolidTextureRef ref) { // unpack 8 words starting at byte offset ref.offset
+    uint base = ref.offset / 4u;
+    uint w0 = read_mem(a, base);
+    uint w1 = read_mem(a, base + 1);
+    uint w2 = read_mem(a, base + 2);
+    uint w3 = read_mem(a, base + 3);
+    uint w4 = read_mem(a, base + 4);
+    uint w5 = read_mem(a, base + 5);
+    uint w6 = read_mem(a, base + 6);
+    uint w7 = read_mem(a, base + 7);
+    CmdSolidTexture v;
+    v.mat = vec4(uintBitsToFloat(w0), uintBitsToFloat(w1), uintBitsToFloat(w2), uintBitsToFloat(w3));
+    v.translate = vec2(uintBitsToFloat(w4), uintBitsToFloat(w5));
+    v.uv_bounds = uvec2(w6, w7);
+    return v;
+}
+
+void CmdSolidTexture_write(Alloc a, CmdSolidTextureRef ref, CmdSolidTexture s) { // pack s into 8 words
+    uint base = ref.offset / 4u;
+    write_mem(a, base, floatBitsToUint(s.mat.x));
+    write_mem(a, base + 1, floatBitsToUint(s.mat.y));
+    write_mem(a, base + 2, floatBitsToUint(s.mat.z));
+    write_mem(a, base + 3, floatBitsToUint(s.mat.w));
+    write_mem(a, base + 4, floatBitsToUint(s.translate.x));
+    write_mem(a, base + 5, floatBitsToUint(s.translate.y));
+    write_mem(a, base + 6, s.uv_bounds.x);
+    write_mem(a, base + 7, s.uv_bounds.y);
+}
+
+CmdSolidMask CmdSolidMask_read(Alloc a, CmdSolidMaskRef ref) { // unpack the single float word
+    uint base = ref.offset / 4u;
+    uint w0 = read_mem(a, base);
+    CmdSolidMask v;
+    v.mask = uintBitsToFloat(w0);
+    return v;
+}
+
+void CmdSolidMask_write(Alloc a, CmdSolidMaskRef ref, CmdSolidMask s) { // store mask bit-exact in one word
+    uint base = ref.offset / 4u;
+    write_mem(a, base, floatBitsToUint(s.mask));
+}
+
+CmdJump CmdJump_read(Alloc a, CmdJumpRef ref) { // unpack the single uint word
+    uint base = ref.offset / 4u;
+    uint w0 = read_mem(a, base);
+    CmdJump v;
+    v.new_ref = w0;
+    return v;
+}
+
+void CmdJump_write(Alloc a, CmdJumpRef ref, CmdJump s) { // store new_ref in one word
+    uint base = ref.offset / 4u;
+    write_mem(a, base, s.new_ref);
+}
+
+uint Cmd_tag(Alloc a, CmdRef ref) { // variant tag = first word of the Cmd
+    return read_mem(a, ref.offset / 4u);
+}
+
+CmdCircle Cmd_Circle_read(Alloc a, CmdRef ref) { // payload follows the 4-byte tag; tag is not checked
+    return CmdCircle_read(a, CmdCircleRef(4u + ref.offset));
+}
+
+CmdLine Cmd_Line_read(Alloc a, CmdRef ref) { // payload follows the 4-byte tag; tag is not checked
+    return CmdLine_read(a, CmdLineRef(4u + ref.offset));
+}
+
+CmdFill Cmd_Fill_read(Alloc a, CmdRef ref) { // payload follows the 4-byte tag; tag is not checked
+    return CmdFill_read(a, CmdFillRef(4u + ref.offset));
+}
+
+CmdFillTexture Cmd_FillTexture_read(Alloc a, CmdRef ref) { // payload follows the 4-byte tag; tag is not checked
+    return CmdFillTexture_read(a, CmdFillTextureRef(4u + ref.offset));
+}
+
+CmdBeginClip Cmd_BeginClip_read(Alloc a, CmdRef ref) { // payload follows the 4-byte tag; tag is not checked
+    return CmdBeginClip_read(a, CmdBeginClipRef(4u + ref.offset));
+}
+
+CmdBeginSolidClip Cmd_BeginSolidClip_read(Alloc a, CmdRef ref) { // payload follows the 4-byte tag; tag is not checked
+    return CmdBeginSolidClip_read(a, CmdBeginSolidClipRef(4u + ref.offset));
+}
+
+CmdEndClip Cmd_EndClip_read(Alloc a, CmdRef ref) { // payload follows the 4-byte tag; tag is not checked
+    return CmdEndClip_read(a, CmdEndClipRef(4u + ref.offset));
+}
+
+CmdStroke Cmd_Stroke_read(Alloc a, CmdRef ref) { // payload follows the 4-byte tag; tag is not checked
+    return CmdStroke_read(a, CmdStrokeRef(4u + ref.offset));
+}
+
+CmdSolid Cmd_Solid_read(Alloc a, CmdRef ref) { // payload follows the 4-byte tag; tag is not checked
+    return CmdSolid_read(a, CmdSolidRef(4u + ref.offset));
+}
+
+CmdSolidMask Cmd_SolidMask_read(Alloc a, CmdRef ref) { // payload follows the 4-byte tag; tag is not checked
+    return CmdSolidMask_read(a, CmdSolidMaskRef(4u + ref.offset));
+}
+
+CmdSolidTexture Cmd_SolidTexture_read(Alloc a, CmdRef ref) { // payload follows the 4-byte tag; tag is not checked
+    return CmdSolidTexture_read(a, CmdSolidTextureRef(4u + ref.offset));
+}
+
+CmdJump Cmd_Jump_read(Alloc a, CmdRef ref) { // payload follows the 4-byte tag; tag is not checked
+    return CmdJump_read(a, CmdJumpRef(4u + ref.offset));
+}
+
+void Cmd_End_write(Alloc a, CmdRef ref) { // writes only the tag word; End has no payload
+    write_mem(a, ref.offset / 4u, Cmd_End);
+}
+
+void Cmd_Circle_write(Alloc a, CmdRef ref, CmdCircle s) { // tag word first, then payload
+    write_mem(a, ref.offset / 4u, Cmd_Circle);
+    CmdCircle_write(a, CmdCircleRef(4u + ref.offset), s);
+}
+
+void Cmd_Line_write(Alloc a, CmdRef ref, CmdLine s) { // tag word first, then payload
+    write_mem(a, ref.offset / 4u, Cmd_Line);
+    CmdLine_write(a, CmdLineRef(4u + ref.offset), s);
+}
+
+void Cmd_Fill_write(Alloc a, CmdRef ref, CmdFill s) { // tag word first, then payload
+    write_mem(a, ref.offset / 4u, Cmd_Fill);
+    CmdFill_write(a, CmdFillRef(4u + ref.offset), s);
+}
+
+void Cmd_FillTexture_write(Alloc a, CmdRef ref, CmdFillTexture s) { // tag word first, then payload
+    write_mem(a, ref.offset / 4u, Cmd_FillTexture);
+    CmdFillTexture_write(a, CmdFillTextureRef(4u + ref.offset), s);
+}
+
+void Cmd_BeginClip_write(Alloc a, CmdRef ref, CmdBeginClip s) { // tag word first, then payload
+    write_mem(a, ref.offset / 4u, Cmd_BeginClip);
+    CmdBeginClip_write(a, CmdBeginClipRef(4u + ref.offset), s);
+}
+
+void Cmd_BeginSolidClip_write(Alloc a, CmdRef ref, CmdBeginSolidClip s) { // tag word first, then payload
+    write_mem(a, ref.offset / 4u, Cmd_BeginSolidClip);
+    CmdBeginSolidClip_write(a, CmdBeginSolidClipRef(4u + ref.offset), s);
+}
+
+void Cmd_EndClip_write(Alloc a, CmdRef ref, CmdEndClip s) { // tag word first, then payload
+    write_mem(a, ref.offset / 4u, Cmd_EndClip);
+    CmdEndClip_write(a, CmdEndClipRef(4u + ref.offset), s);
+}
+
+void Cmd_Stroke_write(Alloc a, CmdRef ref, CmdStroke s) { // tag word first, then payload
+    write_mem(a, ref.offset / 4u, Cmd_Stroke);
+    CmdStroke_write(a, CmdStrokeRef(4u + ref.offset), s);
+}
+
+void Cmd_Solid_write(Alloc a, CmdRef ref, CmdSolid s) { // tag word first, then payload
+    write_mem(a, ref.offset / 4u, Cmd_Solid);
+    CmdSolid_write(a, CmdSolidRef(4u + ref.offset), s);
+}
+
+void Cmd_SolidMask_write(Alloc a, CmdRef ref, CmdSolidMask s) { // tag word first, then payload
+    write_mem(a, ref.offset / 4u, Cmd_SolidMask);
+    CmdSolidMask_write(a, CmdSolidMaskRef(4u + ref.offset), s);
+}
+
+void Cmd_SolidTexture_write(Alloc a, CmdRef ref, CmdSolidTexture s) { // tag word first, then payload
+    write_mem(a, ref.offset / 4u, Cmd_SolidTexture);
+    CmdSolidTexture_write(a, CmdSolidTextureRef(4u + ref.offset), s);
+}
+
+void Cmd_Jump_write(Alloc a, CmdRef ref, CmdJump s) { // tag word first, then payload
+    write_mem(a, ref.offset / 4u, Cmd_Jump);
+    CmdJump_write(a, CmdJumpRef(4u + ref.offset), s);
+}
+
diff --git a/gpu/shaders/scene.h b/gpu/shaders/scene.h
new file mode 100644
index 00000000..2ecb6e5c
--- /dev/null
+++ b/gpu/shaders/scene.h
@@ -0,0 +1,320 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// Code auto-generated by piet-gpu-derive
+
+struct LineSegRef { // typed reference to a serialized LineSeg
+    uint offset; // byte offset; the readers below use offset >> 2 as the index into `scene`
+};
+
+struct QuadSegRef { // typed reference to a serialized QuadSeg
+    uint offset;
+};
+
+struct CubicSegRef { // typed reference to a serialized CubicSeg
+    uint offset;
+};
+
+struct FillRef { // typed reference to a serialized Fill
+    uint offset;
+};
+
+struct FillTextureRef { // typed reference to a serialized FillTexture
+    uint offset;
+};
+
+struct StrokeRef { // typed reference to a serialized Stroke
+    uint offset;
+};
+
+struct SetLineWidthRef { // typed reference to a serialized SetLineWidth
+    uint offset;
+};
+
+struct TransformRef { // typed reference to a serialized Transform
+    uint offset;
+};
+
+struct ClipRef { // typed reference to a serialized Clip
+    uint offset;
+};
+
+struct ElementRef { // typed reference to a tagged Element (tag word followed by a payload)
+    uint offset;
+};
+
+struct LineSeg { // unpacked form; LineSeg_read decodes it from 4 words
+    vec2 p0;
+    vec2 p1;
+};
+
+#define LineSeg_size 16 // serialized size in bytes (4 words)
+
+LineSegRef LineSeg_index(LineSegRef ref, uint index) { // ref advanced by `index` elements
+    return LineSegRef(index * LineSeg_size + ref.offset);
+}
+
+struct QuadSeg { // unpacked form; QuadSeg_read decodes it from 6 words
+    vec2 p0;
+    vec2 p1;
+    vec2 p2;
+};
+
+#define QuadSeg_size 24 // serialized size in bytes (6 words)
+
+QuadSegRef QuadSeg_index(QuadSegRef ref, uint index) { // ref advanced by `index` elements
+    return QuadSegRef(index * QuadSeg_size + ref.offset);
+}
+
+struct CubicSeg { // unpacked form; CubicSeg_read decodes it from 8 words
+    vec2 p0;
+    vec2 p1;
+    vec2 p2;
+    vec2 p3;
+};
+
+#define CubicSeg_size 32 // serialized size in bytes (8 words)
+
+CubicSegRef CubicSeg_index(CubicSegRef ref, uint index) { // ref advanced by `index` elements
+    return CubicSegRef(index * CubicSeg_size + ref.offset);
+}
+
+struct Fill { // unpacked form; serialized as a single uint word
+    uint rgba_color;
+};
+
+#define Fill_size 4 // serialized size in bytes (1 word)
+
+FillRef Fill_index(FillRef ref, uint index) { // ref advanced by `index` elements
+    return FillRef(index * Fill_size + ref.offset);
+}
+
+struct FillTexture { // unpacked form; FillTexture_read decodes it from 2 words
+    uvec2 uv_bounds;
+};
+
+#define FillTexture_size 8 // serialized size in bytes (2 words)
+
+FillTextureRef FillTexture_index(FillTextureRef ref, uint index) { // ref advanced by `index` elements
+    return FillTextureRef(index * FillTexture_size + ref.offset);
+}
+
+struct Stroke { // unpacked form; serialized as a single uint word
+    uint rgba_color;
+};
+
+#define Stroke_size 4 // serialized size in bytes (1 word)
+
+StrokeRef Stroke_index(StrokeRef ref, uint index) { // ref advanced by `index` elements
+    return StrokeRef(index * Stroke_size + ref.offset);
+}
+
+struct SetLineWidth { // unpacked form; serialized as a single float word
+    float width;
+};
+
+#define SetLineWidth_size 4 // serialized size in bytes (1 word)
+
+SetLineWidthRef SetLineWidth_index(SetLineWidthRef ref, uint index) { // ref advanced by `index` elements
+    return SetLineWidthRef(index * SetLineWidth_size + ref.offset);
+}
+
+struct Transform { // unpacked form; Transform_read decodes it from 6 words
+    vec4 mat;
+    vec2 translate;
+};
+
+#define Transform_size 24 // serialized size in bytes (6 words)
+
+TransformRef Transform_index(TransformRef ref, uint index) { // ref advanced by `index` elements
+    return TransformRef(index * Transform_size + ref.offset);
+}
+
+struct Clip { // unpacked form; Clip_read decodes it from 4 words
+    vec4 bbox;
+};
+
+#define Clip_size 16 // serialized size in bytes (4 words)
+
+ClipRef Clip_index(ClipRef ref, uint index) { // ref advanced by `index` elements
+    return ClipRef(index * Clip_size + ref.offset);
+}
+
+#define Element_Nop 0 // Element_* values are the tags stored in an Element's first word
+#define Element_StrokeLine 1
+#define Element_FillLine 2
+#define Element_StrokeQuad 3
+#define Element_FillQuad 4
+#define Element_StrokeCubic 5
+#define Element_FillCubic 6
+#define Element_Stroke 7
+#define Element_Fill 8
+#define Element_SetLineWidth 9
+#define Element_Transform 10
+#define Element_BeginClip 11
+#define Element_EndClip 12
+#define Element_FillTexture 13
+#define Element_size 36 // bytes: 4-byte tag + largest payload (CubicSeg, 32 bytes)
+
+ElementRef Element_index(ElementRef ref, uint index) { // ref advanced by `index` fixed-size Element slots
+    return ElementRef(index * Element_size + ref.offset);
+}
+
+LineSeg LineSeg_read(LineSegRef ref) { // unpack 4 words of `scene` (declared outside this header)
+    uint base = ref.offset / 4u;
+    uint w0 = scene[base];
+    uint w1 = scene[base + 1];
+    uint w2 = scene[base + 2];
+    uint w3 = scene[base + 3];
+    LineSeg v;
+    v.p0 = vec2(uintBitsToFloat(w0), uintBitsToFloat(w1));
+    v.p1 = vec2(uintBitsToFloat(w2), uintBitsToFloat(w3));
+    return v;
+}
+
+QuadSeg QuadSeg_read(QuadSegRef ref) { // unpack 6 words of `scene` (declared outside this header)
+    uint base = ref.offset / 4u;
+    uint w0 = scene[base];
+    uint w1 = scene[base + 1];
+    uint w2 = scene[base + 2];
+    uint w3 = scene[base + 3];
+    uint w4 = scene[base + 4];
+    uint w5 = scene[base + 5];
+    QuadSeg v;
+    v.p0 = vec2(uintBitsToFloat(w0), uintBitsToFloat(w1));
+    v.p1 = vec2(uintBitsToFloat(w2), uintBitsToFloat(w3));
+    v.p2 = vec2(uintBitsToFloat(w4), uintBitsToFloat(w5));
+    return v;
+}
+
+CubicSeg CubicSeg_read(CubicSegRef ref) { // unpack 8 words of `scene` (declared outside this header)
+    uint base = ref.offset / 4u;
+    uint w0 = scene[base];
+    uint w1 = scene[base + 1];
+    uint w2 = scene[base + 2];
+    uint w3 = scene[base + 3];
+    uint w4 = scene[base + 4];
+    uint w5 = scene[base + 5];
+    uint w6 = scene[base + 6];
+    uint w7 = scene[base + 7];
+    CubicSeg v;
+    v.p0 = vec2(uintBitsToFloat(w0), uintBitsToFloat(w1));
+    v.p1 = vec2(uintBitsToFloat(w2), uintBitsToFloat(w3));
+    v.p2 = vec2(uintBitsToFloat(w4), uintBitsToFloat(w5));
+    v.p3 = vec2(uintBitsToFloat(w6), uintBitsToFloat(w7));
+    return v;
+}
+
+Fill Fill_read(FillRef ref) { // unpack the single uint word from `scene`
+    uint base = ref.offset / 4u;
+    uint w0 = scene[base];
+    Fill v;
+    v.rgba_color = w0;
+    return v;
+}
+
+FillTexture FillTexture_read(FillTextureRef ref) { // unpack 2 words of `scene`
+    uint base = ref.offset / 4u;
+    uint w0 = scene[base];
+    uint w1 = scene[base + 1];
+    FillTexture v;
+    v.uv_bounds = uvec2(w0, w1);
+    return v;
+}
+
+Stroke Stroke_read(StrokeRef ref) { // unpack the single uint word from `scene`
+    uint base = ref.offset / 4u;
+    uint w0 = scene[base];
+    Stroke v;
+    v.rgba_color = w0;
+    return v;
+}
+
+SetLineWidth SetLineWidth_read(SetLineWidthRef ref) { // unpack the single float word from `scene`
+    uint base = ref.offset / 4u;
+    uint w0 = scene[base];
+    SetLineWidth v;
+    v.width = uintBitsToFloat(w0);
+    return v;
+}
+
+Transform Transform_read(TransformRef ref) { // unpack 6 words of `scene` (declared outside this header)
+    uint base = ref.offset / 4u;
+    uint w0 = scene[base];
+    uint w1 = scene[base + 1];
+    uint w2 = scene[base + 2];
+    uint w3 = scene[base + 3];
+    uint w4 = scene[base + 4];
+    uint w5 = scene[base + 5];
+    Transform v;
+    v.mat = vec4(uintBitsToFloat(w0), uintBitsToFloat(w1), uintBitsToFloat(w2), uintBitsToFloat(w3));
+    v.translate = vec2(uintBitsToFloat(w4), uintBitsToFloat(w5));
+    return v;
+}
+
+Clip Clip_read(ClipRef ref) { // unpack 4 words of `scene` (declared outside this header)
+    uint base = ref.offset / 4u;
+    uint w0 = scene[base];
+    uint w1 = scene[base + 1];
+    uint w2 = scene[base + 2];
+    uint w3 = scene[base + 3];
+    Clip v;
+    v.bbox = vec4(uintBitsToFloat(w0), uintBitsToFloat(w1), uintBitsToFloat(w2), uintBitsToFloat(w3));
+    return v;
+}
+
+uint Element_tag(ElementRef ref) { // variant tag = first word of the Element
+    return scene[ref.offset / 4u];
+}
+
+LineSeg Element_StrokeLine_read(ElementRef ref) { // payload follows the 4-byte tag; tag is not checked
+    return LineSeg_read(LineSegRef(4u + ref.offset));
+}
+
+LineSeg Element_FillLine_read(ElementRef ref) { // payload follows the 4-byte tag; tag is not checked
+    return LineSeg_read(LineSegRef(4u + ref.offset));
+}
+
+QuadSeg Element_StrokeQuad_read(ElementRef ref) { // payload follows the 4-byte tag; tag is not checked
+    return QuadSeg_read(QuadSegRef(4u + ref.offset));
+}
+
+QuadSeg Element_FillQuad_read(ElementRef ref) { // payload follows the 4-byte tag; tag is not checked
+    return QuadSeg_read(QuadSegRef(4u + ref.offset));
+}
+
+CubicSeg Element_StrokeCubic_read(ElementRef ref) { // payload follows the 4-byte tag; tag is not checked
+    return CubicSeg_read(CubicSegRef(4u + ref.offset));
+}
+
+CubicSeg Element_FillCubic_read(ElementRef ref) { // payload follows the 4-byte tag; tag is not checked
+    return CubicSeg_read(CubicSegRef(4u + ref.offset));
+}
+
+Stroke Element_Stroke_read(ElementRef ref) { // payload follows the 4-byte tag; tag is not checked
+    return Stroke_read(StrokeRef(4u + ref.offset));
+}
+
+Fill Element_Fill_read(ElementRef ref) { // payload follows the 4-byte tag; tag is not checked
+    return Fill_read(FillRef(4u + ref.offset));
+}
+
+SetLineWidth Element_SetLineWidth_read(ElementRef ref) { // payload follows the 4-byte tag; tag is not checked
+    return SetLineWidth_read(SetLineWidthRef(4u + ref.offset));
+}
+
+Transform Element_Transform_read(ElementRef ref) { // payload follows the 4-byte tag; tag is not checked
+    return Transform_read(TransformRef(4u + ref.offset));
+}
+
+Clip Element_BeginClip_read(ElementRef ref) { // payload follows the 4-byte tag; tag is not checked
+    return Clip_read(ClipRef(4u + ref.offset));
+}
+
+Clip Element_EndClip_read(ElementRef ref) { // payload follows the 4-byte tag; tag is not checked
+    return Clip_read(ClipRef(4u + ref.offset));
+}
+
+FillTexture Element_FillTexture_read(ElementRef ref) { // payload follows the 4-byte tag; tag is not checked
+    return FillTexture_read(FillTextureRef(4u + ref.offset));
+}
+
diff --git a/gpu/shaders/setup.h b/gpu/shaders/setup.h
new file mode 100644
index 00000000..dc32c40b
--- /dev/null
+++ b/gpu/shaders/setup.h
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// Various constants for the sizes of groups and tiles.
+
+// Much of this will be made dynamic in various ways, but for now it's easiest
+// to hardcode and keep all in one place.
+
+// A LG_WG_FACTOR of n scales workgroup sizes by 2^n. Use 0 for a
+// maximum workgroup size of 128, or 1 for a maximum size of 256.
+#define LG_WG_FACTOR 0 // log2 of the workgroup scale factor
+#define WG_FACTOR (1<<LG_WG_FACTOR) // the scale factor itself: 2^LG_WG_FACTOR
+
+#define TILE_WIDTH_PX 32 // tile width in pixels
+#define TILE_HEIGHT_PX 32 // tile height in pixels
+
+#define PTCL_INITIAL_ALLOC 1024 // NOTE(review): presumably the initial size of a per-tile command list allocation; unit not visible here - confirm
+
+// These should probably be renamed and/or reworked. In the binning
+// kernel, they represent the number of bins. Also, the workgroup size
+// of that kernel is equal to the number of bins, but should probably
+// be more flexible (it's 512 in the K&L paper).
+#define N_TILE_X 16 // bins per row
+#define N_TILE_Y (8 * WG_FACTOR) // bin rows; scales with the workgroup factor
+#define N_TILE (N_TILE_X * N_TILE_Y) // total bins: 128 * WG_FACTOR
+#define LG_N_TILE (7 + LG_WG_FACTOR) // log2(N_TILE)
+#define N_SLICE (N_TILE / 32) // N_TILE divided into groups of 32 (presumably one bit per bin in a uint; confirm)
+
+struct Config { // per-dispatch configuration; tile_alloc.comp reads it through the ConfigBuf buffer
+    uint n_elements; // paths; tile_alloc.comp treats element_ix >= n_elements as Annotated_Nop
+    uint n_pathseg; // presumably the number of path segments - confirm against users
+    uint width_in_tiles; // upper clamp for tile x coordinates in tile_alloc.comp
+    uint height_in_tiles; // upper clamp for tile y coordinates in tile_alloc.comp
+    Alloc tile_alloc; // holds the Path array (indexed by element_ix * Path_size in tile_alloc.comp)
+    Alloc bin_alloc; // NOTE(review): use not visible here; presumably binning output
+    Alloc ptcl_alloc; // NOTE(review): use not visible here; presumably per-tile command lists
+    Alloc pathseg_alloc; // NOTE(review): use not visible here; presumably PathSeg storage
+    Alloc anno_alloc; // holds the Annotated array (indexed by element_ix * Annotated_size in tile_alloc.comp)
+};
diff --git a/gpu/shaders/state.h b/gpu/shaders/state.h
new file mode 100644
index 00000000..8479dcf2
--- /dev/null
+++ b/gpu/shaders/state.h
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// Code auto-generated by piet-gpu-derive
+
+struct StateRef { // typed reference to a serialized State
+    uint offset; // byte offset; State_read/State_write use offset >> 2 as the index into `state`
+};
+
+struct State { // unpacked form; State_read/State_write map it to 14 words
+    vec4 mat;
+    vec2 translate;
+    vec4 bbox;
+    float linewidth;
+    uint flags;
+    uint path_count;
+    uint pathseg_count;
+};
+
+#define State_size 56 // serialized size in bytes (14 words)
+
+StateRef State_index(StateRef ref, uint index) { // ref advanced by `index` elements
+    return StateRef(index * State_size + ref.offset);
+}
+
+State State_read(StateRef ref) { // unpack 14 words of `state` (declared outside this header)
+    uint base = ref.offset / 4u;
+    uint w0 = state[base];
+    uint w1 = state[base + 1];
+    uint w2 = state[base + 2];
+    uint w3 = state[base + 3];
+    uint w4 = state[base + 4];
+    uint w5 = state[base + 5];
+    uint w6 = state[base + 6];
+    uint w7 = state[base + 7];
+    uint w8 = state[base + 8];
+    uint w9 = state[base + 9];
+    uint w10 = state[base + 10];
+    uint w11 = state[base + 11];
+    uint w12 = state[base + 12];
+    uint w13 = state[base + 13];
+    State v;
+    v.mat = vec4(uintBitsToFloat(w0), uintBitsToFloat(w1), uintBitsToFloat(w2), uintBitsToFloat(w3));
+    v.translate = vec2(uintBitsToFloat(w4), uintBitsToFloat(w5));
+    v.bbox = vec4(uintBitsToFloat(w6), uintBitsToFloat(w7), uintBitsToFloat(w8), uintBitsToFloat(w9));
+    v.linewidth = uintBitsToFloat(w10);
+    v.flags = w11;
+    v.path_count = w12;
+    v.pathseg_count = w13;
+    return v;
+}
+
+void State_write(StateRef ref, State s) { // pack s into 14 words of `state`; floats stored bit-exact
+    uint base = ref.offset / 4u;
+    state[base] = floatBitsToUint(s.mat.x);
+    state[base + 1] = floatBitsToUint(s.mat.y);
+    state[base + 2] = floatBitsToUint(s.mat.z);
+    state[base + 3] = floatBitsToUint(s.mat.w);
+    state[base + 4] = floatBitsToUint(s.translate.x);
+    state[base + 5] = floatBitsToUint(s.translate.y);
+    state[base + 6] = floatBitsToUint(s.bbox.x);
+    state[base + 7] = floatBitsToUint(s.bbox.y);
+    state[base + 8] = floatBitsToUint(s.bbox.z);
+    state[base + 9] = floatBitsToUint(s.bbox.w);
+    state[base + 10] = floatBitsToUint(s.linewidth);
+    state[base + 11] = s.flags;
+    state[base + 12] = s.path_count;
+    state[base + 13] = s.pathseg_count;
+}
+
diff --git a/gpu/shaders/tile.h b/gpu/shaders/tile.h
new file mode 100644
index 00000000..500277be
--- /dev/null
+++ b/gpu/shaders/tile.h
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// Code auto-generated by piet-gpu-derive
+
+struct PathRef { // typed reference to a serialized Path
+    uint offset; // byte offset; the accessors below use offset >> 2 as the word index
+};
+
+struct TileRef { // typed reference to a serialized Tile
+    uint offset;
+};
+
+struct TileSegRef { // typed reference to a serialized TileSeg
+    uint offset;
+};
+
+struct Path { // unpacked form; Path_read/_write map it to 3 words
+    uvec4 bbox; // serialized as four 16-bit fields in two words
+    TileRef tiles;
+};
+
+#define Path_size 12 // serialized size in bytes (3 words)
+
+PathRef Path_index(PathRef ref, uint index) { // ref advanced by `index` elements
+    return PathRef(index * Path_size + ref.offset);
+}
+
+struct Tile { // unpacked form; Tile_read/_write map it to 2 words
+    TileSegRef tile;
+    int backdrop; // stored as the uint bit pattern of this int
+};
+
+#define Tile_size 8 // serialized size in bytes (2 words)
+
+TileRef Tile_index(TileRef ref, uint index) { // ref advanced by `index` elements
+    return TileRef(index * Tile_size + ref.offset);
+}
+
+struct TileSeg { // unpacked form; TileSeg_read/_write map it to 6 words
+    vec2 origin;
+    vec2 vector;
+    float y_edge;
+    TileSegRef next; // serialized as the referenced byte offset
+};
+
+#define TileSeg_size 24 // serialized size in bytes (6 words)
+
+TileSegRef TileSeg_index(TileSegRef ref, uint index) { // ref advanced by `index` elements
+    return TileSegRef(index * TileSeg_size + ref.offset);
+}
+
+Path Path_read(Alloc a, PathRef ref) { // unpack 3 words; bbox comes from the 16-bit halves of the first two
+    uint base = ref.offset / 4u;
+    uint w0 = read_mem(a, base);
+    uint w1 = read_mem(a, base + 1);
+    uint w2 = read_mem(a, base + 2);
+    Path v;
+    v.bbox = uvec4(w0 & 0xffff, w0 >> 16, w1 & 0xffff, w1 >> 16);
+    v.tiles = TileRef(w2);
+    return v;
+}
+
+void Path_write(Alloc a, PathRef ref, Path s) { // no masking: bbox.x/.z above 0xffff would spill into .y/.w
+    uint base = ref.offset / 4u;
+    write_mem(a, base, (s.bbox.y << 16) | s.bbox.x);
+    write_mem(a, base + 1, (s.bbox.w << 16) | s.bbox.z);
+    write_mem(a, base + 2, s.tiles.offset);
+}
+
+Tile Tile_read(Alloc a, TileRef ref) { // unpack 2 words starting at byte offset ref.offset
+    uint base = ref.offset / 4u;
+    uint w0 = read_mem(a, base);
+    uint w1 = read_mem(a, base + 1);
+    Tile v;
+    v.tile = TileSegRef(w0);
+    v.backdrop = int(w1);
+    return v;
+}
+
+void Tile_write(Alloc a, TileRef ref, Tile s) { // pack s into 2 words
+    uint base = ref.offset / 4u;
+    write_mem(a, base, s.tile.offset);
+    write_mem(a, base + 1, uint(s.backdrop));
+}
+
+TileSeg TileSeg_read(Alloc a, TileSegRef ref) { // unpack 6 words starting at byte offset ref.offset
+    uint base = ref.offset / 4u;
+    uint w0 = read_mem(a, base);
+    uint w1 = read_mem(a, base + 1);
+    uint w2 = read_mem(a, base + 2);
+    uint w3 = read_mem(a, base + 3);
+    uint w4 = read_mem(a, base + 4);
+    uint w5 = read_mem(a, base + 5);
+    TileSeg v;
+    v.origin = vec2(uintBitsToFloat(w0), uintBitsToFloat(w1));
+    v.vector = vec2(uintBitsToFloat(w2), uintBitsToFloat(w3));
+    v.y_edge = uintBitsToFloat(w4);
+    v.next = TileSegRef(w5);
+    return v;
+}
+
+void TileSeg_write(Alloc a, TileSegRef ref, TileSeg s) { // pack s into 6 words; floats stored bit-exact
+    uint base = ref.offset / 4u;
+    write_mem(a, base, floatBitsToUint(s.origin.x));
+    write_mem(a, base + 1, floatBitsToUint(s.origin.y));
+    write_mem(a, base + 2, floatBitsToUint(s.vector.x));
+    write_mem(a, base + 3, floatBitsToUint(s.vector.y));
+    write_mem(a, base + 4, floatBitsToUint(s.y_edge));
+    write_mem(a, base + 5, s.next.offset);
+}
+
diff --git a/gpu/shaders/tile_alloc.comp b/gpu/shaders/tile_alloc.comp
new file mode 100644
index 00000000..ac72fb35
--- /dev/null
+++ b/gpu/shaders/tile_alloc.comp
@@ -0,0 +1,109 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// Allocation and initialization of tiles for paths.
+
+#version 450
+#extension GL_GOOGLE_include_directive : enable
+
+#include "mem.h"
+#include "setup.h"
+
+#define LG_TILE_ALLOC_WG (7 + LG_WG_FACTOR)
+#define TILE_ALLOC_WG (1 << LG_TILE_ALLOC_WG)
+// TILE_ALLOC_WG is 2^LG_TILE_ALLOC_WG; the workgroup is that many invocations along x only.
+layout(local_size_x = TILE_ALLOC_WG, local_size_y = 1) in;
+
+layout(set = 0, binding = 1) readonly buffer ConfigBuf {
+    Config conf;  // main() reads tile_alloc, anno_alloc, n_elements and width/height_in_tiles from it
+};
+
+#include "annotated.h"
+#include "tile.h"
+
+// Scale factors useful for converting coordinates to tiles (reciprocals of the tile size in px).
+#define SX (1.0 / float(TILE_WIDTH_PX))
+#define SY (1.0 / float(TILE_HEIGHT_PX))
+// Workgroup-shared scratch: one tile count per invocation (prefix-summed in place) and the malloc result.
+shared uint sh_tile_count[TILE_ALLOC_WG];
+shared MallocResult sh_tile_alloc;
+
+void main() {  // One invocation per element: size its tile rectangle, allocate, then zero the tiles.
+    if (mem_error != NO_ERROR) {  // skip all work once mem_error holds an error code
+        return;
+    }
+    // The global invocation index selects both the Path record and the Annotated record.
+    uint th_ix = gl_LocalInvocationID.x;
+    uint element_ix = gl_GlobalInvocationID.x;
+    PathRef path_ref = PathRef(conf.tile_alloc.offset + element_ix * Path_size);
+    AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
+    // Invocations past n_elements keep the Nop tag, so their bbox and tile count stay zero.
+    uint tag = Annotated_Nop;
+    if (element_ix < conf.n_elements) {
+        tag = Annotated_tag(conf.anno_alloc, ref);
+    }
+    int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
+    switch (tag) {
+    case Annotated_Fill:
+    case Annotated_FillTexture:
+    case Annotated_Stroke:
+    case Annotated_BeginClip:
+    case Annotated_EndClip:
+        // Note: fills, strokes, and clips have compatible layout (each annotation
+        // struct starts with its bbox), so a single AnnoFill read covers all of them.
+        AnnoFill fill = Annotated_Fill_read(conf.anno_alloc, ref);
+        x0 = int(floor(fill.bbox.x * SX));  // min corner rounds down to a tile boundary
+        y0 = int(floor(fill.bbox.y * SY));
+        x1 = int(ceil(fill.bbox.z * SX));  // max corner rounds up to a tile boundary
+        y1 = int(ceil(fill.bbox.w * SY));
+        break;
+    }
+    x0 = clamp(x0, 0, int(conf.width_in_tiles));  // restrict the rectangle to the tile grid
+    y0 = clamp(y0, 0, int(conf.height_in_tiles));
+    x1 = clamp(x1, 0, int(conf.width_in_tiles));
+    y1 = clamp(y1, 0, int(conf.height_in_tiles));
+    // NOTE(review): an inverted bbox (x1 < x0 or y1 < y0) can make the int product below negative, wrapping as uint -- confirm inputs.
+    Path path;
+    path.bbox = uvec4(x0, y0, x1, y1);
+    uint tile_count = (x1 - x0) * (y1 - y0);
+    if (tag == Annotated_EndClip) {
+        // Don't actually allocate tiles for an end clip, but we do want
+        // the path structure (especially bbox) allocated for it.
+        tile_count = 0;
+    }
+    // Round i adds the running sum held 2^i slots to the left, so slot k ends with the sum of counts 0..k.
+    sh_tile_count[th_ix] = tile_count;
+    uint total_tile_count = tile_count;
+    // Prefix sum of sh_tile_count
+    for (uint i = 0; i < LG_TILE_ALLOC_WG; i++) {
+        barrier();  // previous writes to sh_tile_count must land before anyone reads
+        if (th_ix >= (1 << i)) {
+            total_tile_count += sh_tile_count[th_ix - (1 << i)];
+        }
+        barrier();  // all reads of this round must finish before slots are overwritten
+        sh_tile_count[th_ix] = total_tile_count;
+    }
+    if (th_ix == TILE_ALLOC_WG - 1) {  // the last invocation holds the workgroup total
+        sh_tile_alloc = malloc(total_tile_count * Tile_size);
+    }
+    barrier();  // make sh_tile_alloc visible to the whole workgroup
+    MallocResult alloc_start = sh_tile_alloc;
+    if (alloc_start.failed) {
+        return;
+    }
+    // This element's tiles start after the tiles of all lower-indexed invocations in the workgroup.
+    if (element_ix < conf.n_elements) {
+        uint tile_subix = th_ix > 0 ? sh_tile_count[th_ix - 1] : 0;
+        Alloc tiles_alloc = slice_mem(alloc_start.alloc, Tile_size * tile_subix, Tile_size * tile_count);
+        path.tiles = TileRef(tiles_alloc.offset);
+        Path_write(conf.tile_alloc, path_ref, path);
+    }
+    // total_count is in 32-bit words; invocation t clears words t, t + TILE_ALLOC_WG, ...
+    // Zero out allocated tiles efficiently
+    uint total_count = sh_tile_count[TILE_ALLOC_WG - 1] * (Tile_size / 4);
+    uint start_ix = alloc_start.alloc.offset >> 2;
+    for (uint i = th_ix; i < total_count; i += TILE_ALLOC_WG) {
+        // Note: this interleaving is faster than using Tile_write
+        // by a significant amount.
+        write_mem(alloc_start.alloc, start_ix + i, 0);
+    }
+}