all: switch to external shaders in the gioui.org/shader module

Signed-off-by: Elias Naur <mail@eliasnaur.com>
2021-08-02 17:46:40 +02:00
parent 18b4442393
commit 6aee543234
50 changed files with 112 additions and 11502 deletions
@@ -8,4 +8,7 @@ require (
 	golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c
 )

-require gioui.org/cpu v0.0.0-20210727122813-41509bcd3462
+require (
+	gioui.org/cpu v0.0.0-20210808092351-bfe733dd3334
+	gioui.org/shader v0.0.0-20210808092941-55e18336189e
+)
@@ -1,8 +1,10 @@
 cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
 cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
 dmitri.shuralyov.com/gpu/mtl v0.0.0-20201218220906-28db891af037/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU=
-gioui.org/cpu v0.0.0-20210727122813-41509bcd3462 h1:JZyB+d8tPExZHNZwMiGKeeAVd0mkFTc3Zsmegdn178M=
-gioui.org/cpu v0.0.0-20210727122813-41509bcd3462/go.mod h1:DkhBDuHokSMOUxX5LZQ7IcxyJJzs3OON8Z5ojaXUXxo=
+gioui.org/cpu v0.0.0-20210808092351-bfe733dd3334 h1:1xK224B5DnjlPKCfVDTl7+olrzgAXn4ym6dum3l34rs=
+gioui.org/cpu v0.0.0-20210808092351-bfe733dd3334/go.mod h1:A8M0Cn5o+vY5LTMlnRoK3O5kG+rH0kWfJjeKd9QpBmQ=
+gioui.org/shader v0.0.0-20210808092941-55e18336189e h1:JD4FUQ/appkr/58YHvdKfvHT6BHiGJ2yUDBEAnq0Ugw=
+gioui.org/shader v0.0.0-20210808092941-55e18336189e/go.mod h1:mWdiME581d/kV7/iEhLmUgUK5iZ09XR5XpduXzbePVM=
 github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
 github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo=
 github.com/Knetic/govaluate v3.0.1-0.20171022003610-9aa49832a739+incompatible/go.mod h1:r7JcOSlj0wfOMncg0iLm8Leh48TZaKVeNIfJntJ2wa0=
@@ -19,6 +19,7 @@ import (
 	"time"
 	"unsafe"

+	"gioui.org/cpu"
 	"gioui.org/f32"
 	"gioui.org/gpu/internal/driver"
 	"gioui.org/internal/byteslice"
@@ -29,9 +30,9 @@ import (
 	"gioui.org/layout"
 	"gioui.org/op"
 	"gioui.org/op/clip"
-
-	"gioui.org/cpu"
-	"gioui.org/cpu/piet"
+	"gioui.org/shader"
+	"gioui.org/shader/gio"
+	"gioui.org/shader/piet"
 )

 type compute struct {
@@ -390,29 +391,22 @@ func newCompute(ctx driver.Device) (*compute, error) {
 	}
 	shaders := []struct {
 		prog *computeProgram
-		src  driver.ShaderSources
+		src  shader.Sources
 		info *cpu.ProgramInfo
-		hash string
 	}{
-		{&g.programs.elements, shader_elements_comp, piet.ElementsProgramInfo, piet.ElementsHash},
-		{&g.programs.tileAlloc, shader_tile_alloc_comp, piet.Tile_allocProgramInfo, piet.Tile_allocHash},
-		{&g.programs.pathCoarse, shader_path_coarse_comp, piet.Path_coarseProgramInfo, piet.Path_coarseHash},
-		{&g.programs.backdrop, shader_backdrop_comp, piet.BackdropProgramInfo, piet.BackdropHash},
-		{&g.programs.binning, shader_binning_comp, piet.BinningProgramInfo, piet.BinningHash},
-		{&g.programs.coarse, shader_coarse_comp, piet.CoarseProgramInfo, piet.CoarseHash},
-		{&g.programs.kernel4, shader_kernel4_comp, piet.Kernel4ProgramInfo, piet.Kernel4Hash},
+		{&g.programs.elements, piet.Shader_elements_comp, piet.ElementsProgramInfo},
+		{&g.programs.tileAlloc, piet.Shader_tile_alloc_comp, piet.Tile_allocProgramInfo},
+		{&g.programs.pathCoarse, piet.Shader_path_coarse_comp, piet.Path_coarseProgramInfo},
+		{&g.programs.backdrop, piet.Shader_backdrop_comp, piet.BackdropProgramInfo},
+		{&g.programs.binning, piet.Shader_binning_comp, piet.BinningProgramInfo},
+		{&g.programs.coarse, piet.Shader_coarse_comp, piet.CoarseProgramInfo},
+		{&g.programs.kernel4, piet.Shader_kernel4_comp, piet.Kernel4ProgramInfo},
 	}
 	if !caps.Features.Has(driver.FeatureCompute) {
-		g.useCPU = supportsCPUCompute
-		for _, s := range shaders {
-			if s.src.Hash != s.hash {
-				g.useCPU = false
-				break
-			}
-		}
-		if !g.useCPU {
+		if !supportsCPUCompute {
 			return nil, errors.New("gpu: missing support for compute programs")
 		}
+		g.useCPU = true
 	}
 	if g.useCPU {
 		g.dispatcher = newDispatcher(runtime.NumCPU())
@@ -420,15 +414,15 @@ func newCompute(ctx driver.Device) (*compute, error) {

 	// Large enough for reasonable fill sizes, yet still spannable by the compute programs.
 	g.output.packer.maxDim = 4096
-	blitProg, err := ctx.NewProgram(shader_copy_vert, shader_copy_frag)
+	blitProg, err := ctx.NewProgram(gio.Shader_copy_vert, gio.Shader_copy_frag)
 	if err != nil {
 		g.Release()
 		return nil, err
 	}
 	g.output.blitProg = blitProg
-	progLayout, err := ctx.NewInputLayout(shader_copy_vert, []driver.InputDesc{
-		{Type: driver.DataTypeFloat, Size: 2, Offset: 0},
-		{Type: driver.DataTypeFloat, Size: 2, Offset: 4 * 2},
+	progLayout, err := ctx.NewInputLayout(gio.Shader_copy_vert, []shader.InputDesc{
+		{Type: shader.DataTypeFloat, Size: 2, Offset: 0},
+		{Type: shader.DataTypeFloat, Size: 2, Offset: 4 * 2},
 	})
 	if err != nil {
 		g.Release()
@@ -445,15 +439,15 @@ func newCompute(ctx driver.Device) (*compute, error) {
 	g.output.uniBuf = buf
 	g.output.blitProg.SetVertexUniforms(buf)

-	materialProg, err := ctx.NewProgram(shader_material_vert, shader_material_frag)
+	materialProg, err := ctx.NewProgram(gio.Shader_material_vert, gio.Shader_material_frag)
 	if err != nil {
 		g.Release()
 		return nil, err
 	}
 	g.materials.prog = materialProg
-	progLayout, err = ctx.NewInputLayout(shader_material_vert, []driver.InputDesc{
-		{Type: driver.DataTypeFloat, Size: 2, Offset: 0},
-		{Type: driver.DataTypeFloat, Size: 2, Offset: 4 * 2},
+	progLayout, err = ctx.NewInputLayout(gio.Shader_material_vert, []shader.InputDesc{
+		{Type: shader.DataTypeFloat, Size: 2, Offset: 0},
+		{Type: shader.DataTypeFloat, Size: 2, Offset: 4 * 2},
 	})
 	if err != nil {
 		g.Release()
@@ -1,5 +0,0 @@
-// SPDX-License-Identifier: Unlicense OR MIT
-
-package gpu
-
-//go:generate go run ./internal/convertshaders -package gpu
@@ -9,12 +9,14 @@ package gpu

 import (
 	"encoding/binary"
+	"errors"
 	"fmt"
 	"image"
 	"image/color"
 	"math"
 	"os"
 	"reflect"
+	"runtime/debug"
 	"time"
 	"unsafe"

@@ -29,6 +31,8 @@ import (
 	"gioui.org/layout"
 	"gioui.org/op"
 	"gioui.org/op/clip"
+	"gioui.org/shader"
+	"gioui.org/shader/gio"

 	// Register backends.
 	_ "gioui.org/gpu/internal/d3d11"
@@ -129,6 +133,10 @@ type imageOp struct {
 	place    placement
 }

+// shaderModuleVersion is the exact version of gioui.org/shader expected by
+// this package. Shader programs are not backwards or forwards compatible.
+const shaderModuleVersion = "v0.0.0-20210808092941-55e18336189e"
+
 func decodeStrokeOp(data []byte) clip.StrokeStyle {
 	_ = data[4]
 	if opconst.OpType(data[0]) != opconst.TypeStroke {
@@ -350,6 +358,9 @@ const (
 )

 func New(api API) (GPU, error) {
+	if err := verifyShaderModule(); err != nil {
+		return nil, err
+	}
 	d, err := driver.NewDevice(api)
 	if err != nil {
 		return nil, err
@@ -376,6 +387,23 @@ func newGPU(ctx driver.Device) (*gpu, error) {
 	return g, nil
 }

+func verifyShaderModule() error {
+	mod, ok := debug.ReadBuildInfo()
+	if !ok {
+		// No module support; hopefully the version matches.
+		return nil
+	}
+	for _, m := range mod.Deps {
+		if m.Path == "gioui.org/shader" {
+			if got := m.Version; got != shaderModuleVersion {
+				return fmt.Errorf("gpu: module gioui.org/shader is version %q, expected %q", got, shaderModuleVersion)
+			}
+			return nil
+		}
+	}
+	return errors.New("gpu: module version for gioui.org/shader not found")
+}
+
 func (g *gpu) init(ctx driver.Device) error {
 	g.ctx = ctx
 	g.renderer = newRenderer(ctx)
@@ -530,7 +558,7 @@ func newBlitter(ctx driver.Device) *blitter {
 	b.colUniforms = new(blitColUniforms)
 	b.texUniforms = new(blitTexUniforms)
 	b.linearGradientUniforms = new(blitLinearGradientUniforms)
-	prog, layout, err := createColorPrograms(ctx, shader_blit_vert, shader_blit_frag,
+	prog, layout, err := createColorPrograms(ctx, gio.Shader_blit_vert, gio.Shader_blit_frag,
 		[3]interface{}{&b.colUniforms.vert, &b.linearGradientUniforms.vert, &b.texUniforms.vert},
 		[3]interface{}{&b.colUniforms.frag, &b.linearGradientUniforms.frag, nil},
 	)
@@ -550,7 +578,7 @@ func (b *blitter) release() {
 	b.layout.Release()
 }

-func createColorPrograms(b driver.Device, vsSrc driver.ShaderSources, fsSrc [3]driver.ShaderSources, vertUniforms, fragUniforms [3]interface{}) ([3]*program, driver.InputLayout, error) {
+func createColorPrograms(b driver.Device, vsSrc shader.Sources, fsSrc [3]shader.Sources, vertUniforms, fragUniforms [3]interface{}) ([3]*program, driver.InputLayout, error) {
 	var progs [3]*program
 	{
 		prog, err := b.NewProgram(vsSrc, fsSrc[materialTexture])
@@ -603,9 +631,9 @@ func createColorPrograms(b driver.Device, vsSrc driver.ShaderSources, fsSrc [3]d
 		}
 		progs[materialLinearGradient] = newProgram(prog, vertBuffer, fragBuffer)
 	}
-	layout, err := b.NewInputLayout(vsSrc, []driver.InputDesc{
-		{Type: driver.DataTypeFloat, Size: 2, Offset: 0},
-		{Type: driver.DataTypeFloat, Size: 2, Offset: 4 * 2},
+	layout, err := b.NewInputLayout(vsSrc, []shader.InputDesc{
+		{Type: shader.DataTypeFloat, Size: 2, Offset: 0},
+		{Type: shader.DataTypeFloat, Size: 2, Offset: 4 * 2},
 	})
 	if err != nil {
 		progs[materialTexture].Release()
@@ -15,6 +15,8 @@ import (
 	"gioui.org/gpu/internal/driver"
 	"gioui.org/internal/byteslice"
 	"gioui.org/internal/f32color"
+	"gioui.org/shader"
+	"gioui.org/shader/gio"
 )

 var dumpImages = flag.Bool("saveimages", false, "save test images")
@@ -36,7 +38,7 @@ func TestSimpleShader(t *testing.T) {
 	b := newDriver(t)
 	sz := image.Point{X: 800, Y: 600}
 	fbo := setupFBO(t, b, sz)
-	p, err := b.NewProgram(shader_simple_vert, shader_simple_frag)
+	p, err := b.NewProgram(gio.Shader_simple_vert, gio.Shader_simple_frag)
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -59,7 +61,7 @@ func TestInputShader(t *testing.T) {
 	b := newDriver(t)
 	sz := image.Point{X: 800, Y: 600}
 	fbo := setupFBO(t, b, sz)
-	p, err := b.NewProgram(shader_input_vert, shader_simple_frag)
+	p, err := b.NewProgram(gio.Shader_input_vert, gio.Shader_simple_frag)
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -77,9 +79,9 @@ func TestInputShader(t *testing.T) {
 	}
 	defer buf.Release()
 	b.BindVertexBuffer(buf, 4*4, 0)
-	layout, err := b.NewInputLayout(shader_input_vert, []driver.InputDesc{
+	layout, err := b.NewInputLayout(gio.Shader_input_vert, []shader.InputDesc{
 		{
-			Type:   driver.DataTypeFloat,
+			Type:   shader.DataTypeFloat,
 			Size:   4,
 			Offset: 0,
 		},
@@ -1,5 +0,0 @@
-// SPDX-License-Identifier: Unlicense OR MIT
-
-package headless
-
-//go:generate go run ../internal/convertshaders -package headless
@@ -1,233 +0,0 @@
-// Code generated by build.go. DO NOT EDIT.
-
-package headless
-
-import "gioui.org/gpu/internal/driver"
-
-var (
-	shader_input_vert = driver.ShaderSources{
-		Name:   "input.vert",
-		Inputs: []driver.InputLocation{{Name: "position", Location: 0, Semantic: "TEXCOORD", SemanticIndex: 0, Type: 0x0, Size: 4}},
-		GLSL100ES: `#version 100
-
-attribute vec4 position;
-
-void main()
-{
-    gl_Position = position;
-}
-
-`,
-		GLSL300ES: `#version 300 es
-
-layout(location = 0) in vec4 position;
-
-void main()
-{
-    gl_Position = position;
-}
-
-`,
-		GLSL130: `#version 130
-#ifdef GL_ARB_shading_language_420pack
-#extension GL_ARB_shading_language_420pack : require
-#endif
-
-in vec4 position;
-
-void main()
-{
-    gl_Position = position;
-}
-
-`,
-		GLSL150: `#version 150
-#ifdef GL_ARB_shading_language_420pack
-#extension GL_ARB_shading_language_420pack : require
-#endif
-
-in vec4 position;
-
-void main()
-{
-    gl_Position = position;
-}
-
-`,
-		HLSL: "DXBC\x1e»\x11\xd3iX7\xd4F\xb9\xa4\xf4R\xf9J\x01\x00\x00\x00\x10\x02\x00\x00\x06\x00\x00\x008\x00\x00\x00\x9c\x00\x00\x00\xe0\x00\x00\x00\\\x01\x00\x00\xa8\x01\x00\x00\xdc\x01\x00\x00Aon9\\\x00\x00\x00\\\x00\x00\x00\x00\x02\xfe\xff4\x00\x00\x00(\x00\x00\x00\x00\x00$\x00\x00\x00$\x00\x00\x00$\x00\x00\x00$\x00\x01\x00$\x00\x00\x00\x00\x00\x00\x02\xfe\xff\x1f\x00\x00\x02\x05\x00\x00\x80\x00\x00\x0f\x90\x04\x00\x00\x04\x00\x00\x03\xc0\x00\x00\xff\x90\x00\x00\xe4\xa0\x00\x00\xe4\x90\x01\x00\x00\x02\x00\x00\f\xc0\x00\x00\xe4\x90\xff\xff\x00\x00SHDR<\x00\x00\x00@\x00\x01\x00\x0f\x00\x00\x00_\x00\x00\x03\xf2\x10\x10\x00\x00\x00\x00\x00g\x00\x00\x04\xf2 \x10\x00\x00\x00\x00\x00\x01\x00\x00\x006\x00\x00\x05\xf2 \x10\x00\x00\x00\x00\x00F\x1e\x10\x00\x00\x00\x00\x00>\x00\x00\x01STATt\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00RDEFD\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x1c\x00\x00\x00\x00\x04\xfe\xff\x00\x01\x00\x00\x1c\x00\x00\x00Microsoft (R) HLSL Shader Compiler 10.1\x00ISGN,\x00\x00\x00\x01\x00\x00\x00\b\x00\x00\x00 \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x0f\x0f\x00\x00TEXCOORD\x00\xab\xab\xabOSGN,\x00\x00\x00\x01\x00\x00\x00\b\x00\x00\x00 \x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x0f\x00\x00\x00SV_Position\x00",
-	}
-	shader_simple_frag = driver.ShaderSources{
-		Name: "simple.frag",
-		GLSL100ES: `#version 100
-precision mediump float;
-precision highp int;
-
-void main()
-{
-    gl_FragData[0] = vec4(0.25, 0.550000011920928955078125, 0.75, 1.0);
-}
-
-`,
-		GLSL300ES: `#version 300 es
-precision mediump float;
-precision highp int;
-
-layout(location = 0) out vec4 fragColor;
-
-void main()
-{
-    fragColor = vec4(0.25, 0.550000011920928955078125, 0.75, 1.0);
-}
-
-`,
-		GLSL130: `#version 130
-#ifdef GL_ARB_shading_language_420pack
-#extension GL_ARB_shading_language_420pack : require
-#endif
-
-out vec4 fragColor;
-
-void main()
-{
-    fragColor = vec4(0.25, 0.550000011920928955078125, 0.75, 1.0);
-}
-
-`,
-		GLSL150: `#version 150
-#ifdef GL_ARB_shading_language_420pack
-#extension GL_ARB_shading_language_420pack : require
-#endif
-
-out vec4 fragColor;
-
-void main()
-{
-    fragColor = vec4(0.25, 0.550000011920928955078125, 0.75, 1.0);
-}
-
-`,
-		HLSL: "DXBC\xf5F\xdef$)\xa8\xbbV\xeas\xb5ks\x12r\x01\x00\x00\x00\xdc\x01\x00\x00\x06\x00\x00\x008\x00\x00\x00\x90\x00\x00\x00\xd0\x00\x00\x00L\x01\x00\x00\x98\x01\x00\x00\xa8\x01\x00\x00Aon9P\x00\x00\x00P\x00\x00\x00\x00\x02\xff\xff,\x00\x00\x00$\x00\x00\x00\x00\x00$\x00\x00\x00$\x00\x00\x00$\x00\x00\x00$\x00\x00\x00$\x00\x00\x02\xff\xffQ\x00\x00\x05\x00\x00\x0f\xa0\x00\x00\x80>\xcd\xcc\f?\x00\x00@?\x00\x00\x80?\x01\x00\x00\x02\x00\b\x0f\x80\x00\x00\xe4\xa0\xff\xff\x00\x00SHDR8\x00\x00\x00@\x00\x00\x00\x0e\x00\x00\x00e\x00\x00\x03\xf2 \x10\x00\x00\x00\x00\x006\x00\x00\b\xf2 \x10\x00\x00\x00\x00\x00\x02@\x00\x00\x00\x00\x80>\xcd\xcc\f?\x00\x00@?\x00\x00\x80?>\x00\x00\x01STATt\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00RDEFD\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x1c\x00\x00\x00\x00\x04\xff\xff\x00\x01\x00\x00\x1c\x00\x00\x00Microsoft (R) HLSL Shader Compiler 10.1\x00ISGN\b\x00\x00\x00\x00\x00\x00\x00\b\x00\x00\x00OSGN,\x00\x00\x00\x01\x00\x00\x00\b\x00\x00\x00 \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x0f\x00\x00\x00SV_Target\x00\xab\xab",
-	}
-	shader_simple_vert = driver.ShaderSources{
-		Name: "simple.vert",
-		GLSL100ES: `#version 100
-
-void main()
-{
-    float x;
-    float y;
-    if (gl_VertexID == 0)
-    {
-        x = 0.0;
-        y = 0.5;
-    }
-    else
-    {
-        if (gl_VertexID == 1)
-        {
-            x = 0.5;
-            y = -0.5;
-        }
-        else
-        {
-            x = -0.5;
-            y = -0.5;
-        }
-    }
-    gl_Position = vec4(x, y, 0.5, 1.0);
-}
-
-`,
-		GLSL300ES: `#version 300 es
-
-void main()
-{
-    float x;
-    float y;
-    if (gl_VertexID == 0)
-    {
-        x = 0.0;
-        y = 0.5;
-    }
-    else
-    {
-        if (gl_VertexID == 1)
-        {
-            x = 0.5;
-            y = -0.5;
-        }
-        else
-        {
-            x = -0.5;
-            y = -0.5;
-        }
-    }
-    gl_Position = vec4(x, y, 0.5, 1.0);
-}
-
-`,
-		GLSL130: `#version 130
-#ifdef GL_ARB_shading_language_420pack
-#extension GL_ARB_shading_language_420pack : require
-#endif
-
-void main()
-{
-    float x;
-    float y;
-    if (gl_VertexID == 0)
-    {
-        x = 0.0;
-        y = 0.5;
-    }
-    else
-    {
-        if (gl_VertexID == 1)
-        {
-            x = 0.5;
-            y = -0.5;
-        }
-        else
-        {
-            x = -0.5;
-            y = -0.5;
-        }
-    }
-    gl_Position = vec4(x, y, 0.5, 1.0);
-}
-
-`,
-		GLSL150: `#version 150
-#ifdef GL_ARB_shading_language_420pack
-#extension GL_ARB_shading_language_420pack : require
-#endif
-
-void main()
-{
-    float x;
-    float y;
-    if (gl_VertexID == 0)
-    {
-        x = 0.0;
-        y = 0.5;
-    }
-    else
-    {
-        if (gl_VertexID == 1)
-        {
-            x = 0.5;
-            y = -0.5;
-        }
-        else
-        {
-            x = -0.5;
-            y = -0.5;
-        }
-    }
-    gl_Position = vec4(x, y, 0.5, 1.0);
-}
-
-`,
-		HLSL: "DXBC\xc8 \\\"\xec\xe9\xb2)@\xdf|Z(\xea\f\xb8\x01\x00\x00\x00H\x02\x00\x00\x05\x00\x00\x004\x00\x00\x00\x80\x00\x00\x00\xb4\x00\x00\x00\xe8\x00\x00\x00\xcc\x01\x00\x00RDEFD\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x1c\x00\x00\x00\x00\x04\xfe\xff\x00\x01\x00\x00\x1c\x00\x00\x00Microsoft (R) HLSL Shader Compiler 10.1\x00ISGN,\x00\x00\x00\x01\x00\x00\x00\b\x00\x00\x00 \x00\x00\x00\x00\x00\x00\x00\x06\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x01\x01\x00\x00SV_VertexID\x00OSGN,\x00\x00\x00\x01\x00\x00\x00\b\x00\x00\x00 \x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x0f\x00\x00\x00SV_Position\x00SHDR\xdc\x00\x00\x00@\x00\x01\x007\x00\x00\x00`\x00\x00\x04\x12\x10\x10\x00\x00\x00\x00\x00\x06\x00\x00\x00g\x00\x00\x04\xf2 \x10\x00\x00\x00\x00\x00\x01\x00\x00\x00h\x00\x00\x02\x01\x00\x00\x00 \x00\x00\a\x12\x00\x10\x00\x00\x00\x00\x00\n\x10\x10\x00\x00\x00\x00\x00\x01@\x00\x00\x01\x00\x00\x007\x00\x00\x0f2\x00\x10\x00\x00\x00\x00\x00\x06\x00\x10\x00\x00\x00\x00\x00\x02@\x00\x00\x00\x00\x00?\x00\x00\x00\xbf\x00\x00\x00\x00\x00\x00\x00\x00\x02@\x00\x00\x00\x00\x00\xbf\x00\x00\x00\xbf\x00\x00\x00\x00\x00\x00\x00\x007\x00\x00\f2 \x10\x00\x00\x00\x00\x00\x06\x10\x10\x00\x00\x00\x00\x00F\x00\x10\x00\x00\x00\x00\x00\x02@\x00\x00\x00\x00\x00\x00\x00\x00\x00?\x00\x00\x00\x00\x00\x00\x00\x006\x00\x00\b\xc2 \x10\x00\x00\x00\x00\x00\x02@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00?\x00\x00\x80?>\x00\x00\x01STATt\x00\x00\x00\x05\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
-	}
-)
@@ -1,11 +0,0 @@
-#version 310 es
-
-// SPDX-License-Identifier: Unlicense OR MIT
-
-precision highp float;
-
-layout(location=0) in vec4 position;
-
-void main() {
-	gl_Position = position;
-}
@@ -1,11 +0,0 @@
-#version 310 es
-
-// SPDX-License-Identifier: Unlicense OR MIT
-
-precision mediump float;
-
-layout(location = 0) out vec4 fragColor;
-
-void main() {
-	fragColor = vec4(.25, .55, .75, 1.0);
-}
@@ -1,20 +0,0 @@
-#version 310 es
-
-// SPDX-License-Identifier: Unlicense OR MIT
-
-precision highp float;
-
-void main() {
-	float x, y;
-	if (gl_VertexIndex == 0) {
-		x = 0.0;
-		y = .5;
-	} else if (gl_VertexIndex == 1) {
-		x = .5;
-		y = -.5;
-	} else {
-		x = -.5;
-		y = -.5;
-	}
-	gl_Position = vec4(x, y, 0.5, 1.0);
-}
@@ -1,50 +0,0 @@
-// SPDX-License-Identifier: Unlicense OR MIT
-
-package main
-
-import (
-	"bytes"
-	"fmt"
-	"io/ioutil"
-	"os/exec"
-	"path/filepath"
-)
-
-// GLSLValidator is OpenGL reference compiler.
-type GLSLValidator struct {
-	Bin     string
-	WorkDir WorkDir
-}
-
-func NewGLSLValidator() *GLSLValidator { return &GLSLValidator{Bin: "glslangValidator"} }
-
-// Convert converts a glsl shader to spirv.
-func (glsl *GLSLValidator) Convert(path, variant string, hlsl bool, input []byte) ([]byte, error) {
-	base := glsl.WorkDir.Path(filepath.Base(path), variant)
-	pathout := base + ".out"
-
-	cmd := exec.Command(glsl.Bin,
-		"--stdin",
-		"-I"+filepath.Dir(path),
-		"-V", // OpenGL ES 3.1.
-		"-w", // Suppress warnings.
-		"-S", filepath.Ext(path)[1:],
-		"-o", pathout,
-	)
-	if hlsl {
-		cmd.Args = append(cmd.Args, "-DHLSL")
-	}
-	cmd.Stdin = bytes.NewBuffer(input)
-
-	out, err := cmd.Output()
-	if err != nil {
-		return nil, fmt.Errorf("%s\nfailed to run %v: %w", out, cmd.Args, err)
-	}
-
-	compiled, err := ioutil.ReadFile(pathout)
-	if err != nil {
-		return nil, fmt.Errorf("unable to read output %q: %w", pathout, err)
-	}
-
-	return compiled, nil
-}
@@ -1,146 +0,0 @@
-// SPDX-License-Identifier: Unlicense OR MIT
-
-package main
-
-import (
-	"bytes"
-	"fmt"
-	"io"
-	"io/ioutil"
-	"os/exec"
-	"path/filepath"
-	"runtime"
-	"strings"
-)
-
-// FXC is hlsl compiler that targets ShaderModel 5.x and lower.
-type FXC struct {
-	Bin     string
-	WorkDir WorkDir
-}
-
-func NewFXC() *FXC { return &FXC{Bin: "fxc.exe"} }
-
-// Compile compiles the input shader.
-func (fxc *FXC) Compile(path, variant string, input []byte, entryPoint string, profileVersion string) (string, error) {
-	base := fxc.WorkDir.Path(filepath.Base(path), variant, profileVersion)
-	pathin := base + ".in"
-	pathout := base + ".out"
-	result := pathout
-
-	if err := fxc.WorkDir.WriteFile(pathin, input); err != nil {
-		return "", fmt.Errorf("unable to write shader to disk: %w", err)
-	}
-
-	cmd := exec.Command(fxc.Bin)
-	if runtime.GOOS != "windows" {
-		cmd = exec.Command("wine", fxc.Bin)
-		if err := winepath(&pathin, &pathout); err != nil {
-			return "", err
-		}
-	}
-
-	var profile string
-	switch filepath.Ext(path) {
-	case ".frag":
-		profile = "ps_" + profileVersion
-	case ".vert":
-		profile = "vs_" + profileVersion
-	case ".comp":
-		profile = "cs_" + profileVersion
-	default:
-		return "", fmt.Errorf("unrecognized shader type %s", path)
-	}
-
-	cmd.Args = append(cmd.Args,
-		"/Fo", pathout,
-		"/T", profile,
-		"/E", entryPoint,
-		pathin,
-	)
-
-	output, err := cmd.CombinedOutput()
-	if err != nil {
-		info := ""
-		if runtime.GOOS != "windows" {
-			info = "If the fxc tool cannot be found, set WINEPATH to the Windows path for the Windows SDK.\n"
-		}
-		return "", fmt.Errorf("%s\n%sfailed to run %v: %w", output, info, cmd.Args, err)
-	}
-
-	compiled, err := ioutil.ReadFile(result)
-	if err != nil {
-		return "", fmt.Errorf("unable to read output %q: %w", pathout, err)
-	}
-
-	return string(compiled), nil
-}
-
-// DXC is hlsl compiler that targets ShaderModel 6.0 and newer.
-type DXC struct {
-	Bin     string
-	WorkDir WorkDir
-}
-
-func NewDXC() *DXC { return &DXC{Bin: "dxc"} }
-
-// Compile compiles the input shader.
-func (dxc *DXC) Compile(path, variant string, input []byte, entryPoint string, profile string) (string, error) {
-	base := dxc.WorkDir.Path(filepath.Base(path), variant, profile)
-	pathin := base + ".in"
-	pathout := base + ".out"
-	result := pathout
-
-	if err := dxc.WorkDir.WriteFile(pathin, input); err != nil {
-		return "", fmt.Errorf("unable to write shader to disk: %w", err)
-	}
-
-	cmd := exec.Command(dxc.Bin)
-
-	cmd.Args = append(cmd.Args,
-		"-Fo", pathout,
-		"-T", profile,
-		"-E", entryPoint,
-		"-Qstrip_reflect",
-		pathin,
-	)
-
-	output, err := cmd.CombinedOutput()
-	if err != nil {
-		return "", fmt.Errorf("%s\nfailed to run %v: %w", output, cmd.Args, err)
-	}
-
-	compiled, err := ioutil.ReadFile(result)
-	if err != nil {
-		return "", fmt.Errorf("unable to read output %q: %w", pathout, err)
-	}
-
-	return string(compiled), nil
-}
-
-// winepath uses the winepath tool to convert a paths to Windows format.
-// The returned path can be used as arguments for Windows command line tools.
-func winepath(paths ...*string) error {
-	winepath := exec.Command("winepath", "--windows")
-	for _, path := range paths {
-		winepath.Args = append(winepath.Args, *path)
-	}
-	// Use a pipe instead of Output, because winepath may have left wineserver
-	// running for several seconds as a grandchild.
-	out, err := winepath.StdoutPipe()
-	if err != nil {
-		return fmt.Errorf("unable to start winepath: %w", err)
-	}
-	if err := winepath.Start(); err != nil {
-		return fmt.Errorf("unable to start winepath: %w", err)
-	}
-	var buf bytes.Buffer
-	if _, err := io.Copy(&buf, out); err != nil {
-		return fmt.Errorf("unable to run winepath: %w", err)
-	}
-	winPaths := strings.Split(strings.TrimSpace(buf.String()), "\n")
-	for i, path := range paths {
-		*path = winPaths[i]
-	}
-	return nil
-}
@@ -1,418 +0,0 @@
-// SPDX-License-Identifier: Unlicense OR MIT
-
-package main
-
-import (
-	"bytes"
-	"crypto/sha256"
-	"encoding/hex"
-	"errors"
-	"flag"
-	"fmt"
-	"io"
-	"io/ioutil"
-	"os"
-	"os/exec"
-	"path/filepath"
-	"sort"
-	"strconv"
-	"strings"
-	"sync"
-	"text/template"
-
-	"gioui.org/gpu/internal/driver"
-)
-
-func main() {
-	packageName := flag.String("package", "", "specify Go package name")
-	workdir := flag.String("work", "", "temporary working directory (default TEMP)")
-	shadersDir := flag.String("dir", "shaders", "shaders directory")
-	directCompute := flag.Bool("directcompute", false, "enable compiling DirectCompute shaders")
-
-	flag.Parse()
-
-	var work WorkDir
-	cleanup := func() {}
-	if *workdir == "" {
-		tempdir, err := ioutil.TempDir("", "shader-convert")
-		if err != nil {
-			fmt.Fprintf(os.Stderr, "failed to create tempdir: %v\n", err)
-			os.Exit(1)
-		}
-		cleanup = func() { os.RemoveAll(tempdir) }
-		defer cleanup()
-
-		work = WorkDir(tempdir)
-	} else {
-		if abs, err := filepath.Abs(*workdir); err == nil {
-			*workdir = abs
-		}
-		work = WorkDir(*workdir)
-	}
-
-	var out bytes.Buffer
-	conv := NewConverter(work, *packageName, *shadersDir, *directCompute)
-	if err := conv.Run(&out); err != nil {
-		fmt.Fprintf(os.Stderr, "%v\n", err)
-		cleanup()
-		os.Exit(1)
-	}
-
-	if err := ioutil.WriteFile("shaders.go", out.Bytes(), 0644); err != nil {
-		fmt.Fprintf(os.Stderr, "failed to create shaders: %v\n", err)
-		cleanup()
-		os.Exit(1)
-	}
-
-	cmd := exec.Command("gofmt", "-s", "-w", "shaders.go")
-	cmd.Stdout, cmd.Stderr = os.Stdout, os.Stderr
-	if err := cmd.Run(); err != nil {
-		fmt.Fprintf(os.Stderr, "formatting shaders.go failed: %v\n", err)
-		cleanup()
-		os.Exit(1)
-	}
-}
-
-type Converter struct {
-	workDir       WorkDir
-	shadersDir    string
-	directCompute bool
-
-	packageName string
-
-	glslvalidator *GLSLValidator
-	spirv         *SPIRVCross
-	fxc           *FXC
-}
-
-func NewConverter(workDir WorkDir, packageName, shadersDir string, directCompute bool) *Converter {
-	if abs, err := filepath.Abs(shadersDir); err == nil {
-		shadersDir = abs
-	}
-
-	conv := &Converter{}
-	conv.workDir = workDir
-	conv.shadersDir = shadersDir
-	conv.directCompute = directCompute
-
-	conv.packageName = packageName
-
-	conv.glslvalidator = NewGLSLValidator()
-	conv.spirv = NewSPIRVCross()
-	conv.fxc = NewFXC()
-
-	verifyBinaryPath(&conv.glslvalidator.Bin)
-	verifyBinaryPath(&conv.spirv.Bin)
-	// We cannot check fxc since it may depend on wine.
-
-	conv.glslvalidator.WorkDir = workDir.Dir("glslvalidator")
-	conv.fxc.WorkDir = workDir.Dir("fxc")
-	conv.spirv.WorkDir = workDir.Dir("spirv")
-
-	return conv
-}
-
-func verifyBinaryPath(bin *string) {
-	new, err := exec.LookPath(*bin)
-	if err != nil {
-		fmt.Fprintf(os.Stderr, "unable to find %q: %v\n", *bin, err)
-	} else {
-		*bin = new
-	}
-}
-
-func (conv *Converter) Run(out io.Writer) error {
-	shaders, err := filepath.Glob(filepath.Join(conv.shadersDir, "*"))
-	if len(shaders) == 0 || err != nil {
-		return fmt.Errorf("failed to list shaders in %q: %w", conv.shadersDir, err)
-	}
-
-	sort.Strings(shaders)
-
-	var workers Workers
-
-	type ShaderResult struct {
-		Path    string
-		Shaders []driver.ShaderSources
-		Error   error
-	}
-	shaderResults := make([]ShaderResult, len(shaders))
-
-	for i, shaderPath := range shaders {
-		i, shaderPath := i, shaderPath
-
-		switch filepath.Ext(shaderPath) {
-		case ".vert", ".frag":
-			workers.Go(func() {
-				shaders, err := conv.Shader(shaderPath)
-				shaderResults[i] = ShaderResult{
-					Path:    shaderPath,
-					Shaders: shaders,
-					Error:   err,
-				}
-			})
-		case ".comp":
-			workers.Go(func() {
-				shaders, err := conv.ComputeShader(shaderPath)
-				shaderResults[i] = ShaderResult{
-					Path:    shaderPath,
-					Shaders: shaders,
-					Error:   err,
-				}
-			})
-		default:
-			continue
-		}
-	}
-
-	workers.Wait()
-
-	var allErrors string
-	for _, r := range shaderResults {
-		if r.Error != nil {
-			if len(allErrors) > 0 {
-				allErrors += "\n\n"
-			}
-			allErrors += "--- " + r.Path + " --- \n\n" + r.Error.Error() + "\n"
-		}
-	}
-	if len(allErrors) > 0 {
-		return errors.New(allErrors)
-	}
-
-	fmt.Fprintf(out, "// Code generated by build.go. DO NOT EDIT.\n\n")
-	fmt.Fprintf(out, "package %s\n\n", conv.packageName)
-	fmt.Fprintf(out, "import %q\n\n", "gioui.org/gpu/internal/driver")
-
-	fmt.Fprintf(out, "var (\n")
-
-	for _, r := range shaderResults {
-		if len(r.Shaders) == 0 {
-			continue
-		}
-
-		name := filepath.Base(r.Path)
-		name = strings.ReplaceAll(name, ".", "_")
-		fmt.Fprintf(out, "\tshader_%s = ", name)
-
-		multiVariant := len(r.Shaders) > 1
-		if multiVariant {
-			fmt.Fprintf(out, "[...]driver.ShaderSources{\n")
-		}
-
-		for _, src := range r.Shaders {
-			fmt.Fprintf(out, "driver.ShaderSources{\n")
-			fmt.Fprintf(out, "Name: %#v,\n", src.Name)
-			if len(src.Inputs) > 0 {
-				fmt.Fprintf(out, "Inputs: %#v,\n", src.Inputs)
-			}
-			if u := src.Uniforms; len(u.Blocks) > 0 {
-				fmt.Fprintf(out, "Uniforms: driver.UniformsReflection{\n")
-				fmt.Fprintf(out, "Blocks: %#v,\n", u.Blocks)
-				fmt.Fprintf(out, "Locations: %#v,\n", u.Locations)
-				fmt.Fprintf(out, "Size: %d,\n", u.Size)
-				fmt.Fprintf(out, "},\n")
-			}
-			if len(src.Textures) > 0 {
-				fmt.Fprintf(out, "Textures: %#v,\n", src.Textures)
-			}
-			if len(src.GLSL100ES) > 0 {
-				fmt.Fprintf(out, "GLSL100ES: `%s`,\n", src.GLSL100ES)
-			}
-			if len(src.GLSL300ES) > 0 {
-				fmt.Fprintf(out, "GLSL300ES: `%s`,\n", src.GLSL300ES)
-			}
-			if len(src.GLSL310ES) > 0 {
-				fmt.Fprintf(out, "GLSL310ES: `%s`,\n", src.GLSL310ES)
-			}
-			if len(src.GLSL130) > 0 {
-				fmt.Fprintf(out, "GLSL130: `%s`,\n", src.GLSL130)
-			}
-			if len(src.GLSL150) > 0 {
-				fmt.Fprintf(out, "GLSL150: `%s`,\n", src.GLSL150)
-			}
-			if len(src.HLSL) > 0 {
-				fmt.Fprintf(out, "HLSL: %q,\n", src.HLSL)
-			}
-			if len(src.Hash) > 0 {
-				fmt.Fprintf(out, "Hash: %q,\n", src.Hash)
-			}
-			fmt.Fprintf(out, "}")
-			if multiVariant {
-				fmt.Fprintf(out, ",")
-			}
-			fmt.Fprintf(out, "\n")
-		}
-		if multiVariant {
-			fmt.Fprintf(out, "}\n")
-		}
-	}
-	fmt.Fprintf(out, ")\n")
-
-	return nil
-}
-
-func (conv *Converter) Shader(shaderPath string) ([]driver.ShaderSources, error) {
-	type Variant struct {
-		FetchColorExpr string
-		Header         string
-	}
-	variantArgs := [...]Variant{
-		{
-			FetchColorExpr: `_color.color`,
-			Header:         `layout(binding=0) uniform Color { vec4 color; } _color;`,
-		},
-		{
-			FetchColorExpr: `mix(_gradient.color1, _gradient.color2, clamp(vUV.x, 0.0, 1.0))`,
-			Header:         `layout(binding=0) uniform Gradient { vec4 color1; vec4 color2; } _gradient;`,
-		},
-		{
-			FetchColorExpr: `texture(tex, vUV)`,
-			Header:         `layout(binding=0) uniform sampler2D tex;`,
-		},
-	}
-
-	shaderTemplate, err := template.ParseFiles(shaderPath)
-	if err != nil {
-		return nil, fmt.Errorf("failed to parse template %q: %w", shaderPath, err)
-	}
-
-	var variants []driver.ShaderSources
-	for i, variantArg := range variantArgs {
-		variantName := strconv.Itoa(i)
-		var buf bytes.Buffer
-		err := shaderTemplate.Execute(&buf, variantArg)
-		if err != nil {
-			return nil, fmt.Errorf("failed to execute template %q with %#v: %w", shaderPath, variantArg, err)
-		}
-
-		var sources driver.ShaderSources
-		sources.Name = filepath.Base(shaderPath)
-
-		// Ignore error; some shaders are not meant to run in GLSL 1.00.
-		sources.GLSL100ES, _, _ = conv.ShaderVariant(shaderPath, variantName, buf.Bytes(), "es", "100")
-
-		var metadata Metadata
-		sources.GLSL300ES, metadata, err = conv.ShaderVariant(shaderPath, variantName, buf.Bytes(), "es", "300")
-		if err != nil {
-			return nil, fmt.Errorf("failed to convert GLSL300ES:\n%w", err)
-		}
-
-		sources.GLSL130, _, err = conv.ShaderVariant(shaderPath, variantName, buf.Bytes(), "glsl", "130")
-		if err != nil {
-			return nil, fmt.Errorf("failed to convert GLSL130:\n%w", err)
-		}
-
-		hlsl, _, err := conv.ShaderVariant(shaderPath, variantName, buf.Bytes(), "hlsl", "40")
-		if err != nil {
-			return nil, fmt.Errorf("failed to convert HLSL:\n%w", err)
-		}
-		sources.HLSL, err = conv.fxc.Compile(shaderPath, variantName, []byte(hlsl), "main", "4_0_level_9_1")
-		if err != nil {
-			// Attempt shader model 4.0. Only the gpu/headless
-			// test shaders use features not supported by level
-			// 9.1.
-			sources.HLSL, err = conv.fxc.Compile(shaderPath, variantName, []byte(hlsl), "main", "4_0")
-			if err != nil {
-				return nil, fmt.Errorf("failed to compile HLSL: %w", err)
-			}
-		}
-
-		sources.GLSL150, _, err = conv.ShaderVariant(shaderPath, variantName, buf.Bytes(), "glsl", "150")
-		if err != nil {
-			return nil, fmt.Errorf("failed to convert GLSL150:\n%w", err)
-		}
-
-		sources.Uniforms = metadata.Uniforms
-		sources.Inputs = metadata.Inputs
-		sources.Textures = metadata.Textures
-
-		variants = append(variants, sources)
-	}
-
-	// If the shader doesn't use the variant arguments, output only a single version.
-	if variants[0].GLSL100ES == variants[1].GLSL100ES {
-		variants = variants[:1]
-	}
-
-	return variants, nil
-}
-
-func (conv *Converter) ShaderVariant(shaderPath, variant string, src []byte, lang, profile string) (string, Metadata, error) {
-	spirv, err := conv.glslvalidator.Convert(shaderPath, variant, lang == "hlsl", src)
-	if err != nil {
-		return "", Metadata{}, fmt.Errorf("failed to generate SPIR-V for %q: %w", shaderPath, err)
-	}
-
-	dst, err := conv.spirv.Convert(shaderPath, variant, spirv, lang, profile)
-	if err != nil {
-		return "", Metadata{}, fmt.Errorf("failed to convert shader %q: %w", shaderPath, err)
-	}
-
-	meta, err := conv.spirv.Metadata(shaderPath, variant, spirv)
-	if err != nil {
-		return "", Metadata{}, fmt.Errorf("failed to extract metadata for shader %q: %w", shaderPath, err)
-	}
-
-	return dst, meta, nil
-}
-
-func (conv *Converter) ComputeShader(shaderPath string) ([]driver.ShaderSources, error) {
-	shader, err := ioutil.ReadFile(shaderPath)
-	if err != nil {
-		return nil, fmt.Errorf("failed to load shader %q: %w", shaderPath, err)
-	}
-
-	spirv, err := conv.glslvalidator.Convert(shaderPath, "", false, shader)
-	if err != nil {
-		return nil, fmt.Errorf("failed to convert compute shader %q: %w", shaderPath, err)
-	}
-
-	var sources driver.ShaderSources
-	sources.Name = filepath.Base(shaderPath)
-
-	sum := sha256.Sum256(shader)
-	sources.Hash = hex.EncodeToString(sum[:])
-
-	sources.GLSL310ES, err = conv.spirv.Convert(shaderPath, "", spirv, "es", "310")
-	if err != nil {
-		return nil, fmt.Errorf("failed to convert es compute shader %q: %w", shaderPath, err)
-	}
-	sources.GLSL310ES = unixLineEnding(sources.GLSL310ES)
-
-	hlslSource, err := conv.spirv.Convert(shaderPath, "", spirv, "hlsl", "50")
-	if err != nil {
-		return nil, fmt.Errorf("failed to convert hlsl compute shader %q: %w", shaderPath, err)
-	}
-
-	dxil, err := conv.fxc.Compile(shaderPath, "0", []byte(hlslSource), "main", "5_0")
-	if err != nil {
-		return nil, fmt.Errorf("failed to compile hlsl compute shader %q: %w", shaderPath, err)
-	}
-	if conv.directCompute {
-		sources.HLSL = dxil
-	}
-
-	return []driver.ShaderSources{sources}, nil
-}
-
-// Workers wraps a sync.WaitGroup to run functions in goroutines and wait for them.
-type Workers struct {
-	running sync.WaitGroup
-}
-
-func (lg *Workers) Go(fn func()) {
-	lg.running.Add(1)
-	go func() {
-		defer lg.running.Done()
-		fn()
-	}()
-}
-
-func (lg *Workers) Wait() {
-	lg.running.Wait()
-}
-
-func unixLineEnding(s string) string {
-	return strings.ReplaceAll(s, "\r\n", "\n")
-}
@@ -1,212 +0,0 @@
-// SPDX-License-Identifier: Unlicense OR MIT
-
-package main
-
-import (
-	"encoding/json"
-	"fmt"
-	"os/exec"
-	"path/filepath"
-	"sort"
-	"strings"
-
-	"gioui.org/gpu/internal/driver"
-)
-
-// Metadata contains reflection data about a shader.
-type Metadata struct {
-	Uniforms driver.UniformsReflection
-	Inputs   []driver.InputLocation
-	Textures []driver.TextureBinding
-}
-
-// SPIRVCross cross-compiles spirv shaders to es, hlsl and others.
-type SPIRVCross struct {
-	Bin     string
-	WorkDir WorkDir
-}
-
-func NewSPIRVCross() *SPIRVCross { return &SPIRVCross{Bin: "spirv-cross"} }
-
-// Convert converts a shader from SPIR-V format to a target format ("glsl", "es" or "hlsl").
-func (spirv *SPIRVCross) Convert(path, variant string, shader []byte, target, version string) (string, error) {
-	base := spirv.WorkDir.Path(filepath.Base(path), variant)
-
-	if err := spirv.WorkDir.WriteFile(base, shader); err != nil {
-		return "", fmt.Errorf("unable to write shader to disk: %w", err)
-	}
-
-	var cmd *exec.Cmd
-	switch target {
-	case "glsl":
-		cmd = exec.Command(spirv.Bin,
-			"--no-es",
-			"--version", version,
-		)
-	case "es":
-		cmd = exec.Command(spirv.Bin,
-			"--es",
-			"--version", version,
-		)
-	case "hlsl":
-		cmd = exec.Command(spirv.Bin,
-			"--hlsl",
-			"--shader-model", version,
-		)
-	default:
-		return "", fmt.Errorf("unknown target %q", target)
-	}
-	cmd.Args = append(cmd.Args, "--no-420pack-extension", base)
-
-	out, err := cmd.CombinedOutput()
-	if err != nil {
-		return "", fmt.Errorf("%s\nfailed to run %v: %w", out, cmd.Args, err)
-	}
-	s := string(out)
-	if target != "hlsl" {
-		// Strip Windows \r in line endings.
-		s = unixLineEnding(s)
-	}
-
-	return s, nil
-}
-
-// Metadata extracts metadata for a SPIR-V shader.
-func (spirv *SPIRVCross) Metadata(path, variant string, shader []byte) (Metadata, error) {
-	base := spirv.WorkDir.Path(filepath.Base(path), variant)
-
-	if err := spirv.WorkDir.WriteFile(base, shader); err != nil {
-		return Metadata{}, fmt.Errorf("unable to write shader to disk: %w", err)
-	}
-
-	cmd := exec.Command(spirv.Bin,
-		base,
-		"--reflect",
-	)
-
-	out, err := cmd.Output()
-	if err != nil {
-		return Metadata{}, fmt.Errorf("failed to run %v: %w", cmd.Args, err)
-	}
-
-	meta, err := parseMetadata(out)
-	if err != nil {
-		return Metadata{}, fmt.Errorf("%s\nfailed to parse metadata: %w", out, err)
-	}
-
-	return meta, nil
-}
-
-func parseMetadata(data []byte) (Metadata, error) {
-	var reflect struct {
-		Types map[string]struct {
-			Name    string `json:"name"`
-			Members []struct {
-				Name   string `json:"name"`
-				Type   string `json:"type"`
-				Offset int    `json:"offset"`
-			} `json:"members"`
-		} `json:"types"`
-		Inputs []struct {
-			Name     string `json:"name"`
-			Type     string `json:"type"`
-			Location int    `json:"location"`
-		} `json:"inputs"`
-		Textures []struct {
-			Name    string `json:"name"`
-			Type    string `json:"type"`
-			Set     int    `json:"set"`
-			Binding int    `json:"binding"`
-		} `json:"textures"`
-		UBOs []struct {
-			Name      string `json:"name"`
-			Type      string `json:"type"`
-			BlockSize int    `json:"block_size"`
-			Set       int    `json:"set"`
-			Binding   int    `json:"binding"`
-		} `json:"ubos"`
-	}
-	if err := json.Unmarshal(data, &reflect); err != nil {
-		return Metadata{}, fmt.Errorf("failed to parse reflection data: %w", err)
-	}
-
-	var m Metadata
-
-	for _, input := range reflect.Inputs {
-		dataType, dataSize, err := parseDataType(input.Type)
-		if err != nil {
-			return Metadata{}, fmt.Errorf("parseReflection: %v", err)
-		}
-		m.Inputs = append(m.Inputs, driver.InputLocation{
-			Name:          input.Name,
-			Location:      input.Location,
-			Semantic:      "TEXCOORD",
-			SemanticIndex: input.Location,
-			Type:          dataType,
-			Size:          dataSize,
-		})
-	}
-
-	sort.Slice(m.Inputs, func(i, j int) bool {
-		return m.Inputs[i].Location < m.Inputs[j].Location
-	})
-
-	blockOffset := 0
-	for _, block := range reflect.UBOs {
-		m.Uniforms.Blocks = append(m.Uniforms.Blocks, driver.UniformBlock{
-			Name:    block.Name,
-			Binding: block.Binding,
-		})
-		t := reflect.Types[block.Type]
-		// By convention uniform block variables are named by prepending an underscore
-		// and converting to lowercase.
-		blockVar := "_" + strings.ToLower(block.Name)
-		for _, member := range t.Members {
-			dataType, size, err := parseDataType(member.Type)
-			if err != nil {
-				return Metadata{}, fmt.Errorf("failed to parse reflection data: %v", err)
-			}
-			m.Uniforms.Locations = append(m.Uniforms.Locations, driver.UniformLocation{
-				Name:   fmt.Sprintf("%s.%s", blockVar, member.Name),
-				Type:   dataType,
-				Size:   size,
-				Offset: blockOffset + member.Offset,
-			})
-		}
-		blockOffset += block.BlockSize
-	}
-	m.Uniforms.Size = blockOffset
-
-	for _, texture := range reflect.Textures {
-		m.Textures = append(m.Textures, driver.TextureBinding{
-			Name:    texture.Name,
-			Binding: texture.Binding,
-		})
-	}
-
-	//return m, fmt.Errorf("not yet!: %+v", reflect)
-	return m, nil
-}
-
-func parseDataType(t string) (driver.DataType, int, error) {
-	switch t {
-	case "float":
-		return driver.DataTypeFloat, 1, nil
-	case "vec2":
-		return driver.DataTypeFloat, 2, nil
-	case "vec3":
-		return driver.DataTypeFloat, 3, nil
-	case "vec4":
-		return driver.DataTypeFloat, 4, nil
-	case "int":
-		return driver.DataTypeInt, 1, nil
-	case "int2":
-		return driver.DataTypeInt, 2, nil
-	case "int3":
-		return driver.DataTypeInt, 3, nil
-	case "int4":
-		return driver.DataTypeInt, 4, nil
-	default:
-		return 0, 0, fmt.Errorf("unsupported input data type: %s", t)
-	}
-}
@@ -1,35 +0,0 @@
-// SPDX-License-Identifier: Unlicense OR MIT
-
-package main
-
-import (
-	"fmt"
-	"io/ioutil"
-	"os"
-	"path/filepath"
-	"strings"
-)
-
-type WorkDir string
-
-func (wd WorkDir) Dir(path string) WorkDir {
-	dirname := filepath.Join(string(wd), path)
-	if err := os.Mkdir(dirname, 0755); err != nil {
-		if !os.IsExist(err) {
-			fmt.Fprintf(os.Stderr, "failed to create %q: %v\n", dirname, err)
-		}
-	}
-	return WorkDir(dirname)
-}
-
-func (wd WorkDir) Path(path ...string) (fullpath string) {
-	return filepath.Join(string(wd), strings.Join(path, "."))
-}
-
-func (wd WorkDir) WriteFile(path string, data []byte) error {
-	err := ioutil.WriteFile(path, data, 0644)
-	if err != nil {
-		return fmt.Errorf("unable to create %v: %w", path, err)
-	}
-	return nil
-}
@@ -14,6 +14,7 @@ import (

 	"gioui.org/gpu/internal/driver"
 	"gioui.org/internal/d3d11"
+	"gioui.org/shader"
 )

 type Backend struct {
@@ -287,7 +288,7 @@ func (b *Backend) NewFramebuffer(tex driver.Texture) (driver.Framebuffer, error)
 	return fbo, nil
 }

-func (b *Backend) NewInputLayout(vertexShader driver.ShaderSources, layout []driver.InputDesc) (driver.InputLayout, error) {
+func (b *Backend) NewInputLayout(vertexShader shader.Sources, layout []shader.InputDesc) (driver.InputLayout, error) {
 	if len(vertexShader.Inputs) != len(layout) {
 		return nil, fmt.Errorf("NewInputLayout: got %d inputs, expected %d", len(layout), len(vertexShader.Inputs))
 	}
@@ -300,7 +301,7 @@ func (b *Backend) NewInputLayout(vertexShader driver.ShaderSources, layout []dri
 		}
 		var format uint32
 		switch l.Type {
-		case driver.DataTypeFloat:
+		case shader.DataTypeFloat:
 			switch l.Size {
 			case 1:
 				format = d3d11.DXGI_FORMAT_R32_FLOAT
@@ -313,7 +314,7 @@ func (b *Backend) NewInputLayout(vertexShader driver.ShaderSources, layout []dri
 			default:
 				panic("unsupported data size")
 			}
-		case driver.DataTypeShort:
+		case shader.DataTypeShort:
 			switch l.Size {
 			case 1:
 				format = d3d11.DXGI_FORMAT_R16_SINT
@@ -332,7 +333,7 @@ func (b *Backend) NewInputLayout(vertexShader driver.ShaderSources, layout []dri
 			AlignedByteOffset: uint32(l.Offset),
 		}
 	}
-	l, err := b.dev.CreateInputLayout(descs, []byte(vertexShader.HLSL))
+	l, err := b.dev.CreateInputLayout(descs, []byte(vertexShader.DXBC))
 	if err != nil {
 		return nil, err
 	}
@@ -380,16 +381,16 @@ func (b *Backend) NewImmutableBuffer(typ driver.BufferBinding, data []byte) (dri
 	return &Buffer{backend: b, buf: buf, bind: bind, immutable: true}, nil
 }

-func (b *Backend) NewComputeProgram(shader driver.ShaderSources) (driver.Program, error) {
+func (b *Backend) NewComputeProgram(shader shader.Sources) (driver.Program, error) {
 	panic("not implemented")
 }

-func (b *Backend) NewProgram(vertexShader, fragmentShader driver.ShaderSources) (driver.Program, error) {
-	vs, err := b.dev.CreateVertexShader([]byte(vertexShader.HLSL))
+func (b *Backend) NewProgram(vertexShader, fragmentShader shader.Sources) (driver.Program, error) {
+	vs, err := b.dev.CreateVertexShader([]byte(vertexShader.DXBC))
 	if err != nil {
 		return nil, err
 	}
-	ps, err := b.dev.CreatePixelShader([]byte(fragmentShader.HLSL))
+	ps, err := b.dev.CreatePixelShader([]byte(fragmentShader.DXBC))
 	if err != nil {
 		return nil, err
 	}
@@ -6,6 +6,8 @@ import (
 	"errors"
 	"image"
 	"time"
+
+	"gioui.org/shader"
 )

 // Device represents the abstraction of underlying GPU
@@ -23,9 +25,9 @@ type Device interface {
 	NewFramebuffer(tex Texture) (Framebuffer, error)
 	NewImmutableBuffer(typ BufferBinding, data []byte) (Buffer, error)
 	NewBuffer(typ BufferBinding, size int) (Buffer, error)
-	NewComputeProgram(shader ShaderSources) (Program, error)
-	NewProgram(vertexShader, fragmentShader ShaderSources) (Program, error)
-	NewInputLayout(vertexShader ShaderSources, layout []InputDesc) (InputLayout, error)
+	NewComputeProgram(shader shader.Sources) (Program, error)
+	NewProgram(vertexShader, fragmentShader shader.Sources) (Program, error)
+	NewInputLayout(vertexShader shader.Sources, layout []shader.InputDesc) (InputLayout, error)

 	Clear(r, g, b, a float32)
 	Viewport(x, y, width, height int)
@@ -49,63 +51,6 @@ type Device interface {
 	Release()
 }

-type ShaderSources struct {
-	Name      string
-	GLSL100ES string
-	GLSL300ES string
-	GLSL310ES string
-	GLSL130   string
-	GLSL150   string
-	HLSL      string
-	Uniforms  UniformsReflection
-	Inputs    []InputLocation
-	Textures  []TextureBinding
-	Hash      string
-}
-
-type UniformsReflection struct {
-	Blocks    []UniformBlock
-	Locations []UniformLocation
-	Size      int
-}
-
-type TextureBinding struct {
-	Name    string
-	Binding int
-}
-
-type UniformBlock struct {
-	Name    string
-	Binding int
-}
-
-type UniformLocation struct {
-	Name   string
-	Type   DataType
-	Size   int
-	Offset int
-}
-
-type InputLocation struct {
-	// For GLSL.
-	Name     string
-	Location int
-	// For HLSL.
-	Semantic      string
-	SemanticIndex int
-
-	Type DataType
-	Size int
-}
-
-// InputDesc describes a vertex attribute as laid out in a Buffer.
-type InputDesc struct {
-	Type DataType
-	Size int
-
-	Offset int
-}
-
 // InputLayout is the driver specific representation of the mapping
 // between Buffers and shader attributes.
 type InputLayout interface {
@@ -123,8 +68,6 @@ type TextureFormat uint8

 type BufferBinding uint8

-type DataType uint8
-
 type Features uint

 type Caps struct {
@@ -167,12 +110,6 @@ type Texture interface {
 	Release()
 }

-const (
-	DataTypeFloat DataType = iota
-	DataTypeInt
-	DataTypeShort
-)
-
 const (
 	BufferBindingIndices BufferBinding = 1 << iota
 	BufferBindingVertices
@@ -12,6 +12,7 @@ import (

 	"gioui.org/gpu/internal/driver"
 	"gioui.org/internal/gl"
+	"gioui.org/shader"
 )

 // Backend implements driver.Device.
@@ -139,13 +140,13 @@ type uniformsTracker struct {
 type uniformLocation struct {
 	uniform gl.Uniform
 	offset  int
-	typ     driver.DataType
+	typ     shader.DataType
 	size    int
 }

 type gpuInputLayout struct {
-	inputs []driver.InputLocation
-	layout []driver.InputDesc
+	inputs []shader.InputLocation
+	layout []shader.InputDesc
 }

 // textureTriple holds the type settings for
@@ -846,7 +847,7 @@ func (b *Backend) Clear(colR, colG, colB, colA float32) {
 	b.funcs.Clear(gl.COLOR_BUFFER_BIT)
 }

-func (b *Backend) NewInputLayout(vs driver.ShaderSources, layout []driver.InputDesc) (driver.InputLayout, error) {
+func (b *Backend) NewInputLayout(vs shader.Sources, layout []shader.InputDesc) (driver.InputLayout, error) {
 	if len(vs.Inputs) != len(layout) {
 		return nil, fmt.Errorf("NewInputLayout: got %d inputs, expected %d", len(layout), len(vs.Inputs))
 	}
@@ -861,7 +862,7 @@ func (b *Backend) NewInputLayout(vs driver.ShaderSources, layout []driver.InputD
 	}, nil
 }

-func (b *Backend) NewComputeProgram(src driver.ShaderSources) (driver.Program, error) {
+func (b *Backend) NewComputeProgram(src shader.Sources) (driver.Program, error) {
 	p, err := gl.CreateComputeProgram(b.funcs, src.GLSL310ES)
 	if err != nil {
 		return nil, fmt.Errorf("%s: %v", src.Name, err)
@@ -873,7 +874,7 @@ func (b *Backend) NewComputeProgram(src driver.ShaderSources) (driver.Program, e
 	return gpuProg, nil
 }

-func (b *Backend) NewProgram(vertShader, fragShader driver.ShaderSources) (driver.Program, error) {
+func (b *Backend) NewProgram(vertShader, fragShader shader.Sources) (driver.Program, error) {
 	attr := make([]string, len(vertShader.Inputs))
 	for _, inp := range vertShader.Inputs {
 		attr[inp.Location] = inp.Name
@@ -937,7 +938,7 @@ func (b *Backend) NewProgram(vertShader, fragShader driver.ShaderSources) (drive
 	return gpuProg, nil
 }

-func lookupUniform(funcs *gl.Functions, p gl.Program, loc driver.UniformLocation) uniformLocation {
+func lookupUniform(funcs *gl.Functions, p gl.Program, loc shader.UniformLocation) uniformLocation {
 	u := funcs.GetUniformLocation(p, loc.Name)
 	if !u.Valid() {
 		panic(fmt.Errorf("uniform %q not found", loc.Name))
@@ -985,7 +986,7 @@ func (p *gpuProgram) Release() {
 	p.backend.glstate.deleteProgram(p.backend.funcs, p.obj)
 }

-func (u *uniformsTracker) setup(funcs *gl.Functions, p gl.Program, uniformSize int, uniforms []driver.UniformLocation) {
+func (u *uniformsTracker) setup(funcs *gl.Functions, p gl.Program, uniformSize int, uniforms []shader.UniformLocation) {
 	u.locs = make([]uniformLocation, len(uniforms))
 	for i, uniform := range uniforms {
 		u.locs[i] = lookupUniform(funcs, p, uniform)
@@ -1016,19 +1017,19 @@ func (p *uniformsTracker) update(funcs *gl.Functions) {
 	for _, u := range p.locs {
 		data := data[u.offset:]
 		switch {
-		case u.typ == driver.DataTypeFloat && u.size == 1:
+		case u.typ == shader.DataTypeFloat && u.size == 1:
 			data := data[:4]
 			v := *(*[1]float32)(unsafe.Pointer(&data[0]))
 			funcs.Uniform1f(u.uniform, v[0])
-		case u.typ == driver.DataTypeFloat && u.size == 2:
+		case u.typ == shader.DataTypeFloat && u.size == 2:
 			data := data[:8]
 			v := *(*[2]float32)(unsafe.Pointer(&data[0]))
 			funcs.Uniform2f(u.uniform, v[0], v[1])
-		case u.typ == driver.DataTypeFloat && u.size == 3:
+		case u.typ == shader.DataTypeFloat && u.size == 3:
 			data := data[:12]
 			v := *(*[3]float32)(unsafe.Pointer(&data[0]))
 			funcs.Uniform3f(u.uniform, v[0], v[1], v[2])
-		case u.typ == driver.DataTypeFloat && u.size == 4:
+		case u.typ == shader.DataTypeFloat && u.size == 4:
 			data := data[:16]
 			v := *(*[4]float32)(unsafe.Pointer(&data[0]))
 			funcs.Uniform4f(u.uniform, v[0], v[1], v[2], v[3])
@@ -1108,9 +1109,9 @@ func (b *Backend) setupVertexArrays() {
 		l := layout.layout[i]
 		var gltyp gl.Enum
 		switch l.Type {
-		case driver.DataTypeFloat:
+		case shader.DataTypeFloat:
 			gltyp = gl.FLOAT
-		case driver.DataTypeShort:
+		case shader.DataTypeShort:
 			gltyp = gl.SHORT
 		default:
 			panic("unsupported data type")
@@ -15,6 +15,8 @@ import (
 	"gioui.org/gpu/internal/driver"
 	"gioui.org/internal/byteslice"
 	"gioui.org/internal/f32color"
+	"gioui.org/shader"
+	"gioui.org/shader/gio"
 )

 type pather struct {
@@ -161,7 +163,7 @@ func newCoverer(ctx driver.Device) *coverer {
 	c.colUniforms = new(coverColUniforms)
 	c.texUniforms = new(coverTexUniforms)
 	c.linearGradientUniforms = new(coverLinearGradientUniforms)
-	prog, layout, err := createColorPrograms(ctx, shader_cover_vert, shader_cover_frag,
+	prog, layout, err := createColorPrograms(ctx, gio.Shader_cover_vert, gio.Shader_cover_frag,
 		[3]interface{}{&c.colUniforms.vert, &c.linearGradientUniforms.vert, &c.texUniforms.vert},
 		[3]interface{}{&c.colUniforms.frag, &c.linearGradientUniforms.frag, nil},
 	)
@@ -189,19 +191,19 @@ func newStenciler(ctx driver.Device) *stenciler {
 	if err != nil {
 		panic(err)
 	}
-	progLayout, err := ctx.NewInputLayout(shader_stencil_vert, []driver.InputDesc{
-		{Type: driver.DataTypeFloat, Size: 1, Offset: int(unsafe.Offsetof((*(*vertex)(nil)).Corner))},
-		{Type: driver.DataTypeFloat, Size: 1, Offset: int(unsafe.Offsetof((*(*vertex)(nil)).MaxY))},
-		{Type: driver.DataTypeFloat, Size: 2, Offset: int(unsafe.Offsetof((*(*vertex)(nil)).FromX))},
-		{Type: driver.DataTypeFloat, Size: 2, Offset: int(unsafe.Offsetof((*(*vertex)(nil)).CtrlX))},
-		{Type: driver.DataTypeFloat, Size: 2, Offset: int(unsafe.Offsetof((*(*vertex)(nil)).ToX))},
+	progLayout, err := ctx.NewInputLayout(gio.Shader_stencil_vert, []shader.InputDesc{
+		{Type: shader.DataTypeFloat, Size: 1, Offset: int(unsafe.Offsetof((*(*vertex)(nil)).Corner))},
+		{Type: shader.DataTypeFloat, Size: 1, Offset: int(unsafe.Offsetof((*(*vertex)(nil)).MaxY))},
+		{Type: shader.DataTypeFloat, Size: 2, Offset: int(unsafe.Offsetof((*(*vertex)(nil)).FromX))},
+		{Type: shader.DataTypeFloat, Size: 2, Offset: int(unsafe.Offsetof((*(*vertex)(nil)).CtrlX))},
+		{Type: shader.DataTypeFloat, Size: 2, Offset: int(unsafe.Offsetof((*(*vertex)(nil)).ToX))},
 	})
 	if err != nil {
 		panic(err)
 	}
-	iprogLayout, err := ctx.NewInputLayout(shader_intersect_vert, []driver.InputDesc{
-		{Type: driver.DataTypeFloat, Size: 2, Offset: 0},
-		{Type: driver.DataTypeFloat, Size: 2, Offset: 4 * 2},
+	iprogLayout, err := ctx.NewInputLayout(gio.Shader_intersect_vert, []shader.InputDesc{
+		{Type: shader.DataTypeFloat, Size: 2, Offset: 0},
+		{Type: shader.DataTypeFloat, Size: 2, Offset: 4 * 2},
 	})
 	if err != nil {
 		panic(err)
@@ -210,7 +212,7 @@ func newStenciler(ctx driver.Device) *stenciler {
 		ctx:      ctx,
 		indexBuf: indexBuf,
 	}
-	prog, err := ctx.NewProgram(shader_stencil_vert, shader_stencil_frag)
+	prog, err := ctx.NewProgram(gio.Shader_stencil_vert, gio.Shader_stencil_frag)
 	if err != nil {
 		panic(err)
 	}
@@ -218,7 +220,7 @@ func newStenciler(ctx driver.Device) *stenciler {
 	vertUniforms := newUniformBuffer(ctx, &st.prog.uniforms.vert)
 	st.prog.prog = newProgram(prog, vertUniforms, nil)
 	st.prog.layout = progLayout
-	iprog, err := ctx.NewProgram(shader_intersect_vert, shader_intersect_frag)
+	iprog, err := ctx.NewProgram(gio.Shader_intersect_vert, gio.Shader_intersect_frag)
 	if err != nil {
 		panic(err)
 	}
@@ -1,225 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
-
-// Code auto-generated by piet-gpu-derive
-
-struct AnnoImageRef {
-    uint offset;
-};
-
-struct AnnoColorRef {
-    uint offset;
-};
-
-struct AnnoBeginClipRef {
-    uint offset;
-};
-
-struct AnnoEndClipRef {
-    uint offset;
-};
-
-struct AnnotatedRef {
-    uint offset;
-};
-
-struct AnnoImage {
-    vec4 bbox;
-    float linewidth;
-    uint index;
-    ivec2 offset;
-};
-
-#define AnnoImage_size 28
-
-AnnoImageRef AnnoImage_index(AnnoImageRef ref, uint index) {
-    return AnnoImageRef(ref.offset + index * AnnoImage_size);
-}
-
-struct AnnoColor {
-    vec4 bbox;
-    float linewidth;
-    uint rgba_color;
-};
-
-#define AnnoColor_size 24
-
-AnnoColorRef AnnoColor_index(AnnoColorRef ref, uint index) {
-    return AnnoColorRef(ref.offset + index * AnnoColor_size);
-}
-
-struct AnnoBeginClip {
-    vec4 bbox;
-    float linewidth;
-};
-
-#define AnnoBeginClip_size 20
-
-AnnoBeginClipRef AnnoBeginClip_index(AnnoBeginClipRef ref, uint index) {
-    return AnnoBeginClipRef(ref.offset + index * AnnoBeginClip_size);
-}
-
-struct AnnoEndClip {
-    vec4 bbox;
-};
-
-#define AnnoEndClip_size 16
-
-AnnoEndClipRef AnnoEndClip_index(AnnoEndClipRef ref, uint index) {
-    return AnnoEndClipRef(ref.offset + index * AnnoEndClip_size);
-}
-
-#define Annotated_Nop 0
-#define Annotated_Color 1
-#define Annotated_Image 2
-#define Annotated_BeginClip 3
-#define Annotated_EndClip 4
-#define Annotated_size 32
-
-AnnotatedRef Annotated_index(AnnotatedRef ref, uint index) {
-    return AnnotatedRef(ref.offset + index * Annotated_size);
-}
-
-struct AnnotatedTag {
-   uint tag;
-   uint flags;
-};
-
-AnnoImage AnnoImage_read(Alloc a, AnnoImageRef ref) {
-    uint ix = ref.offset >> 2;
-    uint raw0 = read_mem(a, ix + 0);
-    uint raw1 = read_mem(a, ix + 1);
-    uint raw2 = read_mem(a, ix + 2);
-    uint raw3 = read_mem(a, ix + 3);
-    uint raw4 = read_mem(a, ix + 4);
-    uint raw5 = read_mem(a, ix + 5);
-    uint raw6 = read_mem(a, ix + 6);
-    AnnoImage s;
-    s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
-    s.linewidth = uintBitsToFloat(raw4);
-    s.index = raw5;
-    s.offset = ivec2(int(raw6 << 16) >> 16, int(raw6) >> 16);
-    return s;
-}
-
-void AnnoImage_write(Alloc a, AnnoImageRef ref, AnnoImage s) {
-    uint ix = ref.offset >> 2;
-    write_mem(a, ix + 0, floatBitsToUint(s.bbox.x));
-    write_mem(a, ix + 1, floatBitsToUint(s.bbox.y));
-    write_mem(a, ix + 2, floatBitsToUint(s.bbox.z));
-    write_mem(a, ix + 3, floatBitsToUint(s.bbox.w));
-    write_mem(a, ix + 4, floatBitsToUint(s.linewidth));
-    write_mem(a, ix + 5, s.index);
-    write_mem(a, ix + 6, (uint(s.offset.x) & 0xffff) | (uint(s.offset.y) << 16));
-}
-
-AnnoColor AnnoColor_read(Alloc a, AnnoColorRef ref) {
-    uint ix = ref.offset >> 2;
-    uint raw0 = read_mem(a, ix + 0);
-    uint raw1 = read_mem(a, ix + 1);
-    uint raw2 = read_mem(a, ix + 2);
-    uint raw3 = read_mem(a, ix + 3);
-    uint raw4 = read_mem(a, ix + 4);
-    uint raw5 = read_mem(a, ix + 5);
-    AnnoColor s;
-    s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
-    s.linewidth = uintBitsToFloat(raw4);
-    s.rgba_color = raw5;
-    return s;
-}
-
-void AnnoColor_write(Alloc a, AnnoColorRef ref, AnnoColor s) {
-    uint ix = ref.offset >> 2;
-    write_mem(a, ix + 0, floatBitsToUint(s.bbox.x));
-    write_mem(a, ix + 1, floatBitsToUint(s.bbox.y));
-    write_mem(a, ix + 2, floatBitsToUint(s.bbox.z));
-    write_mem(a, ix + 3, floatBitsToUint(s.bbox.w));
-    write_mem(a, ix + 4, floatBitsToUint(s.linewidth));
-    write_mem(a, ix + 5, s.rgba_color);
-}
-
-AnnoBeginClip AnnoBeginClip_read(Alloc a, AnnoBeginClipRef ref) {
-    uint ix = ref.offset >> 2;
-    uint raw0 = read_mem(a, ix + 0);
-    uint raw1 = read_mem(a, ix + 1);
-    uint raw2 = read_mem(a, ix + 2);
-    uint raw3 = read_mem(a, ix + 3);
-    uint raw4 = read_mem(a, ix + 4);
-    AnnoBeginClip s;
-    s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
-    s.linewidth = uintBitsToFloat(raw4);
-    return s;
-}
-
-void AnnoBeginClip_write(Alloc a, AnnoBeginClipRef ref, AnnoBeginClip s) {
-    uint ix = ref.offset >> 2;
-    write_mem(a, ix + 0, floatBitsToUint(s.bbox.x));
-    write_mem(a, ix + 1, floatBitsToUint(s.bbox.y));
-    write_mem(a, ix + 2, floatBitsToUint(s.bbox.z));
-    write_mem(a, ix + 3, floatBitsToUint(s.bbox.w));
-    write_mem(a, ix + 4, floatBitsToUint(s.linewidth));
-}
-
-AnnoEndClip AnnoEndClip_read(Alloc a, AnnoEndClipRef ref) {
-    uint ix = ref.offset >> 2;
-    uint raw0 = read_mem(a, ix + 0);
-    uint raw1 = read_mem(a, ix + 1);
-    uint raw2 = read_mem(a, ix + 2);
-    uint raw3 = read_mem(a, ix + 3);
-    AnnoEndClip s;
-    s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
-    return s;
-}
-
-void AnnoEndClip_write(Alloc a, AnnoEndClipRef ref, AnnoEndClip s) {
-    uint ix = ref.offset >> 2;
-    write_mem(a, ix + 0, floatBitsToUint(s.bbox.x));
-    write_mem(a, ix + 1, floatBitsToUint(s.bbox.y));
-    write_mem(a, ix + 2, floatBitsToUint(s.bbox.z));
-    write_mem(a, ix + 3, floatBitsToUint(s.bbox.w));
-}
-
-AnnotatedTag Annotated_tag(Alloc a, AnnotatedRef ref) {
-    uint tag_and_flags = read_mem(a, ref.offset >> 2);
-    return AnnotatedTag(tag_and_flags & 0xffff, tag_and_flags >> 16);
-}
-
-AnnoColor Annotated_Color_read(Alloc a, AnnotatedRef ref) {
-    return AnnoColor_read(a, AnnoColorRef(ref.offset + 4));
-}
-
-AnnoImage Annotated_Image_read(Alloc a, AnnotatedRef ref) {
-    return AnnoImage_read(a, AnnoImageRef(ref.offset + 4));
-}
-
-AnnoBeginClip Annotated_BeginClip_read(Alloc a, AnnotatedRef ref) {
-    return AnnoBeginClip_read(a, AnnoBeginClipRef(ref.offset + 4));
-}
-
-AnnoEndClip Annotated_EndClip_read(Alloc a, AnnotatedRef ref) {
-    return AnnoEndClip_read(a, AnnoEndClipRef(ref.offset + 4));
-}
-
-void Annotated_Nop_write(Alloc a, AnnotatedRef ref) {
-    write_mem(a, ref.offset >> 2, Annotated_Nop);
-}
-
-void Annotated_Color_write(Alloc a, AnnotatedRef ref, uint flags, AnnoColor s) {
-    write_mem(a, ref.offset >> 2, (flags << 16) | Annotated_Color);
-    AnnoColor_write(a, AnnoColorRef(ref.offset + 4), s);
-}
-
-void Annotated_Image_write(Alloc a, AnnotatedRef ref, uint flags, AnnoImage s) {
-    write_mem(a, ref.offset >> 2, (flags << 16) | Annotated_Image);
-    AnnoImage_write(a, AnnoImageRef(ref.offset + 4), s);
-}
-
-void Annotated_BeginClip_write(Alloc a, AnnotatedRef ref, uint flags, AnnoBeginClip s) {
-    write_mem(a, ref.offset >> 2, (flags << 16) | Annotated_BeginClip);
-    AnnoBeginClip_write(a, AnnoBeginClipRef(ref.offset + 4), s);
-}
-
-void Annotated_EndClip_write(Alloc a, AnnotatedRef ref, AnnoEndClip s) {
-    write_mem(a, ref.offset >> 2, Annotated_EndClip);
-    AnnoEndClip_write(a, AnnoEndClipRef(ref.offset + 4), s);
-}
-
@@ -1,109 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
-
-// Propagation of tile backdrop for filling.
-//
-// Each thread reads one path element and calculates the number of spanned tiles
-// based on the bounding box.
-// In a further compaction step, the workgroup loops over the corresponding tile rows per element in parallel.
-// For each row the per tile backdrop will be read, as calculated in the previous coarse path segment kernel,
-// and propagated from the left to the right (prefix summed).
-//
-// Output state:
-//  - Each path element has an array of tiles covering the whole path based on its bounding box
-//  - Each tile per path element contains the 'backdrop' and a list of subdivided path segments
-
-#version 450
-#extension GL_GOOGLE_include_directive : enable
-
-#include "mem.h"
-#include "setup.h"
-
-#define LG_BACKDROP_WG (7 + LG_WG_FACTOR)
-#define BACKDROP_WG (1 << LG_BACKDROP_WG)
-
-layout(local_size_x = BACKDROP_WG, local_size_y = 1) in;
-
-layout(set = 0, binding = 1) readonly buffer ConfigBuf {
-    Config conf;
-};
-
-#include "annotated.h"
-#include "tile.h"
-
-shared uint sh_row_count[BACKDROP_WG];
-shared Alloc sh_row_alloc[BACKDROP_WG];
-shared uint sh_row_width[BACKDROP_WG];
-
-void main() { // Prefix-sums the per-tile backdrop along every tile row of each nonzero-fill path element.
-    uint th_ix = gl_LocalInvocationID.x;
-    uint element_ix = gl_GlobalInvocationID.x;
-    AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
-
-    // Work assignment: 1 thread : 1 path element
-    uint row_count = 0;
-    bool mem_ok = mem_error == NO_ERROR;
-    if (element_ix < conf.n_elements) {
-        AnnotatedTag tag = Annotated_tag(conf.anno_alloc, ref);
-        switch (tag.tag) {
-        case Annotated_Image:
-        case Annotated_BeginClip:
-        case Annotated_Color:
-            if (fill_mode_from_flags(tag.flags) != MODE_NONZERO) {
-                break;
-            }
-            // Nonzero fill: continue with the row setup below (there is no further case label).
-            PathRef path_ref = PathRef(conf.tile_alloc.offset + element_ix * Path_size);
-            Path path = Path_read(conf.tile_alloc, path_ref);
-            sh_row_width[th_ix] = path.bbox.z - path.bbox.x; // bbox width, used as the tile count per row
-            row_count = path.bbox.w - path.bbox.y; // bbox height, used as the number of tile rows
-            // Paths that don't cross tile top edges don't have backdrops.
-            // Don't apply the optimization to paths that may cross the y = 0
-            // top edge, but clipped to 1 row.
-            if (row_count == 1 && path.bbox.y > 0) {
-                // Note: this can probably be expanded to width = 2 as
-                // long as it doesn't cross the left edge.
-                row_count = 0;
-            }
-            Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok);
-            sh_row_alloc[th_ix] = path_alloc;
-        }
-    }
-
-    sh_row_count[th_ix] = row_count;
-    // Prefix sum of sh_row_count
-    for (uint i = 0; i < LG_BACKDROP_WG; i++) {
-        barrier();
-        if (th_ix >= (1 << i)) {
-            row_count += sh_row_count[th_ix - (1 << i)];
-        }
-        barrier();
-        sh_row_count[th_ix] = row_count;
-    }
-    barrier();
-    // Work assignment: 1 thread : 1 path element row
-    uint total_rows = sh_row_count[BACKDROP_WG - 1]; // inclusive prefix sum: the last entry is the workgroup total
-    for (uint row = th_ix; row < total_rows; row += BACKDROP_WG) {
-        // Binary search to find element
-        uint el_ix = 0;
-        for (uint i = 0; i < LG_BACKDROP_WG; i++) {
-            uint probe = el_ix + ((BACKDROP_WG / 2) >> i);
-            if (row >= sh_row_count[probe - 1]) {
-                el_ix = probe;
-            }
-        }
-        uint width = sh_row_width[el_ix];
-        if (width > 0 && mem_ok) {
-            // Process one row sequentially
-            // Read backdrop value per tile and prefix sum it
-            Alloc tiles_alloc = sh_row_alloc[el_ix];
-            uint seq_ix = row - (el_ix > 0 ? sh_row_count[el_ix - 1] : 0); // row index within the element
-            uint tile_el_ix = (tiles_alloc.offset >> 2) + 1 + seq_ix * 2 * width; // word index; tiles are 2 words apart, +1 presumably selects the backdrop word
-            uint sum = read_mem(tiles_alloc, tile_el_ix);
-            for (uint x = 1; x < width; x++) { // the first tile keeps its value; later tiles get the running sum
-                tile_el_ix += 2;
-                sum += read_mem(tiles_alloc, tile_el_ix);
-                write_mem(tiles_alloc, tile_el_ix, sum);
-            }
-        }
-    }
-}
@@ -1,147 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
-
-// The binning stage of the pipeline.
-//
-// Each workgroup processes N_TILE paths.
-// Each thread processes one path and calculates a N_TILE_X x N_TILE_Y coverage mask
-// based on the path bounding box to bin the paths.
-
-#version 450
-#extension GL_GOOGLE_include_directive : enable
-
-#include "mem.h"
-#include "setup.h"
-
-layout(local_size_x = N_TILE, local_size_y = 1) in;
-
-layout(set = 0, binding = 1) readonly buffer ConfigBuf {
-    Config conf;
-};
-
-#include "annotated.h"
-#include "bins.h"
-
-// scale factors useful for converting coordinates to bins
-#define SX (1.0 / float(N_TILE_X * TILE_WIDTH_PX))
-#define SY (1.0 / float(N_TILE_Y * TILE_HEIGHT_PX))
-
-// Constant not available in GLSL. Also consider uintBitsToFloat(0x7f800000)
-#define INFINITY (1.0 / 0.0)
-
-// Note: cudaraster has N_TILE + 1 to cut down on bank conflicts.
-// Bitmaps are sliced (256bit into 8 (N_SLICE) 32bit submaps)
-shared uint bitmaps[N_SLICE][N_TILE];
-shared uint count[N_SLICE][N_TILE];
-shared Alloc sh_chunk_alloc[N_TILE];
-shared bool sh_alloc_failed;
-
-void main() { // Bins N_TILE elements per workgroup by bounding box and writes one instance list per bin.
-    uint my_n_elements = conf.n_elements;
-    uint my_partition = gl_WorkGroupID.x;
-
-    for (uint i = 0; i < N_SLICE; i++) {
-        bitmaps[i][gl_LocalInvocationID.x] = 0;
-    }
-    if (gl_LocalInvocationID.x == 0) {
-        sh_alloc_failed = false;
-    }
-    barrier();
-
-    // Read inputs and determine coverage of bins
-    uint element_ix = my_partition * N_TILE + gl_LocalInvocationID.x;
-    AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
-    uint tag = Annotated_Nop;
-    if (element_ix < my_n_elements) {
-        tag = Annotated_tag(conf.anno_alloc, ref).tag;
-    }
-    int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
-    switch (tag) {
-    case Annotated_Color:
-    case Annotated_Image:
-    case Annotated_BeginClip:
-    case Annotated_EndClip:
-        // Note: we take advantage of the fact that these drawing elements
-        // have the bbox at the same place in their layout.
-        AnnoEndClip clip = Annotated_EndClip_read(conf.anno_alloc, ref);
-        x0 = int(floor(clip.bbox.x * SX));
-        y0 = int(floor(clip.bbox.y * SY));
-        x1 = int(ceil(clip.bbox.z * SX));
-        y1 = int(ceil(clip.bbox.w * SY));
-        break;
-    }
-
-    // At this point, we run an iterator over the coverage area,
-    // trying to keep divergence low.
-    // Right now, it's just a bbox, but we'll get finer with
-    // segments.
-    uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1)/N_TILE_X;
-    uint height_in_bins = (conf.height_in_tiles + N_TILE_Y - 1)/N_TILE_Y;
-    x0 = clamp(x0, 0, int(width_in_bins));
-    x1 = clamp(x1, x0, int(width_in_bins));
-    y0 = clamp(y0, 0, int(height_in_bins));
-    y1 = clamp(y1, y0, int(height_in_bins));
-    if (x0 == x1) y1 = y0; // empty in x: make the range empty in y too, so the walks below never start
-    int x = x0, y = y0;
-    uint my_slice = gl_LocalInvocationID.x / 32; // 32-bit bitmap word holding this element's bit
-    uint my_mask = 1 << (gl_LocalInvocationID.x & 31); // this element's bit within that word
-    while (y < y1) {
-        atomicOr(bitmaps[my_slice][y * width_in_bins + x], my_mask);
-        x++;
-        if (x == x1) {
-            x = x0;
-            y++;
-        }
-    }
-
-    barrier();
-    // Allocate output segments.
-    uint element_count = 0;
-    for (uint i = 0; i < N_SLICE; i++) { // from here on gl_LocalInvocationID.x indexes a bin, not an element
-        element_count += bitCount(bitmaps[i][gl_LocalInvocationID.x]);
-        count[i][gl_LocalInvocationID.x] = element_count; // inclusive running count over the slices
-    }
-    // element_count is number of elements covering bin for this invocation.
-    Alloc chunk_alloc = new_alloc(0, 0, true);
-    if (element_count != 0) {
-        // TODO: aggregate atomic adds (subgroup is probably fastest)
-        MallocResult chunk = malloc(element_count * BinInstance_size);
-        chunk_alloc = chunk.alloc;
-        sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc;
-        if (chunk.failed) {
-            sh_alloc_failed = true;
-        }
-    }
-    // Note: it might be more efficient for reading to do this in the
-    // other order (each bin is a contiguous sequence of partitions)
-    uint out_ix = (conf.bin_alloc.offset >> 2) + (my_partition * N_TILE + gl_LocalInvocationID.x) * 2; // two words per (partition, bin): count, then chunk offset
-    write_mem(conf.bin_alloc, out_ix, element_count);
-    write_mem(conf.bin_alloc, out_ix + 1, chunk_alloc.offset);
-
-    barrier();
-    if (sh_alloc_failed || mem_error != NO_ERROR) {
-        return;
-    }
-
-    // Use similar strategy as Laine & Karras paper; loop over bbox of bins
-    // touched by this element
-    x = x0;
-    y = y0;
-    while (y < y1) {
-        uint bin_ix = y * width_in_bins + x;
-        uint out_mask = bitmaps[my_slice][bin_ix];
-        if ((out_mask & my_mask) != 0) {
-            uint idx = bitCount(out_mask & (my_mask - 1)); // elements with lower bits in this slice that also cover the bin
-            if (my_slice > 0) {
-                idx += count[my_slice - 1][bin_ix]; // plus all elements from the preceding slices
-            }
-            Alloc out_alloc = sh_chunk_alloc[bin_ix];
-            uint out_offset = out_alloc.offset + idx * BinInstance_size;
-            BinInstance_write(out_alloc, BinInstanceRef(out_offset), BinInstance(element_ix));
-        }
-        x++;
-        if (x == x1) {
-            x = x0;
-            y++;
-        }
-    }
-}
@@ -1,31 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
-
-// Code auto-generated by piet-gpu-derive
-
-struct BinInstanceRef {
-    uint offset;
-};
-
-struct BinInstance {
-    uint element_ix;
-};
-
-#define BinInstance_size 4
-
-BinInstanceRef BinInstance_index(BinInstanceRef ref, uint index) {
-    // Step forward by `index` instances of BinInstance_size bytes each.
-    uint stride = index * BinInstance_size;
-    return BinInstanceRef(ref.offset + stride);
-}
-
-BinInstance BinInstance_read(Alloc a, BinInstanceRef ref) {
-    // A BinInstance is a single word holding the element index.
-    uint word = ref.offset >> 2;
-    BinInstance inst;
-    inst.element_ix = read_mem(a, word);
-    return inst;
-}
-
-void BinInstance_write(Alloc a, BinInstanceRef ref, BinInstance s) {
-    // Store the element index in the instance's single word.
-    uint word = ref.offset >> 2;
-    write_mem(a, word, s.element_ix);
-}
-
@@ -1,15 +0,0 @@
-#version 310 es
-
-// SPDX-License-Identifier: Unlicense OR MIT
-
-precision mediump float;
-
-layout(location=0) in vec2 vUV;
-
-{{.Header}}
-
-layout(location = 0) out vec4 fragColor;
-
-void main() { // Outputs the color produced by the templated fetch expression (placeholder presumably substituted before compilation).
-	fragColor = {{.FetchColorExpr}};
-}
@@ -1,28 +0,0 @@
-#version 310 es
-
-// SPDX-License-Identifier: Unlicense OR MIT
-
-#extension GL_GOOGLE_include_directive : enable
-
-precision highp float;
-
-#include "common.h"
-
-layout(binding = 0) uniform Block {
-	vec4 transform;
-	vec4 uvTransformR1;
-	vec4 uvTransformR2;
-	float z;
-} _block;
-
-layout(location = 0) in vec2 pos;
-
-layout(location = 1) in vec2 uv;
-
-layout(location = 0) out vec2 vUV;
-
-void main() {
-	// Texture coordinate: apply the 3x2 UV transform supplied as two rows.
-	m3x2 uvTransform = m3x2(_block.uvTransformR1.xyz, _block.uvTransformR2.xyz);
-	vUV = transform3x2(uvTransform, vec3(uv,1)).xy;
-	// Position: scale by transform.xy, offset by transform.zw, then
-	// convert to the native clip space.
-	vec2 xy = pos*_block.transform.xy + _block.transform.zw;
-	gl_Position = toClipSpace(vec4(xy, _block.z, 1));
-}
@@ -1,426 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
-
-// The coarse rasterizer stage of the pipeline.
-//
-// As input we have the ordered partitions of paths from the binning phase and
-// the annotated tile list of segments and backdrop per path.
-//
-// Each workgroup operates on one bin by stream compacting
-// the elements corresponding to the bin.
-//
-// As output we have an ordered command stream per tile. Every tile from a path (backdrop + segment list) will be encoded.
-
-#version 450
-#extension GL_GOOGLE_include_directive : enable
-
-#include "mem.h"
-#include "setup.h"
-
-layout(local_size_x = N_TILE, local_size_y = 1) in;
-
-layout(set = 0, binding = 1) readonly buffer ConfigBuf {
-    Config conf;
-};
-
-#include "annotated.h"
-#include "bins.h"
-#include "tile.h"
-#include "ptcl.h"
-
-#define LG_N_PART_READ (7 + LG_WG_FACTOR)
-#define N_PART_READ (1 << LG_N_PART_READ)
-
-shared uint sh_elements[N_TILE];
-
-// Number of elements in the partition; prefix sum.
-shared uint sh_part_count[N_PART_READ];
-shared Alloc sh_part_elements[N_PART_READ];
-
-shared uint sh_bitmaps[N_SLICE][N_TILE];
-
-shared uint sh_tile_count[N_TILE];
-// The width of the tile rect for the element, intersected with this bin
-shared uint sh_tile_width[N_TILE];
-shared uint sh_tile_x0[N_TILE];
-shared uint sh_tile_y0[N_TILE];
-
-// These are set up so base + tile_y * stride + tile_x points to a Tile.
-shared uint sh_tile_base[N_TILE];
-shared uint sh_tile_stride[N_TILE];
-
-#ifdef MEM_DEBUG
-// Store allocs only when MEM_DEBUG to save shared memory traffic.
-shared Alloc sh_tile_alloc[N_TILE];
-
-void write_tile_alloc(uint el_ix, Alloc a) { // records the tile allocation of element slot el_ix
-    sh_tile_alloc[el_ix] = a;
-}
-
-Alloc read_tile_alloc(uint el_ix, bool mem_ok) { // mem_ok is unused in this variant
-    return sh_tile_alloc[el_ix];
-}
-#else
-void write_tile_alloc(uint el_ix, Alloc a) {
-    // No-op: per-element allocations are not stored without MEM_DEBUG.
-}
-
-Alloc read_tile_alloc(uint el_ix, bool mem_ok) { // el_ix is unused in this variant
-    // All memory: an allocation starting at 0 and spanning memory.length()*4.
-    return new_alloc(0, memory.length()*4, mem_ok);
-}
-#endif
-
-// The maximum number of commands per annotated element.
-#define ANNO_COMMANDS 2
-
-// Perhaps cmd_alloc should be a global? This is a style question.
-bool alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit) {
-    // Ensures there is room at cmd_ref for the commands of one element.
-    // When the current chunk is exhausted, a new chunk is allocated, a jump
-    // to it is written at cmd_ref, and all three arguments are updated to
-    // describe the new chunk. Returns false only if that allocation fails.
-    if (cmd_ref.offset >= cmd_limit) {
-        MallocResult chunk = malloc(PTCL_INITIAL_ALLOC);
-        if (chunk.failed) {
-            return false;
-        }
-        Cmd_Jump_write(cmd_alloc, cmd_ref, CmdJump(chunk.alloc.offset));
-        cmd_alloc = chunk.alloc;
-        cmd_ref = CmdRef(cmd_alloc.offset);
-        // Keep headroom for the maximum number of commands and a potential jump.
-        uint headroom = (ANNO_COMMANDS + 1) * Cmd_size;
-        cmd_limit = cmd_alloc.offset + PTCL_INITIAL_ALLOC - headroom;
-    }
-    return true;
-}
-
-void write_fill(Alloc alloc, inout CmdRef cmd_ref, uint flags, Tile tile, float linewidth) {
-    // Writes the coverage command for one tile at cmd_ref and advances
-    // cmd_ref by 4 bytes plus the size of the command payload, if any.
-    bool is_stroke = fill_mode_from_flags(flags) != MODE_NONZERO;
-    if (is_stroke) {
-        // Strokes carry the segment list offset and half the line width.
-        CmdStroke stroke = CmdStroke(tile.tile.offset, 0.5 * linewidth);
-        Cmd_Stroke_write(alloc, cmd_ref, stroke);
-        cmd_ref.offset += 4 + CmdStroke_size;
-        return;
-    }
-    if (tile.tile.offset == 0) {
-        // No segment list for this tile: emit a payload-free solid command.
-        Cmd_Solid_write(alloc, cmd_ref);
-        cmd_ref.offset += 4;
-        return;
-    }
-    // Fills carry the segment list offset and the tile backdrop.
-    CmdFill fill = CmdFill(tile.tile.offset, tile.backdrop);
-    Cmd_Fill_write(alloc, cmd_ref, fill);
-    cmd_ref.offset += 4 + CmdFill_size;
-}
-
-void main() { // One workgroup per bin, one thread per tile: builds each tile's command list from the binned elements.
-    // Could use either linear or 2d layouts for both dispatch and
-    // invocations within the workgroup. We'll use variables to abstract.
-    uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1)/N_TILE_X;
-    uint bin_ix = width_in_bins * gl_WorkGroupID.y + gl_WorkGroupID.x;
-    uint partition_ix = 0;
-    uint n_partitions = (conf.n_elements + N_TILE - 1) / N_TILE;
-    uint th_ix = gl_LocalInvocationID.x;
-
-    // Coordinates of top left of bin, in tiles.
-    uint bin_tile_x = N_TILE_X * gl_WorkGroupID.x;
-    uint bin_tile_y = N_TILE_Y * gl_WorkGroupID.y;
-
-    // Per-tile state
-    uint tile_x = gl_LocalInvocationID.x % N_TILE_X;
-    uint tile_y = gl_LocalInvocationID.x / N_TILE_X;
-    uint this_tile_ix = (bin_tile_y + tile_y) * conf.width_in_tiles + bin_tile_x + tile_x;
-    Alloc cmd_alloc = slice_mem(conf.ptcl_alloc, this_tile_ix * PTCL_INITIAL_ALLOC, PTCL_INITIAL_ALLOC);
-    CmdRef cmd_ref = CmdRef(cmd_alloc.offset);
-    // Reserve space for the maximum number of commands and a potential jump.
-    uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - (ANNO_COMMANDS + 1) * Cmd_size;
-    // The nesting depth of the clip stack
-    uint clip_depth = 0;
-    // State for the "clip zero" optimization. If it's nonzero, then we are
-    // currently in a clip for which the entire tile has an alpha of zero, and
-    // the value is the depth after the "begin clip" of that element.
-    uint clip_zero_depth = 0;
-    // State for the "clip one" optimization. If bit `i` is set, then that means
-    // that the clip pushed at depth `i` has an alpha of all one.
-    uint clip_one_mask = 0;
-
-    // I'm sure we can figure out how to do this with at least one fewer register...
-    // Items up to rd_ix have been read from sh_elements
-    uint rd_ix = 0;
-    // Items up to wr_ix have been written into sh_elements
-    uint wr_ix = 0;
-    // Items between part_start_ix and ready_ix are ready to be transferred from sh_part_elements
-    uint part_start_ix = 0;
-    uint ready_ix = 0;
-
-    // Leave room for the fine rasterizer scratch allocation.
-    Alloc scratch_alloc = slice_mem(cmd_alloc, 0, Alloc_size);
-    cmd_ref.offset += Alloc_size;
-
-    uint num_begin_slots = 0; // high-water mark of begin_slot
-    uint begin_slot = 0; // BeginClip commands written and not yet closed by a written EndClip
-    bool mem_ok = mem_error == NO_ERROR;
-    while (true) {
-        for (uint i = 0; i < N_SLICE; i++) {
-            sh_bitmaps[i][th_ix] = 0;
-        }
-
-        // parallel read of input partitions
-        do {
-            if (ready_ix == wr_ix && partition_ix < n_partitions) {
-                part_start_ix = ready_ix;
-                uint count = 0;
-                if (th_ix < N_PART_READ && partition_ix + th_ix < n_partitions) {
-                    uint in_ix = (conf.bin_alloc.offset >> 2) + ((partition_ix + th_ix) * N_TILE + bin_ix) * 2;
-                    count = read_mem(conf.bin_alloc, in_ix);
-                    uint offset = read_mem(conf.bin_alloc, in_ix + 1);
-                    sh_part_elements[th_ix] = new_alloc(offset, count*BinInstance_size, mem_ok);
-                }
-                // prefix sum of counts
-                for (uint i = 0; i < LG_N_PART_READ; i++) {
-                    if (th_ix < N_PART_READ) {
-                        sh_part_count[th_ix] = count;
-                    }
-                    barrier();
-                    if (th_ix < N_PART_READ) {
-                        if (th_ix >= (1 << i)) {
-                            count += sh_part_count[th_ix - (1 << i)];
-                        }
-                    }
-                    barrier();
-                }
-                if (th_ix < N_PART_READ) {
-                    sh_part_count[th_ix] = part_start_ix + count;
-                }
-                barrier();
-                ready_ix = sh_part_count[N_PART_READ - 1];
-                partition_ix += N_PART_READ;
-            }
-            // use binary search to find element to read
-            uint ix = rd_ix + th_ix;
-            if (ix >= wr_ix && ix < ready_ix && mem_ok) {
-                uint part_ix = 0;
-                for (uint i = 0; i < LG_N_PART_READ; i++) {
-                    uint probe = part_ix + ((N_PART_READ / 2) >> i);
-                    if (ix >= sh_part_count[probe - 1]) {
-                        part_ix = probe;
-                    }
-                }
-                ix -= part_ix > 0 ? sh_part_count[part_ix - 1] : part_start_ix; // index within the selected partition
-                Alloc bin_alloc = sh_part_elements[part_ix];
-                BinInstanceRef inst_ref = BinInstanceRef(bin_alloc.offset);
-                BinInstance inst = BinInstance_read(bin_alloc, BinInstance_index(inst_ref, ix));
-                sh_elements[th_ix] = inst.element_ix;
-            }
-            barrier();
-
-            wr_ix = min(rd_ix + N_TILE, ready_ix);
-        } while (wr_ix - rd_ix < N_TILE && (wr_ix < ready_ix || partition_ix < n_partitions));
-
-        // We've done the merge and filled the buffer.
-
-        // Read one element, compute coverage.
-        uint tag = Annotated_Nop;
-        uint element_ix;
-        AnnotatedRef ref;
-        if (th_ix + rd_ix < wr_ix) {
-            element_ix = sh_elements[th_ix];
-            ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
-            tag = Annotated_tag(conf.anno_alloc, ref).tag;
-        }
-
-        // Bounding box of element in pixel coordinates.
-        uint tile_count;
-        switch (tag) {
-        case Annotated_Color:
-        case Annotated_Image:
-        case Annotated_BeginClip:
-        case Annotated_EndClip:
-            // We have one "path" for each element, even if the element isn't
-            // actually a path (currently EndClip, but images etc in the future).
-            uint path_ix = element_ix;
-            Path path = Path_read(conf.tile_alloc, PathRef(conf.tile_alloc.offset + path_ix * Path_size));
-            uint stride = path.bbox.z - path.bbox.x;
-            sh_tile_stride[th_ix] = stride;
-            int dx = int(path.bbox.x) - int(bin_tile_x);
-            int dy = int(path.bbox.y) - int(bin_tile_y);
-            int x0 = clamp(dx, 0, N_TILE_X);
-            int y0 = clamp(dy, 0, N_TILE_Y);
-            int x1 = clamp(int(path.bbox.z) - int(bin_tile_x), 0, N_TILE_X);
-            int y1 = clamp(int(path.bbox.w) - int(bin_tile_y), 0, N_TILE_Y);
-            sh_tile_width[th_ix] = uint(x1 - x0);
-            sh_tile_x0[th_ix] = x0;
-            sh_tile_y0[th_ix] = y0;
-            tile_count = uint(x1 - x0) * uint(y1 - y0);
-            // base relative to bin
-            uint base = path.tiles.offset - uint(dy * stride + dx) * Tile_size;
-            sh_tile_base[th_ix] = base;
-            Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok);
-            write_tile_alloc(th_ix, path_alloc);
-            break;
-        default:
-            tile_count = 0;
-            break;
-        }
-
-        // Prefix sum of sh_tile_count
-        sh_tile_count[th_ix] = tile_count;
-        for (uint i = 0; i < LG_N_TILE; i++) {
-            barrier();
-            if (th_ix >= (1 << i)) {
-                tile_count += sh_tile_count[th_ix - (1 << i)];
-            }
-            barrier();
-            sh_tile_count[th_ix] = tile_count;
-        }
-        barrier();
-        uint total_tile_count = sh_tile_count[N_TILE - 1];
-        for (uint ix = th_ix; ix < total_tile_count; ix += N_TILE) {
-            // Binary search to find element
-            uint el_ix = 0;
-            for (uint i = 0; i < LG_N_TILE; i++) {
-                uint probe = el_ix + ((N_TILE / 2) >> i);
-                if (ix >= sh_tile_count[probe - 1]) {
-                    el_ix = probe;
-                }
-            }
-            AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + sh_elements[el_ix] * Annotated_size);
-            uint tag = Annotated_tag(conf.anno_alloc, ref).tag;
-            uint seq_ix = ix - (el_ix > 0 ? sh_tile_count[el_ix - 1] : 0); // tile index within the element's rect
-            uint width = sh_tile_width[el_ix];
-            uint x = sh_tile_x0[el_ix] + seq_ix % width;
-            uint y = sh_tile_y0[el_ix] + seq_ix / width;
-            bool include_tile = false;
-            if (tag == Annotated_BeginClip || tag == Annotated_EndClip) {
-                include_tile = true;
-            } else if (mem_ok) {
-                Tile tile = Tile_read(read_tile_alloc(el_ix, mem_ok), TileRef(sh_tile_base[el_ix] + (sh_tile_stride[el_ix] * y + x) * Tile_size));
-                // Include the path in the tile if
-                // - the tile contains at least a segment (tile offset non-zero)
-                // - the tile is completely covered (backdrop non-zero)
-                include_tile = tile.tile.offset != 0 || tile.backdrop != 0;
-            }
-            if (include_tile) {
-                uint el_slice = el_ix / 32;
-                uint el_mask = 1 << (el_ix & 31);
-                atomicOr(sh_bitmaps[el_slice][y * N_TILE_X + x], el_mask);
-            }
-        }
-
-        barrier();
-
-        // Output non-segment elements for this tile. The thread does a sequential walk
-        // through the non-segment elements.
-        uint slice_ix = 0;
-        uint bitmap = sh_bitmaps[0][th_ix];
-        while (mem_ok) {
-            if (bitmap == 0) {
-                slice_ix++;
-                if (slice_ix == N_SLICE) {
-                    break;
-                }
-                bitmap = sh_bitmaps[slice_ix][th_ix];
-                if (bitmap == 0) {
-                    continue;
-                }
-            }
-            uint element_ref_ix = slice_ix * 32 + findLSB(bitmap); // sh_elements slot of the lowest set bit
-            uint element_ix = sh_elements[element_ref_ix];
-
-            // Clear LSB
-            bitmap &= bitmap - 1;
-
-            // At this point, we read the element again from global memory.
-            // If that turns out to be expensive, maybe we can pack it into
-            // shared memory (or perhaps just the tag).
-            ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
-            AnnotatedTag tag = Annotated_tag(conf.anno_alloc, ref);
-
-            if (clip_zero_depth == 0) {
-                switch (tag.tag) {
-                case Annotated_Color:
-                    Tile tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok), TileRef(sh_tile_base[element_ref_ix]
-                        + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
-                    AnnoColor fill = Annotated_Color_read(conf.anno_alloc, ref);
-                    if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
-                        break;
-                    }
-                    write_fill(cmd_alloc, cmd_ref, tag.flags, tile, fill.linewidth);
-                    Cmd_Color_write(cmd_alloc, cmd_ref, CmdColor(fill.rgba_color));
-                    cmd_ref.offset += 4 + CmdColor_size;
-                    break;
-                case Annotated_Image:
-                    tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok), TileRef(sh_tile_base[element_ref_ix]
-                        + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
-                    AnnoImage fill_img = Annotated_Image_read(conf.anno_alloc, ref);
-                    if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
-                        break;
-                    }
-                    write_fill(cmd_alloc, cmd_ref, tag.flags, tile, fill_img.linewidth);
-                    Cmd_Image_write(cmd_alloc, cmd_ref, CmdImage(fill_img.index, fill_img.offset));
-                    cmd_ref.offset += 4 + CmdImage_size;
-                    break;
-                case Annotated_BeginClip:
-                    tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok), TileRef(sh_tile_base[element_ref_ix]
-                        + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
-                    if (tile.tile.offset == 0 && tile.backdrop == 0) {
-                        clip_zero_depth = clip_depth + 1;
-                    } else if (tile.tile.offset == 0 && clip_depth < 32) { // no segments but nonzero backdrop: record as "clip one", emit nothing
-                        clip_one_mask |= (1 << clip_depth);
-                    } else {
-                        AnnoBeginClip begin_clip = Annotated_BeginClip_read(conf.anno_alloc, ref);
-                        if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
-                            break;
-                        }
-                        write_fill(cmd_alloc, cmd_ref, tag.flags, tile, begin_clip.linewidth);
-                        Cmd_BeginClip_write(cmd_alloc, cmd_ref);
-                        cmd_ref.offset += 4;
-                        if (clip_depth < 32) {
-                            clip_one_mask &= ~(1 << clip_depth);
-                        }
-                        begin_slot++;
-                        num_begin_slots = max(num_begin_slots, begin_slot);
-                    }
-                    clip_depth++;
-                    break;
-                case Annotated_EndClip:
-                    clip_depth--;
-                    if (clip_depth >= 32 || (clip_one_mask & (1 << clip_depth)) == 0) { // emit only when the matching BeginClip was not recorded as "clip one"
-                        if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
-                            break;
-                        }
-                        Cmd_Solid_write(cmd_alloc, cmd_ref);
-                        cmd_ref.offset += 4;
-                        begin_slot--;
-                        Cmd_EndClip_write(cmd_alloc, cmd_ref);
-                        cmd_ref.offset += 4;
-                    }
-                    break;
-                }
-            } else {
-                // In "clip zero" state, suppress all drawing
-                switch (tag.tag) {
-                case Annotated_BeginClip:
-                    clip_depth++;
-                    break;
-                case Annotated_EndClip:
-                    if (clip_depth == clip_zero_depth) {
-                        clip_zero_depth = 0;
-                    }
-                    clip_depth--;
-                    break;
-                }
-            }
-        }
-        barrier();
-
-        rd_ix += N_TILE;
-        if (rd_ix >= ready_ix && partition_ix >= n_partitions) break;
-    }
-    if (bin_tile_x + tile_x < conf.width_in_tiles && bin_tile_y + tile_y < conf.height_in_tiles) { // only tiles inside the tile grid get a terminator
-        Cmd_End_write(cmd_alloc, cmd_ref);
-        if (num_begin_slots > 0) {
-            // Write scratch allocation: one state per BeginClip per rasterizer chunk.
-            uint scratch_size = num_begin_slots * TILE_WIDTH_PX * TILE_HEIGHT_PX * CLIP_STATE_SIZE * 4;
-            MallocResult scratch = malloc(scratch_size);
-            // Ignore scratch.failed; we don't use the allocation and kernel4
-            // checks for memory overflow before using it.
-            alloc_write(scratch_alloc, scratch_alloc.offset, scratch.alloc);
-        }
-    }
-}
@@ -1,51 +0,0 @@
-// SPDX-License-Identifier: Unlicense OR MIT
-
-struct m3x2 { // The first two rows of a 3x3 matrix; users treat the last row as [0, 0, 1].
-	vec3 r0; // first row
-	vec3 r1; // second row
-};
-
-// fboTextureTransform is the transformation
-// that cancels the implied transformation between
-// the framebuffer and its texture.
-// Only two rows are returned. The last is implied
-// to be [0, 0, 1].
-const m3x2 fboTextureTransform = m3x2(
-#ifdef HLSL
-	vec3(1.0, 0.0, 0.0),
-	vec3(0.0, -1.0, 1.0)
-#else
-	vec3(1.0, 0.0, 0.0),
-	vec3(0.0, 1.0, 0.0)
-#endif
-);
-
-// fboTransform is the transformation
-// that cancels the implied transformation between
-// the clip space and the framebuffer.
-// Only two rows are returned. The last is implied
-// to be [0, 0, 1].
-const m3x2 fboTransform = m3x2(
-#ifdef HLSL
-	vec3(1.0, 0.0, 0.0),
-	vec3(0.0, 1.0, 0.0)
-#else
-	vec3(1.0, 0.0, 0.0),
-	vec3(0.0, -1.0, 0.0)
-#endif
-);
-
-// toClipSpace converts an OpenGL gl_Position value to a
-// native GPU position.
-vec4 toClipSpace(vec4 pos) {
-#ifdef HLSL
-	// Direct3D expects depths in [0; 1]: remap z to the midpoint of z and w
-	// and leave x, y and w untouched.
-	float depth = (pos.z + pos.w)*.5;
-	return vec4(pos.xy, depth, pos.w);
-#else
-	// Other targets take the OpenGL position unchanged.
-	return pos;
-#endif
-}
-
-vec3 transform3x2(m3x2 t, vec3 v) {
-	// Multiply v by the matrix whose first two rows are t.r0 and t.r1
-	// and whose third row is [0, 0, 1].
-	float x = dot(t.r0, v);
-	float y = dot(t.r1, v);
-	float z = dot(vec3(0.0, 0.0, 1.0), v);
-	return vec3(x, y, z);
-}
@@ -1,24 +0,0 @@
-#version 310 es
-
-// SPDX-License-Identifier: Unlicense OR MIT
-
-precision mediump float;
-
-layout(binding = 0) uniform sampler2D tex;
-
-layout(location = 0) in vec2 vUV;
-
-layout(location = 0) out vec4 fragColor;
-
-vec3 sRGBtoRGB(vec3 rgb) {
-	// Per channel: the linear segment below the 0.04045 threshold,
-	// the power curve at or above it.
-	vec3 linearPart = rgb/vec3(12.92);
-	vec3 curvePart = pow((rgb + vec3(0.055))/vec3(1.055), vec3(2.4));
-	bvec3 useCurve = greaterThanEqual(rgb, vec3(0.04045));
-	return mix(linearPart, curvePart, useCurve);
-}
-
-void main() {
-	// Sample the texture, convert the color channels with sRGBtoRGB and
-	// pass the alpha channel through unchanged.
-	vec4 color = texture(tex, vUV);
-	fragColor = vec4(sRGBtoRGB(color.rgb), color.a);
-}
@@ -1,21 +0,0 @@
-#version 310 es
-
-// SPDX-License-Identifier: Unlicense OR MIT
-
-precision highp float;
-
-layout(binding = 0) uniform Block {
-	vec2 scale;
-	vec2 pos;
-	vec2 uvScale;
-} _block;
-
-layout(location = 0) in vec2 pos;
-layout(location = 1) in vec2 uv;
-
-layout(location = 0) out vec2 vUV;
-
-void main() {
-	// Scale and offset the vertex position; z is 0 and w is 1.
-	vec2 xy = pos*_block.scale + _block.pos;
-	gl_Position = vec4(xy, 0, 1);
-	// Scale the texture coordinate.
-	vUV = uv*_block.uvScale;
-}
@@ -1,22 +0,0 @@
-#version 310 es
-
-// SPDX-License-Identifier: Unlicense OR MIT
-
-precision mediump float;
-
-{{.Header}}
-
-// Use high precision to be pixel accurate for
-// large cover atlases.
-layout(location = 0) in highp vec2 vCoverUV;
-layout(location = 1) in vec2 vUV;
-
-layout(binding = 1) uniform sampler2D cover;
-
-layout(location = 0) out vec4 fragColor;
-
-void main() {
-	fragColor = {{.FetchColorExpr}};
-	// The red channel of the cover texture holds the (signed) coverage;
-	// use its magnitude, limited to 1.
-	float coverage = min(abs(texture(cover, vCoverUV).r), 1.0);
-	fragColor *= coverage;
-}
@@ -1,31 +0,0 @@
-#version 310 es
-
-// SPDX-License-Identifier: Unlicense OR MIT
-
-#extension GL_GOOGLE_include_directive : enable
-
-precision highp float;
-
-#include "common.h"
-
-layout(binding = 0) uniform Block {
-	vec4 transform;
-	vec4 uvCoverTransform;
-	vec4 uvTransformR1;
-	vec4 uvTransformR2;
-	float z;
-} _block;
-
-layout(location = 0) in vec2 pos;
-
-layout(location = 0) out vec2 vCoverUV;
-
-layout(location = 1) in vec2 uv;
-layout(location = 1) out vec2 vUV;
-
-void main() {
-	// Position: scale/offset in xy, depth from the uniform block.
-	vec2 scaledPos = pos*_block.transform.xy + _block.transform.zw;
-	gl_Position = toClipSpace(vec4(scaledPos, _block.z, 1));
-
-	// Material coordinates: affine transform given by two uniform rows.
-	m3x2 uvTransform = m3x2(_block.uvTransformR1.xyz, _block.uvTransformR2.xyz);
-	vUV = transform3x2(uvTransform, vec3(uv,1)).xy;
-
-	// Cover coordinates: framebuffer texture transform, then scale and offset.
-	vec3 coverUV = transform3x2(fboTextureTransform, vec3(uv, 1.0));
-	vec3 coverScale = vec3(_block.uvCoverTransform.xy, 1.0);
-	vec3 coverOffset = vec3(_block.uvCoverTransform.zw, 0.0);
-	vCoverUV = (coverUV*coverScale+coverOffset).xy;
-}
@@ -1,410 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
-
-// The element processing stage, first in the pipeline.
-//
-// This stage is primarily about applying transforms and computing bounding
-// boxes. It is organized as a scan over the input elements, producing
-// annotated output elements.
-
-#version 450
-#extension GL_GOOGLE_include_directive : enable
-
-#include "mem.h"
-#include "setup.h"
-
-#define N_ROWS 4
-#define WG_SIZE 32
-#define LG_WG_SIZE 5
-#define PARTITION_SIZE (WG_SIZE * N_ROWS)
-
-layout(local_size_x = WG_SIZE, local_size_y = 1) in;
-
-layout(set = 0, binding = 1) readonly buffer ConfigBuf {
-    Config conf;
-};
-
-layout(set = 0, binding = 2) readonly buffer SceneBuf {
-    uint[] scene;
-};
-
-// It would be better to use the Vulkan memory model than
-// "volatile" but shooting for compatibility here rather
-// than doing things right.
-layout(set = 0, binding = 3) volatile buffer StateBuf {
-    uint part_counter;
-    uint[] state;
-};
-
-#include "scene.h"
-#include "state.h"
-#include "annotated.h"
-#include "pathseg.h"
-#include "tile.h"
-
-#define StateBuf_stride (4 + 2 * State_size)
-
-// state_aggregate_ref returns the reference to the aggregate State
-// of the given partition, located right after the partition's flag word.
-StateRef state_aggregate_ref(uint partition_ix) {
-    uint aggregate_offset = 4 + partition_ix * StateBuf_stride;
-    return StateRef(aggregate_offset);
-}
-
-// state_prefix_ref returns the reference to the prefix State of the
-// given partition, located one State_size past the partition's aggregate.
-StateRef state_prefix_ref(uint partition_ix) {
-    uint prefix_offset = 4 + partition_ix * StateBuf_stride + State_size;
-    return StateRef(prefix_offset);
-}
-
-// state_flag_index returns the index into the state array (counted in
-// 32-bit words, hence the division by 4) of the given partition's flag.
-uint state_flag_index(uint partition_ix) {
-    uint words_per_partition = StateBuf_stride / 4;
-    return partition_ix * words_per_partition;
-}
-
-// These correspond to X, A, P respectively in the prefix sum paper.
-#define FLAG_NOT_READY 0
-#define FLAG_AGGREGATE_READY 1
-#define FLAG_PREFIX_READY 2
-
-#define FLAG_SET_LINEWIDTH 1
-#define FLAG_SET_BBOX 2
-#define FLAG_RESET_BBOX 4
-#define FLAG_SET_FILL_MODE 8
-// Fill modes take up the next bit. Non-zero fill is 0, stroke is 1.
-#define LG_FILL_MODE 4
-#define FILL_MODE_BITS 1
-#define FILL_MODE_MASK (FILL_MODE_BITS << LG_FILL_MODE)
-
-// combine_state merges two scan states, with a preceding b.
-// This is almost like a monoid (the interaction between transformation and
-// bounding boxes is approximate)
-State combine_state(State a, State b) {
-    State c;
-    // Bounds of b.bbox after applying a's matrix and translation, computed
-    // per axis from the min/max of the transformed extents.
-    c.bbox.x = min(a.mat.x * b.bbox.x, a.mat.x * b.bbox.z) + min(a.mat.z * b.bbox.y, a.mat.z * b.bbox.w) + a.translate.x;
-    c.bbox.y = min(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z) + min(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y;
-    c.bbox.z = max(a.mat.x * b.bbox.x, a.mat.x * b.bbox.z) + max(a.mat.z * b.bbox.y, a.mat.z * b.bbox.w) + a.translate.x;
-    c.bbox.w = max(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z) + max(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y;
-    if ((a.flags & FLAG_RESET_BBOX) == 0 && b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y) {
-        // a does not reset the bbox and b's bbox has no positive extent:
-        // keep a's bbox as is.
-        c.bbox = a.bbox;
-    } else if ((a.flags & FLAG_RESET_BBOX) == 0 && (b.flags & FLAG_SET_BBOX) == 0 &&
-        (a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y))
-    {
-        // a does not reset, b does not set, and a's bbox has positive extent
-        // along some axis: take the union of a's bbox and the transformed one.
-        c.bbox.xy = min(a.bbox.xy, c.bbox.xy);
-        c.bbox.zw = max(a.bbox.zw, c.bbox.zw);
-    }
-    // It would be more concise to cast to matrix types; ah well.
-    // 2x2 product a.mat * b.mat, with components laid out as [x z; y w].
-    c.mat.x = a.mat.x * b.mat.x + a.mat.z * b.mat.y;
-    c.mat.y = a.mat.y * b.mat.x + a.mat.w * b.mat.y;
-    c.mat.z = a.mat.x * b.mat.z + a.mat.z * b.mat.w;
-    c.mat.w = a.mat.y * b.mat.z + a.mat.w * b.mat.w;
-    // b's translation mapped through a's matrix, plus a's translation.
-    c.translate.x = a.mat.x * b.translate.x + a.mat.z * b.translate.y + a.translate.x;
-    c.translate.y = a.mat.y * b.translate.x + a.mat.w * b.translate.y + a.translate.y;
-    // The later state's line width wins if it sets one.
-    c.linewidth = (b.flags & FLAG_SET_LINEWIDTH) == 0 ? a.linewidth : b.linewidth;
-    // Carry a's "set" flags forward and add all of b's flags.
-    c.flags = (a.flags & (FLAG_SET_LINEWIDTH | FLAG_SET_BBOX | FLAG_SET_FILL_MODE)) | b.flags;
-    // FLAG_RESET_BBOX >> 1 equals FLAG_SET_BBOX: a reset in a is recorded
-    // as a set bbox in the combined state.
-    c.flags |= (a.flags & FLAG_RESET_BBOX) >> 1;
-    // The fill mode bit comes from b if b sets the fill mode, otherwise from a.
-    uint fill_mode = (b.flags & FLAG_SET_FILL_MODE) == 0 ? a.flags : b.flags;
-    fill_mode &= FILL_MODE_MASK;
-    c.flags = (c.flags & ~FILL_MODE_MASK) | fill_mode;
-    // Element counts simply accumulate.
-    c.path_count = a.path_count + b.path_count;
-    c.pathseg_count = a.pathseg_count + b.pathseg_count;
-    c.trans_count = a.trans_count + b.trans_count;
-    return c;
-}
-
-// map_element reads the scene element at ref and returns the State that
-// the element contributes on its own, before combination with neighbors.
-State map_element(ElementRef ref) {
-    // TODO: it would *probably* be more efficient to make the memory read patterns less
-    // divergent, though it would be more wasted memory.
-    uint tag = Element_tag(ref).tag;
-    // Start from a neutral state: empty bbox, identity matrix, zero
-    // translation, no flags and zero counts.
-    State c;
-    c.bbox = vec4(0.0, 0.0, 0.0, 0.0);
-    c.mat = vec4(1.0, 0.0, 0.0, 1.0);
-    c.translate = vec2(0.0, 0.0);
-    c.linewidth = 1.0; // TODO should be 0.0
-    c.flags = 0;
-    c.path_count = 0;
-    c.pathseg_count = 0;
-    c.trans_count = 0;
-    switch (tag) {
-    // Path segments: the bbox covers all control points, and each
-    // segment counts as one path segment.
-    case Element_Line:
-        LineSeg line = Element_Line_read(ref);
-        c.bbox.xy = min(line.p0, line.p1);
-        c.bbox.zw = max(line.p0, line.p1);
-        c.pathseg_count = 1;
-        break;
-    case Element_Quad:
-        QuadSeg quad = Element_Quad_read(ref);
-        c.bbox.xy = min(min(quad.p0, quad.p1), quad.p2);
-        c.bbox.zw = max(max(quad.p0, quad.p1), quad.p2);
-        c.pathseg_count = 1;
-        break;
-    case Element_Cubic:
-        CubicSeg cubic = Element_Cubic_read(ref);
-        c.bbox.xy = min(min(cubic.p0, cubic.p1), min(cubic.p2, cubic.p3));
-        c.bbox.zw = max(max(cubic.p0, cubic.p1), max(cubic.p2, cubic.p3));
-        c.pathseg_count = 1;
-        break;
-    // Fills and clip begins count as one path and request a bbox reset.
-    case Element_FillColor:
-    case Element_FillImage:
-    case Element_BeginClip:
-        c.flags = FLAG_RESET_BBOX;
-        c.path_count = 1;
-        break;
-    // Clip ends count as one path but do not reset the bbox.
-    case Element_EndClip:
-        c.path_count = 1;
-        break;
-    case Element_SetLineWidth:
-        SetLineWidth lw = Element_SetLineWidth_read(ref);
-        c.linewidth = lw.width;
-        c.flags = FLAG_SET_LINEWIDTH;
-        break;
-    case Element_Transform:
-        Transform t = Element_Transform_read(ref);
-        c.mat = t.mat;
-        c.translate = t.translate;
-        c.trans_count = 1;
-        break;
-    case Element_SetFillMode:
-        SetFillMode fm = Element_SetFillMode_read(ref);
-        // The fill mode is stored in the flag bits starting at LG_FILL_MODE.
-        c.flags = FLAG_SET_FILL_MODE | (fm.fill_mode << LG_FILL_MODE);
-        break;
-    }
-    return c;
-}
-
-// get_linewidth returns the half extents of the bounding box of a circle
-// of diameter st.linewidth after st.mat has turned it into an ellipse.
-vec2 get_linewidth(State st) {
-    // See https://www.iquilezles.org/www/articles/ellipses/ellipses.htm
-    float radius = 0.5 * st.linewidth;
-    vec2 axis_lengths = vec2(length(st.mat.xz), length(st.mat.yw));
-    return radius * axis_lengths;
-}
-
-shared State sh_state[WG_SIZE];
-
-shared uint sh_part_ix;
-shared State sh_prefix;
-
-// main scans one partition of PARTITION_SIZE scene elements: it computes the
-// per-thread and per-workgroup State aggregates, obtains the exclusive prefix
-// from preceding partitions through decoupled look-back, and finally writes
-// the annotated output (path segments, annotations, transforms) for each of
-// its elements.
-void main() {
-    State th_state[N_ROWS];
-    // Determine partition to process by atomic counter (described in Section
-    // 4.4 of prefix sum paper).
-    if (gl_LocalInvocationID.x == 0) {
-        sh_part_ix = atomicAdd(part_counter, 1);
-    }
-    barrier();
-    uint part_ix = sh_part_ix;
-
-    uint ix = part_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS;
-    ElementRef ref = ElementRef(ix * Element_size);
-
-    // Inclusive scan over this thread's N_ROWS consecutive elements.
-    th_state[0] = map_element(ref);
-    for (uint i = 1; i < N_ROWS; i++) {
-        // discussion question: would it be faster to load using more coherent patterns
-        // into thread memory? This is kinda strided.
-        th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i)));
-    }
-    // Inclusive scan of the per-thread aggregates across the workgroup.
-    State agg = th_state[N_ROWS - 1];
-    sh_state[gl_LocalInvocationID.x] = agg;
-    for (uint i = 0; i < LG_WG_SIZE; i++) {
-        barrier();
-        if (gl_LocalInvocationID.x >= (1 << i)) {
-            State other = sh_state[gl_LocalInvocationID.x - (1 << i)];
-            agg = combine_state(other, agg);
-        }
-        barrier();
-        sh_state[gl_LocalInvocationID.x] = agg;
-    }
-
-    State exclusive;
-    exclusive.bbox = vec4(0.0, 0.0, 0.0, 0.0);
-    exclusive.mat = vec4(1.0, 0.0, 0.0, 1.0);
-    exclusive.translate = vec2(0.0, 0.0);
-    exclusive.linewidth = 1.0; //TODO should be 0.0
-    exclusive.flags = 0;
-    exclusive.path_count = 0;
-    exclusive.pathseg_count = 0;
-    exclusive.trans_count = 0;
-
-    // Publish aggregate for this partition
-    if (gl_LocalInvocationID.x == WG_SIZE - 1) {
-        // Note: with memory model, we'd want to generate the atomic store version of this.
-        State_write(state_aggregate_ref(part_ix), agg);
-        uint flag = FLAG_AGGREGATE_READY;
-        memoryBarrierBuffer();
-        if (part_ix == 0) {
-            // The first partition has no predecessors: its aggregate is its prefix.
-            State_write(state_prefix_ref(part_ix), agg);
-            flag = FLAG_PREFIX_READY;
-        }
-        state[state_flag_index(part_ix)] = flag;
-        if (part_ix != 0) {
-            // step 4 of paper: decoupled lookback
-            uint look_back_ix = part_ix - 1;
-
-            State their_agg;
-            uint their_ix = 0;
-            while (true) {
-                flag = state[state_flag_index(look_back_ix)];
-                if (flag == FLAG_PREFIX_READY) {
-                    State their_prefix = State_read(state_prefix_ref(look_back_ix));
-                    exclusive = combine_state(their_prefix, exclusive);
-                    break;
-                } else if (flag == FLAG_AGGREGATE_READY) {
-                    their_agg = State_read(state_aggregate_ref(look_back_ix));
-                    exclusive = combine_state(their_agg, exclusive);
-                    look_back_ix--;
-                    their_ix = 0;
-                    continue;
-                }
-                // else spin
-
-                // Unfortunately there's no guarantee of forward progress of other
-                // workgroups, so compute a bit of the aggregate before trying again.
-                // In the worst case, spinning stops when the aggregate is complete.
-                ElementRef ref = ElementRef((look_back_ix * PARTITION_SIZE + their_ix) * Element_size);
-                State s = map_element(ref);
-                if (their_ix == 0) {
-                    their_agg = s;
-                } else {
-                    their_agg = combine_state(their_agg, s);
-                }
-                their_ix++;
-                if (their_ix == PARTITION_SIZE) {
-                    exclusive = combine_state(their_agg, exclusive);
-                    if (look_back_ix == 0) {
-                        break;
-                    }
-                    look_back_ix--;
-                    their_ix = 0;
-                }
-            }
-
-            // step 5 of paper: compute inclusive prefix
-            State inclusive_prefix = combine_state(exclusive, agg);
-            sh_prefix = exclusive;
-            State_write(state_prefix_ref(part_ix), inclusive_prefix);
-            memoryBarrierBuffer();
-            flag = FLAG_PREFIX_READY;
-            state[state_flag_index(part_ix)] = flag;
-        }
-    }
-    barrier();
-    if (part_ix != 0) {
-        exclusive = sh_prefix;
-    }
-
-    // State of everything preceding this thread's first element.
-    State row = exclusive;
-    if (gl_LocalInvocationID.x > 0) {
-        State other = sh_state[gl_LocalInvocationID.x - 1];
-        row = combine_state(row, other);
-    }
-    for (uint i = 0; i < N_ROWS; i++) {
-        State st = combine_state(row, th_state[i]);
-
-        // Here we read again from the original scene. There may be
-        // gains to be had from stashing in shared memory or possibly
-        // registers (though register pressure is an issue).
-        ElementRef this_ref = Element_index(ref, i);
-        ElementTag tag = Element_tag(this_ref);
-        uint fill_mode = fill_mode_from_flags(st.flags >> LG_FILL_MODE);
-        bool is_stroke = fill_mode == MODE_STROKE;
-        switch (tag.tag) {
-        case Element_Line:
-            // Lines are written as cubics with control points on the segment.
-            LineSeg line = Element_Line_read(this_ref);
-            PathCubic path_cubic;
-            path_cubic.p0 = line.p0;
-            path_cubic.p1 = mix(line.p0, line.p1, 1.0 / 3.0);
-            path_cubic.p2 = mix(line.p1, line.p0, 1.0 / 3.0);
-            path_cubic.p3 = line.p1;
-            path_cubic.path_ix = st.path_count;
-            path_cubic.trans_ix = st.trans_count;
-            if (is_stroke) {
-                path_cubic.stroke = get_linewidth(st);
-            } else {
-                path_cubic.stroke = vec2(0.0);
-            }
-            PathSegRef path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size);
-            PathSeg_Cubic_write(conf.pathseg_alloc, path_out_ref, fill_mode, path_cubic);
-            break;
-        case Element_Quad:
-            // Quadratics are raised to cubics.
-            QuadSeg quad = Element_Quad_read(this_ref);
-            path_cubic.p0 = quad.p0;
-            path_cubic.p1 = mix(quad.p1, quad.p0, 1.0 / 3.0);
-            path_cubic.p2 = mix(quad.p1, quad.p2, 1.0 / 3.0);
-            path_cubic.p3 = quad.p2;
-            path_cubic.path_ix = st.path_count;
-            path_cubic.trans_ix = st.trans_count;
-            if (is_stroke) {
-                path_cubic.stroke = get_linewidth(st);
-            } else {
-                path_cubic.stroke = vec2(0.0);
-            }
-            path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size);
-            PathSeg_Cubic_write(conf.pathseg_alloc, path_out_ref, fill_mode, path_cubic);
-            break;
-        case Element_Cubic:
-            CubicSeg cubic = Element_Cubic_read(this_ref);
-            path_cubic.p0 = cubic.p0;
-            path_cubic.p1 = cubic.p1;
-            path_cubic.p2 = cubic.p2;
-            path_cubic.p3 = cubic.p3;
-            path_cubic.path_ix = st.path_count;
-            path_cubic.trans_ix = st.trans_count;
-            if (is_stroke) {
-                path_cubic.stroke = get_linewidth(st);
-            } else {
-                path_cubic.stroke = vec2(0.0);
-            }
-            path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size);
-            PathSeg_Cubic_write(conf.pathseg_alloc, path_out_ref, fill_mode, path_cubic);
-            break;
-        case Element_FillColor:
-            FillColor fill = Element_FillColor_read(this_ref);
-            AnnoColor anno_fill;
-            anno_fill.rgba_color = fill.rgba_color;
-            if (is_stroke) {
-                // Grow the bbox by the stroke extents; the scalar line width
-                // is scaled by the square root of the matrix determinant.
-                vec2 lw = get_linewidth(st);
-                anno_fill.bbox = st.bbox + vec4(-lw, lw);
-                anno_fill.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z));
-            } else {
-                anno_fill.bbox = st.bbox;
-                anno_fill.linewidth = 0.0;
-            }
-            AnnotatedRef out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
-            Annotated_Color_write(conf.anno_alloc, out_ref, fill_mode, anno_fill);
-            break;
-        case Element_FillImage:
-            FillImage fill_img = Element_FillImage_read(this_ref);
-            AnnoImage anno_img;
-            anno_img.index = fill_img.index;
-            anno_img.offset = fill_img.offset;
-            if (is_stroke) {
-                vec2 lw = get_linewidth(st);
-                anno_img.bbox = st.bbox + vec4(-lw, lw);
-                anno_img.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z));
-            } else {
-                anno_img.bbox = st.bbox;
-                anno_img.linewidth = 0.0;
-            }
-            out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
-            Annotated_Image_write(conf.anno_alloc, out_ref, fill_mode, anno_img);
-            break;
-        case Element_BeginClip:
-            Clip begin_clip = Element_BeginClip_read(this_ref);
-            AnnoBeginClip anno_begin_clip;
-            // This is the absolute bbox, it's been transformed during encoding.
-            anno_begin_clip.bbox = begin_clip.bbox;
-            if (is_stroke) {
-                anno_begin_clip.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z));
-            } else {
-                // Non-stroked clips carry a zero line width, like fills do.
-                // The annotation written below must never hold an
-                // uninitialized line width.
-                anno_begin_clip.linewidth = 0.0;
-            }
-            out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
-            Annotated_BeginClip_write(conf.anno_alloc, out_ref, fill_mode, anno_begin_clip);
-            break;
-        case Element_EndClip:
-            Clip end_clip = Element_EndClip_read(this_ref);
-            // This bbox is expected to be the same as the begin one.
-            AnnoEndClip anno_end_clip = AnnoEndClip(end_clip.bbox);
-            out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
-            Annotated_EndClip_write(conf.anno_alloc, out_ref, anno_end_clip);
-            break;
-        case Element_Transform:
-            // Store the accumulated (not the element-local) transform.
-            TransformSeg transform = TransformSeg(st.mat, st.translate);
-            TransformSegRef trans_ref = TransformSegRef(conf.trans_alloc.offset + (st.trans_count - 1) * TransformSeg_size);
-            TransformSeg_write(conf.trans_alloc, trans_ref, transform);
-            break;
-        }
-    }
-}
@@ -1,18 +0,0 @@
-#version 310 es
-
-// SPDX-License-Identifier: Unlicense OR MIT
-
-precision mediump float;
-
-// Use high precision to be pixel accurate for
-// large cover atlases.
-layout(location = 0) in highp vec2 vUV;
-
-layout(binding = 0) uniform sampler2D cover;
-
-layout(location = 0) out vec4 fragColor;
-
-void main() {
-  // Only the red channel is written: the magnitude of the sampled coverage.
-  float coverage = abs(texture(cover, vUV).r);
-  fragColor.r = coverage;
-}
@@ -1,28 +0,0 @@
-#version 310 es
-  
-// SPDX-License-Identifier: Unlicense OR MIT
-
-#extension GL_GOOGLE_include_directive : enable
-
-precision highp float;
-
-#include "common.h"
-
-layout(location = 0) in vec2 pos;
-layout(location = 1) in vec2 uv;
-
-layout(binding = 0) uniform Block {
-	vec4 uvTransform;
-	vec4 subUVTransform;
-} _block;
-
-layout(location = 0) out vec2 vUV;
-
-void main() {
-  gl_Position = vec4(transform3x2(fboTransform, vec3(pos, 1.0)), 1);
-
-  // Apply fboTextureTransform, then the sub-UV scale and offset, then
-  // fboTextureTransform again, and finally the UV scale and offset.
-  vec2 texUV = transform3x2(fboTextureTransform, vec3(uv, 1.0)).xy;
-  vec2 subUV = texUV*_block.subUVTransform.xy + _block.subUVTransform.zw;
-  vec2 fboUV = transform3x2(fboTextureTransform, vec3(subUV, 1.0)).xy;
-  vUV = fboUV*_block.uvTransform.xy + _block.uvTransform.zw;
-}
@@ -1,248 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
-
-// This is "kernel 4" in a 4-kernel pipeline. It renders the commands
-// in the per-tile command list to an image.
-
-// Right now, this kernel stores the image in a buffer, but a better
-// plan is to use a texture. This is because of limited support.
-
-#version 450
-#extension GL_GOOGLE_include_directive : enable
-#ifdef ENABLE_IMAGE_INDICES
-#extension GL_EXT_nonuniform_qualifier : enable
-#endif
-
-#include "mem.h"
-#include "setup.h"
-
-// Each invocation handles CHUNK pixels of a tile, arranged as CHUNK_X
-// columns by CHUNK_Y rows that are CHUNK_DX and CHUNK_DY pixels apart.
-#define CHUNK_X 2
-#define CHUNK_Y 4
-// Parenthesized so that CHUNK expands safely inside larger expressions.
-#define CHUNK (CHUNK_X * CHUNK_Y)
-#define CHUNK_DX (TILE_WIDTH_PX / CHUNK_X)
-#define CHUNK_DY (TILE_HEIGHT_PX / CHUNK_Y)
-layout(local_size_x = CHUNK_DX, local_size_y = CHUNK_DY) in;
-
-layout(set = 0, binding = 1) restrict readonly buffer ConfigBuf {
-    Config conf;
-};
-
-layout(rgba8, set = 0, binding = 2) uniform restrict writeonly image2D image;
-
-#ifdef ENABLE_IMAGE_INDICES
-layout(rgba8, set = 0, binding = 3) uniform restrict readonly image2D images[];
-#else
-layout(rgba8, set = 0, binding = 3) uniform restrict readonly image2D images[1];
-#endif
-
-#include "ptcl.h"
-#include "tile.h"
-
-// tosRGB encodes a linear RGB color as sRGB, using the linear segment
-// below the 0.0031308 cutoff and the power curve above it.
-mediump vec3 tosRGB(mediump vec3 rgb) {
-    mediump vec3 linear_part = vec3(12.92)*rgb;
-    mediump vec3 power_part = vec3(1.055)*pow(rgb, vec3(0.41666)) - vec3(0.055);
-    bvec3 use_power = greaterThanEqual(rgb, vec3(0.0031308));
-    return mix(linear_part, power_part, use_power);
-}
-
-// fromsRGB decodes an sRGB encoded color to linear RGB.
-mediump vec3 fromsRGB(mediump vec3 srgb) {
-    // Formula from EXT_sRGB.
-    mediump vec3 linear_part = srgb/vec3(12.92);
-    mediump vec3 power_part = pow((srgb + vec3(0.055))/vec3(1.055), vec3(2.4));
-    bvec3 use_power = greaterThanEqual(srgb, vec3(0.04045));
-    return mix(linear_part, power_part, use_power);
-}
-
-// unpacksRGB expands a packed 8-bit per channel sRGB color into a vec4
-// in the linear color space. The alpha channel is passed through as is.
-mediump vec4 unpacksRGB(uint srgba) {
-    mediump vec4 encoded = unpackUnorm4x8(srgba).wzyx;
-    mediump vec3 decoded = fromsRGB(encoded.rgb);
-    return vec4(decoded, encoded.a);
-}
-
-// packsRGB converts a linear color to sRGB and packs it into 8 bits per
-// channel. The alpha channel is packed without conversion.
-uint packsRGB(mediump vec4 rgba) {
-    mediump vec4 encoded = vec4(tosRGB(rgba.rgb), rgba.a);
-    return packUnorm4x8(encoded.wzyx);
-}
-
-// chunk_offset returns the pixel offset of chunk i relative to the
-// invocation's base pixel.
-uvec2 chunk_offset(uint i) {
-    uint column = i % CHUNK_X;
-    uint row = i / CHUNK_X;
-    return uvec2(column * CHUNK_DX, row * CHUNK_DY);
-}
-
-// fillImage loads, for every chunk pixel of the invocation at xy, the texel
-// of the image referenced by cmd_img and returns the colors with their
-// RGB channels decoded from sRGB.
-mediump vec4[CHUNK] fillImage(uvec2 xy, CmdImage cmd_img) {
-    mediump vec4 texels[CHUNK];
-    for (uint k = 0; k < CHUNK; k++) {
-        ivec2 coord = ivec2(xy + chunk_offset(k)) + cmd_img.offset;
-#ifdef ENABLE_IMAGE_INDICES
-        mediump vec4 texel = imageLoad(images[cmd_img.index], coord);
-#else
-        // Without image indices only a single image is bound.
-        mediump vec4 texel = imageLoad(images[0], coord);
-#endif
-        texel.rgb = fromsRGB(texel.rgb);
-        texels[k] = texel;
-    }
-    return texels;
-}
-
-// main renders one tile: it walks the tile's command list, keeps the
-// coverage (area) and color (rgba) of each of the invocation's CHUNK
-// pixels, and finally stores the sRGB encoded colors to the output image.
-void main() {
-    uint tile_ix = gl_WorkGroupID.y * conf.width_in_tiles + gl_WorkGroupID.x;
-    Alloc cmd_alloc = slice_mem(conf.ptcl_alloc, tile_ix * PTCL_INITIAL_ALLOC, PTCL_INITIAL_ALLOC);
-    CmdRef cmd_ref = CmdRef(cmd_alloc.offset);
-
-    // Read scratch space allocation, written first in the command list.
-    Alloc scratch_alloc = alloc_read(cmd_alloc, cmd_ref.offset);
-    cmd_ref.offset += Alloc_size;
-
-    uvec2 xy_uint = uvec2(gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_WorkGroupID.x, gl_LocalInvocationID.y + TILE_HEIGHT_PX * gl_WorkGroupID.y);
-    vec2 xy = vec2(xy_uint);
-    mediump vec4 rgba[CHUNK];
-    for (uint i = 0; i < CHUNK; i++) {
-        rgba[i] = vec4(0.0);
-        // TODO: remove this debug image support when the actual image method is plumbed.
-#ifdef DEBUG_IMAGES
-#ifdef ENABLE_IMAGE_INDICES
-        if (xy_uint.x < 1024 && xy_uint.y < 1024) {
-            rgba[i] = imageLoad(images[gl_WorkGroupID.x / 64], ivec2(xy_uint + chunk_offset(i))/4);
-        }
-#else
-        if (xy_uint.x < 1024 && xy_uint.y < 1024) {
-            // Store the full texel into rgba, as the indexed variant above does.
-            rgba[i] = imageLoad(images[0], ivec2(xy_uint + chunk_offset(i))/4);
-        }
-#endif
-#endif
-    }
-
-    mediump float area[CHUNK];
-    uint clip_depth = 0;
-    bool mem_ok = mem_error == NO_ERROR;
-    while (mem_ok) {
-        uint tag = Cmd_tag(cmd_alloc, cmd_ref).tag;
-        if (tag == Cmd_End) {
-            break;
-        }
-        switch (tag) {
-        case Cmd_Stroke:
-            // Calculate distance field from all the line segments in this tile.
-            CmdStroke stroke = Cmd_Stroke_read(cmd_alloc, cmd_ref);
-            mediump float df[CHUNK];
-            for (uint k = 0; k < CHUNK; k++) df[k] = 1e9;
-            TileSegRef tile_seg_ref = TileSegRef(stroke.tile_ref);
-            do {
-                TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size, mem_ok), tile_seg_ref);
-                vec2 line_vec = seg.vector;
-                for (uint k = 0; k < CHUNK; k++) {
-                    vec2 dpos = xy + vec2(0.5, 0.5) - seg.origin;
-                    dpos += vec2(chunk_offset(k));
-                    float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0);
-                    df[k] = min(df[k], length(line_vec * t - dpos));
-                }
-                tile_seg_ref = seg.next;
-            } while (tile_seg_ref.offset != 0);
-            for (uint k = 0; k < CHUNK; k++) {
-                area[k] = clamp(stroke.half_width + 0.5 - df[k], 0.0, 1.0);
-            }
-            cmd_ref.offset += 4 + CmdStroke_size;
-            break;
-        case Cmd_Fill:
-            CmdFill fill = Cmd_Fill_read(cmd_alloc, cmd_ref);
-            for (uint k = 0; k < CHUNK; k++) area[k] = float(fill.backdrop);
-            tile_seg_ref = TileSegRef(fill.tile_ref);
-            // Calculate coverage based on backdrop + coverage of each line segment
-            do {
-                TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size, mem_ok), tile_seg_ref);
-                for (uint k = 0; k < CHUNK; k++) {
-                    vec2 my_xy = xy + vec2(chunk_offset(k));
-                    vec2 start = seg.origin - my_xy;
-                    vec2 end = start + seg.vector;
-                    vec2 window = clamp(vec2(start.y, end.y), 0.0, 1.0);
-                    if (window.x != window.y) {
-                        vec2 t = (window - start.y) / seg.vector.y;
-                        vec2 xs = vec2(mix(start.x, end.x, t.x), mix(start.x, end.x, t.y));
-                        float xmin = min(min(xs.x, xs.y), 1.0) - 1e-6;
-                        float xmax = max(xs.x, xs.y);
-                        float b = min(xmax, 1.0);
-                        float c = max(b, 0.0);
-                        float d = max(xmin, 0.0);
-                        float a = (b + 0.5 * (d * d - c * c) - xmin) / (xmax - xmin);
-                        area[k] += a * (window.x - window.y);
-                    }
-                    area[k] += sign(seg.vector.x) * clamp(my_xy.y - seg.y_edge + 1.0, 0.0, 1.0);
-                }
-                tile_seg_ref = seg.next;
-            } while (tile_seg_ref.offset != 0);
-            for (uint k = 0; k < CHUNK; k++) {
-                area[k] = min(abs(area[k]), 1.0);
-            }
-            cmd_ref.offset += 4 + CmdFill_size;
-            break;
-        case Cmd_Solid:
-            for (uint k = 0; k < CHUNK; k++) {
-                area[k] = 1.0;
-            }
-            cmd_ref.offset += 4;
-            break;
-        case Cmd_Alpha:
-            CmdAlpha alpha = Cmd_Alpha_read(cmd_alloc, cmd_ref);
-            for (uint k = 0; k < CHUNK; k++) {
-                area[k] = alpha.alpha;
-            }
-            cmd_ref.offset += 4 + CmdAlpha_size;
-            break;
-        case Cmd_Color:
-            // Blend the coverage-scaled color over the accumulated color.
-            CmdColor color = Cmd_Color_read(cmd_alloc, cmd_ref);
-            mediump vec4 fg = unpacksRGB(color.rgba_color);
-            for (uint k = 0; k < CHUNK; k++) {
-                mediump vec4 fg_k = fg * area[k];
-                rgba[k] = rgba[k] * (1.0 - fg_k.a) + fg_k;
-            }
-            cmd_ref.offset += 4 + CmdColor_size;
-            break;
-        case Cmd_Image:
-            CmdImage fill_img = Cmd_Image_read(cmd_alloc, cmd_ref);
-            mediump vec4 img[CHUNK] = fillImage(xy_uint, fill_img);
-            for (uint k = 0; k < CHUNK; k++) {
-                mediump vec4 fg_k = img[k] * area[k];
-                rgba[k] = rgba[k] * (1.0 - fg_k.a) + fg_k;
-            }
-            cmd_ref.offset += 4 + CmdImage_size;
-            break;
-        case Cmd_BeginClip:
-            // Save the current color and coverage of every pixel to the
-            // scratch space and start over with a transparent color.
-            uint base_ix = (scratch_alloc.offset >> 2) + CLIP_STATE_SIZE * (clip_depth * TILE_WIDTH_PX * TILE_HEIGHT_PX +
-                gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y);
-            for (uint k = 0; k < CHUNK; k++) {
-                uvec2 offset = chunk_offset(k);
-                uint srgb = packsRGB(vec4(rgba[k]));
-                mediump float alpha = clamp(abs(area[k]), 0.0, 1.0);
-                write_mem(scratch_alloc, base_ix + 0 + CLIP_STATE_SIZE * (offset.x + offset.y * TILE_WIDTH_PX), srgb);
-                write_mem(scratch_alloc, base_ix + 1 + CLIP_STATE_SIZE * (offset.x + offset.y * TILE_WIDTH_PX), floatBitsToUint(alpha));
-                rgba[k] = vec4(0.0);
-            }
-            clip_depth++;
-            cmd_ref.offset += 4;
-            break;
-        case Cmd_EndClip:
-            // Restore the saved color and blend the clipped content over it,
-            // scaled by the current coverage and the saved clip coverage.
-            clip_depth--;
-            base_ix = (scratch_alloc.offset >> 2) + CLIP_STATE_SIZE * (clip_depth * TILE_WIDTH_PX * TILE_HEIGHT_PX +
-                gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y);
-            for (uint k = 0; k < CHUNK; k++) {
-                uvec2 offset = chunk_offset(k);
-                uint srgb = read_mem(scratch_alloc, base_ix + 0 + CLIP_STATE_SIZE * (offset.x + offset.y * TILE_WIDTH_PX));
-                uint alpha = read_mem(scratch_alloc, base_ix + 1 + CLIP_STATE_SIZE * (offset.x + offset.y * TILE_WIDTH_PX));
-                mediump vec4 bg = unpacksRGB(srgb);
-                mediump vec4 fg = rgba[k] * area[k] * uintBitsToFloat(alpha);
-                rgba[k] = bg * (1.0 - fg.a) + fg;
-            }
-            cmd_ref.offset += 4;
-            break;
-        case Cmd_Jump:
-            // Continue reading commands at the jump target.
-            cmd_ref = CmdRef(Cmd_Jump_read(cmd_alloc, cmd_ref).new_ref);
-            cmd_alloc.offset = cmd_ref.offset;
-            break;
-        }
-    }
-
-    for (uint i = 0; i < CHUNK; i++) {
-        imageStore(image, ivec2(xy_uint + chunk_offset(i)), vec4(tosRGB(rgba[i].rgb), rgba[i].a));
-    }
-}
@@ -1,32 +0,0 @@
-#version 310 es
-
-// SPDX-License-Identifier: Unlicense OR MIT
-
-precision mediump float;
-
-layout(binding = 0) uniform sampler2D tex;
-
-layout(location = 0) in vec2 vUV;
-
-layout(location = 0) out vec4 fragColor;
-
-layout(binding=0) uniform Color {
-	// If emulateSRGB is set (!= 0), the input texels are sRGB encoded. We save the
-	// conversion step below, at the cost of texture filtering in sRGB space.
-	float emulateSRGB;
-};
-
-// RGBtosRGB encodes a linear RGB color as sRGB, using the linear segment
-// below the 0.0031308 cutoff and the power curve above it.
-vec3 RGBtosRGB(vec3 rgb) {
-	vec3 linearSegment = vec3(12.92)*rgb;
-	vec3 powerSegment = vec3(1.055)*pow(rgb, vec3(0.41666)) - vec3(0.055);
-	bvec3 usePower = greaterThanEqual(rgb, vec3(0.0031308));
-	return mix(linearSegment, powerSegment, usePower);
-}
-
-void main() {
-	vec4 color = texture(tex, vUV);
-	// Encode to sRGB only when emulateSRGB is unset; otherwise the
-	// sampled texel is output unchanged.
-	if (emulateSRGB == 0.0) {
-		color = vec4(RGBtosRGB(color.rgb), color.a);
-	}
-	fragColor = color;
-}
@@ -1,20 +0,0 @@
-#version 310 es
-
-// SPDX-License-Identifier: Unlicense OR MIT
-
-precision highp float;
-
-layout(binding = 0) uniform Block {
-	vec2 scale;
-	vec2 pos;
-} _block;
-
-layout(location = 0) in vec2 pos;
-layout(location = 1) in vec2 uv;
-
-layout(location = 0) out vec2 vUV;
-
-void main() {
-	// Scale and offset the vertex position; pass the texture
-	// coordinate through unchanged.
-	vec2 clipPos = pos*_block.scale + _block.pos;
-	gl_Position = vec4(clipPos, 0, 1);
-	vUV = uv;
-}
@@ -1,147 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
-
-layout(set = 0, binding = 0) buffer Memory {
-    // offset into memory of the next allocation, initialized by the user.
-    uint mem_offset;
-    // mem_error tracks the status of memory accesses, initialized to NO_ERROR
-    // by the user. ERR_MALLOC_FAILED is reported for insufficient memory.
-    // If MEM_DEBUG is defined the following errors are reported:
-    // - ERR_OUT_OF_BOUNDS is reported for out of bounds writes.
-    // - ERR_UNALIGNED_ACCESS for memory access not aligned to 32-bit words.
-    uint mem_error;
-    uint[] memory;
-};
-
-// Uncomment this line to add the size field to Alloc and enable memory checks.
-// Note that the Config struct in setup.h grows size fields as well.
-//#define MEM_DEBUG
-
-#define NO_ERROR 0
-#define ERR_MALLOC_FAILED 1
-#define ERR_OUT_OF_BOUNDS 2
-#define ERR_UNALIGNED_ACCESS 3
-
-#ifdef MEM_DEBUG
-#define Alloc_size 16
-#else
-#define Alloc_size 8
-#endif
-
-// Alloc represents a memory allocation.
-struct Alloc {
-    // offset in bytes into memory.
-    uint offset;
-#ifdef MEM_DEBUG
-    // size in bytes of the allocation.
-    uint size;
-#endif
-};
-
-struct MallocResult {
-    Alloc alloc;
-    // failed is true if the allocation overflowed memory.
-    bool failed;
-};
-
-// new_alloc synthesizes an Alloc from an offset and size.
-Alloc new_alloc(uint offset, uint size, bool mem_ok) {
-    Alloc a;
-    a.offset = offset;
-#ifdef MEM_DEBUG
-    if (mem_ok) {
-        a.size = size;
-    } else {
-        a.size = 0;
-    }
-#endif
-    return a;
-}
-
-// malloc allocates size bytes of memory.
-MallocResult malloc(uint size) {
-    MallocResult r;
-    uint offset = atomicAdd(mem_offset, size);
-    r.failed = offset + size > memory.length() * 4;
-    r.alloc = new_alloc(offset, size, !r.failed);
-    if (r.failed) {
-        atomicMax(mem_error, ERR_MALLOC_FAILED);
-        return r;
-    }
-#ifdef MEM_DEBUG
-    if ((size & 3) != 0) {
-        r.failed = true;
-        atomicMax(mem_error, ERR_UNALIGNED_ACCESS);
-        return r;
-    }
-#endif
-    return r;
-}
-
-// touch_mem checks whether access to the memory word at offset is valid.
-// If MEM_DEBUG is defined, touch_mem returns false if offset is out of bounds.
-// Offset is in words.
-bool touch_mem(Alloc alloc, uint offset) {
-#ifdef MEM_DEBUG
-    if (offset < alloc.offset/4 || offset >= (alloc.offset + alloc.size)/4) {
-        atomicMax(mem_error, ERR_OUT_OF_BOUNDS);
-        return false;
-    }
-#endif
-    return true;
-}
-
-// write_mem writes val to memory at offset.
-// Offset is in words.
-void write_mem(Alloc alloc, uint offset, uint val) {
-    if (!touch_mem(alloc, offset)) {
-        return;
-    }
-    memory[offset] = val;
-}
-
-// read_mem reads the value from memory at offset.
-// Offset is in words.
-uint read_mem(Alloc alloc, uint offset) {
-    if (!touch_mem(alloc, offset)) {
-        return 0;
-    }
-    uint v = memory[offset];
-    return v;
-}
-
-// slice_mem returns a sub-allocation inside another. Offset and size are in
-// bytes, relative to a.offset.
-Alloc slice_mem(Alloc a, uint offset, uint size) {
-#ifdef MEM_DEBUG
-    if ((offset & 3) != 0 || (size & 3) != 0) {
-        atomicMax(mem_error, ERR_UNALIGNED_ACCESS);
-        return Alloc(0, 0);
-    }
-    if (offset + size > a.size) {
-        // slice_mem is sometimes used for slices outside bounds,
-        // but never written.
-        return Alloc(0, 0);
-    }
-    return Alloc(a.offset + offset, size);
-#else
-    return Alloc(a.offset + offset);
-#endif
-}
-
-// alloc_write writes alloc to memory at offset bytes.
-void alloc_write(Alloc a, uint offset, Alloc alloc) {
-    write_mem(a, offset >> 2, alloc.offset);
-#ifdef MEM_DEBUG
-    write_mem(a, (offset >> 2) + 1, alloc.size);
-#endif
-}
-
-// alloc_read reads an Alloc from memory at offset bytes.
-Alloc alloc_read(Alloc a, uint offset) {
-    Alloc alloc;
-    alloc.offset = read_mem(a, offset >> 2);
-#ifdef MEM_DEBUG
-    alloc.size = read_mem(a, (offset >> 2) + 1);
-#endif
-    return alloc;
-}
@@ -1,294 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
-
-// Coarse rasterization of path segments.
-
-// Allocation and initialization of tiles for paths.
-
-#version 450
-#extension GL_GOOGLE_include_directive : enable
-
-#include "mem.h"
-#include "setup.h"
-
-#define LG_COARSE_WG 5
-#define COARSE_WG (1 << LG_COARSE_WG)
-
-layout(local_size_x = COARSE_WG, local_size_y = 1) in;
-
-layout(set = 0, binding = 1) readonly buffer ConfigBuf {
-    Config conf;
-};
-
-#include "pathseg.h"
-#include "tile.h"
-
-// scale factors useful for converting coordinates to tiles
-#define SX (1.0 / float(TILE_WIDTH_PX))
-#define SY (1.0 / float(TILE_HEIGHT_PX))
-
-#define ACCURACY 0.25
-#define Q_ACCURACY (ACCURACY * 0.1)
-#define REM_ACCURACY (ACCURACY - Q_ACCURACY)
-#define MAX_HYPOT2 (432.0 * Q_ACCURACY * Q_ACCURACY)
-#define MAX_QUADS 16
-
-vec2 eval_quad(vec2 p0, vec2 p1, vec2 p2, float t) {
-    float mt = 1.0 - t;
-    return p0 * (mt * mt) + (p1 * (mt * 2.0) + p2 * t) * t;
-}
-
-vec2 eval_cubic(vec2 p0, vec2 p1, vec2 p2, vec2 p3, float t) {
-    float mt = 1.0 - t;
-    return p0 * (mt * mt * mt) + (p1 * (mt * mt * 3.0) + (p2 * (mt * 3.0) + p3 * t) * t) * t;
-}
-
-struct SubdivResult {
-    float val;
-    float a0;
-    float a2;
-};
-
-/// An approximation to $\int (1 + 4x^2) ^ -0.25 dx$
-///
-/// This is used for flattening curves.
-#define D 0.67
-float approx_parabola_integral(float x) {
-    return x * inversesqrt(sqrt(1.0 - D + (D * D * D * D + 0.25 * x * x)));
-}
-
-/// An approximation to the inverse parabola integral.
-#define B 0.39
-float approx_parabola_inv_integral(float x) {
-    return x * sqrt(1.0 - B + (B * B + 0.25 * x * x));
-}
-
-SubdivResult estimate_subdiv(vec2 p0, vec2 p1, vec2 p2, float sqrt_tol) {
-    vec2 d01 = p1 - p0;
-    vec2 d12 = p2 - p1;
-    vec2 dd = d01 - d12;
-    float cross = (p2.x - p0.x) * dd.y - (p2.y - p0.y) * dd.x;
-    float x0 = (d01.x * dd.x + d01.y * dd.y) / cross;
-    float x2 = (d12.x * dd.x + d12.y * dd.y) / cross;
-    float scale = abs(cross / (length(dd) * (x2 - x0)));
-
-    float a0 = approx_parabola_integral(x0);
-    float a2 = approx_parabola_integral(x2);
-    float val = 0.0;
-    if (scale < 1e9) {
-        float da = abs(a2 - a0);
-        float sqrt_scale = sqrt(scale);
-        if (sign(x0) == sign(x2)) {
-            val = da * sqrt_scale;
-        } else {
-            float xmin = sqrt_tol / sqrt_scale;
-            val = sqrt_tol * da / approx_parabola_integral(xmin);
-        }
-    }
-    return SubdivResult(val, a0, a2);
-}
-
-void main() {
-    uint element_ix = gl_GlobalInvocationID.x;
-    PathSegRef ref = PathSegRef(conf.pathseg_alloc.offset + element_ix * PathSeg_size);
-
-    PathSegTag tag = PathSegTag(PathSeg_Nop, 0);
-    if (element_ix < conf.n_pathseg) {
-        tag = PathSeg_tag(conf.pathseg_alloc, ref);
-    }
-    bool mem_ok = mem_error == NO_ERROR;
-    switch (tag.tag) {
-    case PathSeg_Cubic:
-        PathCubic cubic = PathSeg_Cubic_read(conf.pathseg_alloc, ref);
-
-        uint trans_ix = cubic.trans_ix;
-        if (trans_ix > 0) {
-            TransformSegRef trans_ref = TransformSegRef(conf.trans_alloc.offset + (trans_ix - 1) * TransformSeg_size);
-            TransformSeg trans = TransformSeg_read(conf.trans_alloc, trans_ref);
-            cubic.p0 = trans.mat.xy * cubic.p0.x + trans.mat.zw * cubic.p0.y + trans.translate;
-            cubic.p1 = trans.mat.xy * cubic.p1.x + trans.mat.zw * cubic.p1.y + trans.translate;
-            cubic.p2 = trans.mat.xy * cubic.p2.x + trans.mat.zw * cubic.p2.y + trans.translate;
-            cubic.p3 = trans.mat.xy * cubic.p3.x + trans.mat.zw * cubic.p3.y + trans.translate;
-        }
-
-        vec2 err_v = 3.0 * (cubic.p2 - cubic.p1) + cubic.p0 - cubic.p3;
-        float err = err_v.x * err_v.x + err_v.y * err_v.y;
-        // The number of quadratics.
-        uint n_quads = max(uint(ceil(pow(err * (1.0 / MAX_HYPOT2), 1.0 / 6.0))), 1);
-        n_quads = min(n_quads, MAX_QUADS);
-        SubdivResult keep_params[MAX_QUADS];
-        // Iterate over quadratics and tote up the estimated number of segments.
-        float val = 0.0;
-        vec2 qp0 = cubic.p0;
-        float step = 1.0 / float(n_quads);
-        for (uint i = 0; i < n_quads; i++) {
-            float t = float(i + 1) * step;
-            vec2 qp2 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t);
-            vec2 qp1 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t - 0.5 * step);
-            qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2);
-            SubdivResult params = estimate_subdiv(qp0, qp1, qp2, sqrt(REM_ACCURACY));
-            keep_params[i] = params;
-            val += params.val;
-
-            qp0 = qp2;
-        }
-        uint n = max(uint(ceil(val * 0.5 / sqrt(REM_ACCURACY))), 1);
-
-        bool is_stroke = fill_mode_from_flags(tag.flags) == MODE_STROKE;
-        uint path_ix = cubic.path_ix;
-        Path path = Path_read(conf.tile_alloc, PathRef(conf.tile_alloc.offset + path_ix * Path_size));
-        Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok);
-        ivec4 bbox = ivec4(path.bbox);
-        vec2 p0 = cubic.p0;
-        qp0 = cubic.p0;
-        float v_step = val / float(n);
-        int n_out = 1;
-        float val_sum = 0.0;
-        for (uint i = 0; i < n_quads; i++) {
-            float t = float(i + 1) * step;
-            vec2 qp2 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t);
-            vec2 qp1 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t - 0.5 * step);
-            qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2);
-            SubdivResult params = keep_params[i];
-            float u0 = approx_parabola_inv_integral(params.a0);
-            float u2 = approx_parabola_inv_integral(params.a2);
-            float uscale = 1.0 / (u2 - u0);
-            float target = float(n_out) * v_step;
-            while (n_out == n || target < val_sum + params.val) {
-                vec2 p1;
-                if (n_out == n) {
-                    p1 = cubic.p3;
-                } else {
-                    float u = (target - val_sum) / params.val;
-                    float a = mix(params.a0, params.a2, u);
-                    float au = approx_parabola_inv_integral(a);
-                    float t = (au - u0) * uscale;
-                    p1 = eval_quad(qp0, qp1, qp2, t);
-                }
-
-                // Output line segment
-
-                // Bounding box of element in pixel coordinates.
-                float xmin = min(p0.x, p1.x) - cubic.stroke.x;
-                float xmax = max(p0.x, p1.x) + cubic.stroke.x;
-                float ymin = min(p0.y, p1.y) - cubic.stroke.y;
-                float ymax = max(p0.y, p1.y) + cubic.stroke.y;
-                float dx = p1.x - p0.x;
-                float dy = p1.y - p0.y;
-                // Set up for per-scanline coverage formula, below.
-                float invslope = abs(dy) < 1e-9 ? 1e9 : dx / dy;
-                float c = (cubic.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + cubic.stroke.y)) * SX;
-                float b = invslope; // Note: assumes square tiles, otherwise scale.
-                float a = (p0.x - (p0.y - 0.5 * float(TILE_HEIGHT_PX)) * b) * SX;
-
-                int x0 = int(floor(xmin * SX));
-                int x1 = int(floor(xmax * SX) + 1);
-                int y0 = int(floor(ymin * SY));
-                int y1 = int(floor(ymax * SY) + 1);
-
-                x0 = clamp(x0, bbox.x, bbox.z);
-                y0 = clamp(y0, bbox.y, bbox.w);
-                x1 = clamp(x1, bbox.x, bbox.z);
-                y1 = clamp(y1, bbox.y, bbox.w);
-                float xc = a + b * float(y0);
-                int stride = bbox.z - bbox.x;
-                int base = (y0 - bbox.y) * stride - bbox.x;
-                // TODO: can be tighter, use c to bound width
-                uint n_tile_alloc = uint((x1 - x0) * (y1 - y0));
-                // Consider using subgroups to aggregate atomic add.
-                MallocResult tile_alloc = malloc(n_tile_alloc * TileSeg_size);
-                if (tile_alloc.failed || !mem_ok) {
-                    return;
-                }
-                uint tile_offset = tile_alloc.alloc.offset;
-
-                TileSeg tile_seg;
-
-                int xray = int(floor(p0.x*SX));
-                int last_xray = int(floor(p1.x*SX));
-                if (p0.y > p1.y) {
-                    int tmp = xray;
-                    xray = last_xray;
-                    last_xray = tmp;
-                }
-                for (int y = y0; y < y1; y++) {
-                    float tile_y0 = float(y * TILE_HEIGHT_PX);
-                    int xbackdrop = max(xray + 1, bbox.x);
-                    if (!is_stroke && min(p0.y, p1.y) < tile_y0 && xbackdrop < bbox.z) {
-                        int backdrop = p1.y < p0.y ? 1 : -1;
-                        TileRef tile_ref = Tile_index(path.tiles, uint(base + xbackdrop));
-                        uint tile_el = tile_ref.offset >> 2;
-                        if (touch_mem(path_alloc, tile_el + 1)) {
-                            atomicAdd(memory[tile_el + 1], backdrop);
-                        }
-                    }
-
-                    // next_xray is the xray for the next scanline; the line segment intersects
-                    // all tiles between xray and next_xray.
-                    int next_xray = last_xray;
-                    if (y < y1 - 1) {
-                        float tile_y1 = float((y + 1) * TILE_HEIGHT_PX);
-                        float x_edge = mix(p0.x, p1.x, (tile_y1 - p0.y) / dy);
-                        next_xray = int(floor(x_edge*SX));
-                    }
-
-                    int min_xray = min(xray, next_xray);
-                    int max_xray = max(xray, next_xray);
-                    int xx0 = min(int(floor(xc - c)), min_xray);
-                    int xx1 = max(int(ceil(xc + c)), max_xray + 1);
-                    xx0 = clamp(xx0, x0, x1);
-                    xx1 = clamp(xx1, x0, x1);
-
-                    for (int x = xx0; x < xx1; x++) {
-                        float tile_x0 = float(x * TILE_WIDTH_PX);
-                        TileRef tile_ref = Tile_index(TileRef(path.tiles.offset), uint(base + x));
-                        uint tile_el = tile_ref.offset >> 2;
-                        uint old = 0;
-                        if (touch_mem(path_alloc, tile_el)) {
-                            old = atomicExchange(memory[tile_el], tile_offset);
-                        }
-                        tile_seg.origin = p0;
-                        tile_seg.vector = p1 - p0;
-                        float y_edge = 0.0;
-                        if (!is_stroke) {
-                            y_edge = mix(p0.y, p1.y, (tile_x0 - p0.x) / dx);
-                            if (min(p0.x, p1.x) < tile_x0) {
-                                vec2 p = vec2(tile_x0, y_edge);
-                                if (p0.x > p1.x) {
-                                    tile_seg.vector = p - p0;
-                                } else {
-                                    tile_seg.origin = p;
-                                    tile_seg.vector = p1 - p;
-                                }
-                                // kernel4 uses sign(vector.x) for the sign of the intersection backdrop.
-                                // Nudge zeroes towards the intended sign.
-                                if (tile_seg.vector.x == 0) {
-                                    tile_seg.vector.x = sign(p1.x - p0.x)*1e-9;
-                                }
-                            }
-                            if (x <= min_xray || max_xray < x) {
-                                // Reject inconsistent intersections.
-                                y_edge = 1e9;
-                            }
-                        }
-                        tile_seg.y_edge = y_edge;
-                        tile_seg.next.offset = old;
-                        TileSeg_write(tile_alloc.alloc, TileSegRef(tile_offset), tile_seg);
-                        tile_offset += TileSeg_size;
-                    }
-                    xc += b;
-                    base += stride;
-                    xray = next_xray;
-                }
-
-                n_out += 1;
-                target += v_step;
-                p0 = p1;
-            }
-            val_sum += params.val;
-
-            qp0 = qp2;
-        }
-
-        break;
-    }
-}
@@ -1,100 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
-
-// Code auto-generated by piet-gpu-derive
-
-struct PathCubicRef {
-    uint offset;
-};
-
-struct PathSegRef {
-    uint offset;
-};
-
-struct PathCubic {
-    vec2 p0;
-    vec2 p1;
-    vec2 p2;
-    vec2 p3;
-    uint path_ix;
-    uint trans_ix;
-    vec2 stroke;
-};
-
-#define PathCubic_size 48
-
-PathCubicRef PathCubic_index(PathCubicRef ref, uint index) {
-    return PathCubicRef(ref.offset + index * PathCubic_size);
-}
-
-#define PathSeg_Nop 0
-#define PathSeg_Cubic 1
-#define PathSeg_size 52
-
-PathSegRef PathSeg_index(PathSegRef ref, uint index) {
-    return PathSegRef(ref.offset + index * PathSeg_size);
-}
-
-struct PathSegTag {
-   uint tag;
-   uint flags;
-};
-
-PathCubic PathCubic_read(Alloc a, PathCubicRef ref) {
-    uint ix = ref.offset >> 2;
-    uint raw0 = read_mem(a, ix + 0);
-    uint raw1 = read_mem(a, ix + 1);
-    uint raw2 = read_mem(a, ix + 2);
-    uint raw3 = read_mem(a, ix + 3);
-    uint raw4 = read_mem(a, ix + 4);
-    uint raw5 = read_mem(a, ix + 5);
-    uint raw6 = read_mem(a, ix + 6);
-    uint raw7 = read_mem(a, ix + 7);
-    uint raw8 = read_mem(a, ix + 8);
-    uint raw9 = read_mem(a, ix + 9);
-    uint raw10 = read_mem(a, ix + 10);
-    uint raw11 = read_mem(a, ix + 11);
-    PathCubic s;
-    s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
-    s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
-    s.p2 = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
-    s.p3 = vec2(uintBitsToFloat(raw6), uintBitsToFloat(raw7));
-    s.path_ix = raw8;
-    s.trans_ix = raw9;
-    s.stroke = vec2(uintBitsToFloat(raw10), uintBitsToFloat(raw11));
-    return s;
-}
-
-void PathCubic_write(Alloc a, PathCubicRef ref, PathCubic s) {
-    uint ix = ref.offset >> 2;
-    write_mem(a, ix + 0, floatBitsToUint(s.p0.x));
-    write_mem(a, ix + 1, floatBitsToUint(s.p0.y));
-    write_mem(a, ix + 2, floatBitsToUint(s.p1.x));
-    write_mem(a, ix + 3, floatBitsToUint(s.p1.y));
-    write_mem(a, ix + 4, floatBitsToUint(s.p2.x));
-    write_mem(a, ix + 5, floatBitsToUint(s.p2.y));
-    write_mem(a, ix + 6, floatBitsToUint(s.p3.x));
-    write_mem(a, ix + 7, floatBitsToUint(s.p3.y));
-    write_mem(a, ix + 8, s.path_ix);
-    write_mem(a, ix + 9, s.trans_ix);
-    write_mem(a, ix + 10, floatBitsToUint(s.stroke.x));
-    write_mem(a, ix + 11, floatBitsToUint(s.stroke.y));
-}
-
-PathSegTag PathSeg_tag(Alloc a, PathSegRef ref) {
-    uint tag_and_flags = read_mem(a, ref.offset >> 2);
-    return PathSegTag(tag_and_flags & 0xffff, tag_and_flags >> 16);
-}
-
-PathCubic PathSeg_Cubic_read(Alloc a, PathSegRef ref) {
-    return PathCubic_read(a, PathCubicRef(ref.offset + 4));
-}
-
-void PathSeg_Nop_write(Alloc a, PathSegRef ref) {
-    write_mem(a, ref.offset >> 2, PathSeg_Nop);
-}
-
-void PathSeg_Cubic_write(Alloc a, PathSegRef ref, uint flags, PathCubic s) {
-    write_mem(a, ref.offset >> 2, (flags << 16) | PathSeg_Cubic);
-    PathCubic_write(a, PathCubicRef(ref.offset + 4), s);
-}
-
@@ -1,278 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
-
-// Code auto-generated by piet-gpu-derive
-
-struct CmdStrokeRef {
-    uint offset;
-};
-
-struct CmdFillRef {
-    uint offset;
-};
-
-struct CmdColorRef {
-    uint offset;
-};
-
-struct CmdImageRef {
-    uint offset;
-};
-
-struct CmdAlphaRef {
-    uint offset;
-};
-
-struct CmdJumpRef {
-    uint offset;
-};
-
-struct CmdRef {
-    uint offset;
-};
-
-struct CmdStroke {
-    uint tile_ref;
-    float half_width;
-};
-
-#define CmdStroke_size 8
-
-CmdStrokeRef CmdStroke_index(CmdStrokeRef ref, uint index) {
-    return CmdStrokeRef(ref.offset + index * CmdStroke_size);
-}
-
-struct CmdFill {
-    uint tile_ref;
-    int backdrop;
-};
-
-#define CmdFill_size 8
-
-CmdFillRef CmdFill_index(CmdFillRef ref, uint index) {
-    return CmdFillRef(ref.offset + index * CmdFill_size);
-}
-
-struct CmdColor {
-    uint rgba_color;
-};
-
-#define CmdColor_size 4
-
-CmdColorRef CmdColor_index(CmdColorRef ref, uint index) {
-    return CmdColorRef(ref.offset + index * CmdColor_size);
-}
-
-struct CmdImage {
-    uint index;
-    ivec2 offset;
-};
-
-#define CmdImage_size 8
-
-CmdImageRef CmdImage_index(CmdImageRef ref, uint index) {
-    return CmdImageRef(ref.offset + index * CmdImage_size);
-}
-
-struct CmdAlpha {
-    float alpha;
-};
-
-#define CmdAlpha_size 4
-
-CmdAlphaRef CmdAlpha_index(CmdAlphaRef ref, uint index) {
-    return CmdAlphaRef(ref.offset + index * CmdAlpha_size);
-}
-
-struct CmdJump {
-    uint new_ref;
-};
-
-#define CmdJump_size 4
-
-CmdJumpRef CmdJump_index(CmdJumpRef ref, uint index) {
-    return CmdJumpRef(ref.offset + index * CmdJump_size);
-}
-
-#define Cmd_End 0
-#define Cmd_Fill 1
-#define Cmd_Stroke 2
-#define Cmd_Solid 3
-#define Cmd_Alpha 4
-#define Cmd_Color 5
-#define Cmd_Image 6
-#define Cmd_BeginClip 7
-#define Cmd_EndClip 8
-#define Cmd_Jump 9
-#define Cmd_size 12
-
-CmdRef Cmd_index(CmdRef ref, uint index) {
-    return CmdRef(ref.offset + index * Cmd_size);
-}
-
-struct CmdTag {
-   uint tag;
-   uint flags;
-};
-
-CmdStroke CmdStroke_read(Alloc a, CmdStrokeRef ref) {
-    uint ix = ref.offset >> 2;
-    uint raw0 = read_mem(a, ix + 0);
-    uint raw1 = read_mem(a, ix + 1);
-    CmdStroke s;
-    s.tile_ref = raw0;
-    s.half_width = uintBitsToFloat(raw1);
-    return s;
-}
-
-void CmdStroke_write(Alloc a, CmdStrokeRef ref, CmdStroke s) {
-    uint ix = ref.offset >> 2;
-    write_mem(a, ix + 0, s.tile_ref);
-    write_mem(a, ix + 1, floatBitsToUint(s.half_width));
-}
-
-CmdFill CmdFill_read(Alloc a, CmdFillRef ref) {
-    uint ix = ref.offset >> 2;
-    uint raw0 = read_mem(a, ix + 0);
-    uint raw1 = read_mem(a, ix + 1);
-    CmdFill s;
-    s.tile_ref = raw0;
-    s.backdrop = int(raw1);
-    return s;
-}
-
-void CmdFill_write(Alloc a, CmdFillRef ref, CmdFill s) {
-    uint ix = ref.offset >> 2;
-    write_mem(a, ix + 0, s.tile_ref);
-    write_mem(a, ix + 1, uint(s.backdrop));
-}
-
-CmdColor CmdColor_read(Alloc a, CmdColorRef ref) {
-    uint ix = ref.offset >> 2;
-    uint raw0 = read_mem(a, ix + 0);
-    CmdColor s;
-    s.rgba_color = raw0;
-    return s;
-}
-
-void CmdColor_write(Alloc a, CmdColorRef ref, CmdColor s) {
-    uint ix = ref.offset >> 2;
-    write_mem(a, ix + 0, s.rgba_color);
-}
-
-CmdImage CmdImage_read(Alloc a, CmdImageRef ref) {
-    uint ix = ref.offset >> 2;
-    uint raw0 = read_mem(a, ix + 0);
-    uint raw1 = read_mem(a, ix + 1);
-    CmdImage s;
-    s.index = raw0;
-    s.offset = ivec2(int(raw1 << 16) >> 16, int(raw1) >> 16);
-    return s;
-}
-
-void CmdImage_write(Alloc a, CmdImageRef ref, CmdImage s) {
-    uint ix = ref.offset >> 2;
-    write_mem(a, ix + 0, s.index);
-    write_mem(a, ix + 1, (uint(s.offset.x) & 0xffff) | (uint(s.offset.y) << 16));
-}
-
-CmdAlpha CmdAlpha_read(Alloc a, CmdAlphaRef ref) {
-    uint ix = ref.offset >> 2;
-    uint raw0 = read_mem(a, ix + 0);
-    CmdAlpha s;
-    s.alpha = uintBitsToFloat(raw0);
-    return s;
-}
-
-void CmdAlpha_write(Alloc a, CmdAlphaRef ref, CmdAlpha s) {
-    uint ix = ref.offset >> 2;
-    write_mem(a, ix + 0, floatBitsToUint(s.alpha));
-}
-
-CmdJump CmdJump_read(Alloc a, CmdJumpRef ref) {
-    uint ix = ref.offset >> 2;
-    uint raw0 = read_mem(a, ix + 0);
-    CmdJump s;
-    s.new_ref = raw0;
-    return s;
-}
-
-void CmdJump_write(Alloc a, CmdJumpRef ref, CmdJump s) {
-    uint ix = ref.offset >> 2;
-    write_mem(a, ix + 0, s.new_ref);
-}
-
-CmdTag Cmd_tag(Alloc a, CmdRef ref) {
-    uint tag_and_flags = read_mem(a, ref.offset >> 2);
-    return CmdTag(tag_and_flags & 0xffff, tag_and_flags >> 16);
-}
-
-CmdFill Cmd_Fill_read(Alloc a, CmdRef ref) {
-    return CmdFill_read(a, CmdFillRef(ref.offset + 4));
-}
-
-CmdStroke Cmd_Stroke_read(Alloc a, CmdRef ref) {
-    return CmdStroke_read(a, CmdStrokeRef(ref.offset + 4));
-}
-
-CmdAlpha Cmd_Alpha_read(Alloc a, CmdRef ref) {
-    return CmdAlpha_read(a, CmdAlphaRef(ref.offset + 4));
-}
-
-CmdColor Cmd_Color_read(Alloc a, CmdRef ref) {
-    return CmdColor_read(a, CmdColorRef(ref.offset + 4));
-}
-
-CmdImage Cmd_Image_read(Alloc a, CmdRef ref) {
-    return CmdImage_read(a, CmdImageRef(ref.offset + 4));
-}
-
-CmdJump Cmd_Jump_read(Alloc a, CmdRef ref) {
-    return CmdJump_read(a, CmdJumpRef(ref.offset + 4));
-}
-
-void Cmd_End_write(Alloc a, CmdRef ref) {
-    write_mem(a, ref.offset >> 2, Cmd_End);
-}
-
-void Cmd_Fill_write(Alloc a, CmdRef ref, CmdFill s) {
-    write_mem(a, ref.offset >> 2, Cmd_Fill);
-    CmdFill_write(a, CmdFillRef(ref.offset + 4), s);
-}
-
-void Cmd_Stroke_write(Alloc a, CmdRef ref, CmdStroke s) {
-    write_mem(a, ref.offset >> 2, Cmd_Stroke);
-    CmdStroke_write(a, CmdStrokeRef(ref.offset + 4), s);
-}
-
-void Cmd_Solid_write(Alloc a, CmdRef ref) {
-    write_mem(a, ref.offset >> 2, Cmd_Solid);
-}
-
-void Cmd_Alpha_write(Alloc a, CmdRef ref, CmdAlpha s) {
-    write_mem(a, ref.offset >> 2, Cmd_Alpha);
-    CmdAlpha_write(a, CmdAlphaRef(ref.offset + 4), s);
-}
-
-void Cmd_Color_write(Alloc a, CmdRef ref, CmdColor s) {
-    write_mem(a, ref.offset >> 2, Cmd_Color);
-    CmdColor_write(a, CmdColorRef(ref.offset + 4), s);
-}
-
-void Cmd_Image_write(Alloc a, CmdRef ref, CmdImage s) {
-    write_mem(a, ref.offset >> 2, Cmd_Image);
-    CmdImage_write(a, CmdImageRef(ref.offset + 4), s);
-}
-
-void Cmd_BeginClip_write(Alloc a, CmdRef ref) {
-    write_mem(a, ref.offset >> 2, Cmd_BeginClip);
-}
-
-void Cmd_EndClip_write(Alloc a, CmdRef ref) {
-    write_mem(a, ref.offset >> 2, Cmd_EndClip);
-}
-
-void Cmd_Jump_write(Alloc a, CmdRef ref, CmdJump s) {
-    write_mem(a, ref.offset >> 2, Cmd_Jump);
-    CmdJump_write(a, CmdJumpRef(ref.offset + 4), s);
-}
-
@@ -1,313 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
-
-// Code auto-generated by piet-gpu-derive
-
-struct LineSegRef {
-    uint offset;
-};
-
-struct QuadSegRef {
-    uint offset;
-};
-
-struct CubicSegRef {
-    uint offset;
-};
-
-struct FillColorRef {
-    uint offset;
-};
-
-struct FillImageRef {
-    uint offset;
-};
-
-struct SetLineWidthRef {
-    uint offset;
-};
-
-struct TransformRef {
-    uint offset;
-};
-
-struct ClipRef {
-    uint offset;
-};
-
-struct SetFillModeRef {
-    uint offset;
-};
-
-struct ElementRef {
-    uint offset;
-};
-
-struct LineSeg {
-    vec2 p0;
-    vec2 p1;
-};
-
-#define LineSeg_size 16
-
-LineSegRef LineSeg_index(LineSegRef ref, uint index) {
-    return LineSegRef(ref.offset + index * LineSeg_size);
-}
-
-struct QuadSeg {
-    vec2 p0;
-    vec2 p1;
-    vec2 p2;
-};
-
-#define QuadSeg_size 24
-
-QuadSegRef QuadSeg_index(QuadSegRef ref, uint index) {
-    return QuadSegRef(ref.offset + index * QuadSeg_size);
-}
-
-struct CubicSeg {
-    vec2 p0;
-    vec2 p1;
-    vec2 p2;
-    vec2 p3;
-};
-
-#define CubicSeg_size 32
-
-CubicSegRef CubicSeg_index(CubicSegRef ref, uint index) {
-    return CubicSegRef(ref.offset + index * CubicSeg_size);
-}
-
-struct FillColor {
-    uint rgba_color;
-};
-
-#define FillColor_size 4
-
-FillColorRef FillColor_index(FillColorRef ref, uint index) {
-    return FillColorRef(ref.offset + index * FillColor_size);
-}
-
-struct FillImage {
-    uint index;
-    ivec2 offset;
-};
-
-#define FillImage_size 8
-
-FillImageRef FillImage_index(FillImageRef ref, uint index) {
-    return FillImageRef(ref.offset + index * FillImage_size);
-}
-
-struct SetLineWidth {
-    float width;
-};
-
-#define SetLineWidth_size 4
-
-SetLineWidthRef SetLineWidth_index(SetLineWidthRef ref, uint index) {
-    return SetLineWidthRef(ref.offset + index * SetLineWidth_size);
-}
-
-struct Transform {
-    vec4 mat;
-    vec2 translate;
-};
-
-#define Transform_size 24
-
-TransformRef Transform_index(TransformRef ref, uint index) {
-    return TransformRef(ref.offset + index * Transform_size);
-}
-
-struct Clip {
-    vec4 bbox;
-};
-
-#define Clip_size 16
-
-ClipRef Clip_index(ClipRef ref, uint index) {
-    return ClipRef(ref.offset + index * Clip_size);
-}
-
-struct SetFillMode {
-    uint fill_mode;
-};
-
-#define SetFillMode_size 4
-
-SetFillModeRef SetFillMode_index(SetFillModeRef ref, uint index) {
-    return SetFillModeRef(ref.offset + index * SetFillMode_size);
-}
-
-#define Element_Nop 0
-#define Element_Line 1
-#define Element_Quad 2
-#define Element_Cubic 3
-#define Element_FillColor 4
-#define Element_SetLineWidth 5
-#define Element_Transform 6
-#define Element_BeginClip 7
-#define Element_EndClip 8
-#define Element_FillImage 9
-#define Element_SetFillMode 10
-#define Element_size 36
-
-ElementRef Element_index(ElementRef ref, uint index) {
-    return ElementRef(ref.offset + index * Element_size);
-}
-
-struct ElementTag {
-   uint tag;
-   uint flags;
-};
-
-LineSeg LineSeg_read(LineSegRef ref) {
-    uint ix = ref.offset >> 2;
-    uint raw0 = scene[ix + 0];
-    uint raw1 = scene[ix + 1];
-    uint raw2 = scene[ix + 2];
-    uint raw3 = scene[ix + 3];
-    LineSeg s;
-    s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
-    s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
-    return s;
-}
-
-QuadSeg QuadSeg_read(QuadSegRef ref) {
-    uint ix = ref.offset >> 2;
-    uint raw0 = scene[ix + 0];
-    uint raw1 = scene[ix + 1];
-    uint raw2 = scene[ix + 2];
-    uint raw3 = scene[ix + 3];
-    uint raw4 = scene[ix + 4];
-    uint raw5 = scene[ix + 5];
-    QuadSeg s;
-    s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
-    s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
-    s.p2 = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
-    return s;
-}
-
-CubicSeg CubicSeg_read(CubicSegRef ref) {
-    uint ix = ref.offset >> 2;
-    uint raw0 = scene[ix + 0];
-    uint raw1 = scene[ix + 1];
-    uint raw2 = scene[ix + 2];
-    uint raw3 = scene[ix + 3];
-    uint raw4 = scene[ix + 4];
-    uint raw5 = scene[ix + 5];
-    uint raw6 = scene[ix + 6];
-    uint raw7 = scene[ix + 7];
-    CubicSeg s;
-    s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
-    s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
-    s.p2 = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
-    s.p3 = vec2(uintBitsToFloat(raw6), uintBitsToFloat(raw7));
-    return s;
-}
-
-FillColor FillColor_read(FillColorRef ref) {
-    uint ix = ref.offset >> 2;
-    uint raw0 = scene[ix + 0];
-    FillColor s;
-    s.rgba_color = raw0;
-    return s;
-}
-
-FillImage FillImage_read(FillImageRef ref) {
-    uint ix = ref.offset >> 2;
-    uint raw0 = scene[ix + 0];
-    uint raw1 = scene[ix + 1];
-    FillImage s;
-    s.index = raw0;
-    s.offset = ivec2(int(raw1 << 16) >> 16, int(raw1) >> 16);
-    return s;
-}
-
-SetLineWidth SetLineWidth_read(SetLineWidthRef ref) {
-    uint ix = ref.offset >> 2;
-    uint raw0 = scene[ix + 0];
-    SetLineWidth s;
-    s.width = uintBitsToFloat(raw0);
-    return s;
-}
-
-Transform Transform_read(TransformRef ref) {
-    uint ix = ref.offset >> 2;
-    uint raw0 = scene[ix + 0];
-    uint raw1 = scene[ix + 1];
-    uint raw2 = scene[ix + 2];
-    uint raw3 = scene[ix + 3];
-    uint raw4 = scene[ix + 4];
-    uint raw5 = scene[ix + 5];
-    Transform s;
-    s.mat = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
-    s.translate = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
-    return s;
-}
-
-Clip Clip_read(ClipRef ref) {
-    uint ix = ref.offset >> 2;
-    uint raw0 = scene[ix + 0];
-    uint raw1 = scene[ix + 1];
-    uint raw2 = scene[ix + 2];
-    uint raw3 = scene[ix + 3];
-    Clip s;
-    s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
-    return s;
-}
-
-SetFillMode SetFillMode_read(SetFillModeRef ref) {
-    uint ix = ref.offset >> 2;
-    uint raw0 = scene[ix + 0];
-    SetFillMode s;
-    s.fill_mode = raw0;
-    return s;
-}
-
-ElementTag Element_tag(ElementRef ref) {
-    uint tag_and_flags = scene[ref.offset >> 2];
-    return ElementTag(tag_and_flags & 0xffff, tag_and_flags >> 16);
-}
-
-LineSeg Element_Line_read(ElementRef ref) {
-    return LineSeg_read(LineSegRef(ref.offset + 4));
-}
-
-QuadSeg Element_Quad_read(ElementRef ref) {
-    return QuadSeg_read(QuadSegRef(ref.offset + 4));
-}
-
-CubicSeg Element_Cubic_read(ElementRef ref) {
-    return CubicSeg_read(CubicSegRef(ref.offset + 4));
-}
-
-FillColor Element_FillColor_read(ElementRef ref) {
-    return FillColor_read(FillColorRef(ref.offset + 4));
-}
-
-SetLineWidth Element_SetLineWidth_read(ElementRef ref) {
-    return SetLineWidth_read(SetLineWidthRef(ref.offset + 4));
-}
-
-Transform Element_Transform_read(ElementRef ref) {
-    return Transform_read(TransformRef(ref.offset + 4));
-}
-
-Clip Element_BeginClip_read(ElementRef ref) {
-    return Clip_read(ClipRef(ref.offset + 4));
-}
-
-Clip Element_EndClip_read(ElementRef ref) {
-    return Clip_read(ClipRef(ref.offset + 4));
-}
-
-FillImage Element_FillImage_read(ElementRef ref) {
-    return FillImage_read(FillImageRef(ref.offset + 4));
-}
-
-SetFillMode Element_SetFillMode_read(ElementRef ref) {
-    return SetFillMode_read(SetFillModeRef(ref.offset + 4));
-}
-
@@ -1,51 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
-
-// Various constants for the sizes of groups and tiles.
-
-// Much of this will be made dynamic in various ways, but for now it's easiest
-// to hardcode and keep all in one place.
-
-// A LG_WG_FACTOR of n scales workgroup sizes by 2^n. Use 0 for a
-// maximum workgroup size of 128, or 1 for a maximum size of 256.
-#define LG_WG_FACTOR 0
-#define WG_FACTOR (1<<LG_WG_FACTOR)
-
-#define TILE_WIDTH_PX 32
-#define TILE_HEIGHT_PX 32
-
-#define PTCL_INITIAL_ALLOC 1024
-
-// These should probably be renamed and/or reworked. In the binning
-// kernel, they represent the number of bins. Also, the workgroup size
-// of that kernel is equal to the number of bins, but should probably
-// be more flexible (it's 512 in the K&L paper).
-#define N_TILE_X 16
-#define N_TILE_Y (8 * WG_FACTOR)
-#define N_TILE (N_TILE_X * N_TILE_Y)
-#define LG_N_TILE (7 + LG_WG_FACTOR)
-#define N_SLICE (N_TILE / 32)
-
-struct Config {
-    uint n_elements; // paths
-    uint n_pathseg;
-    uint width_in_tiles;
-    uint height_in_tiles;
-    Alloc tile_alloc;
-    Alloc bin_alloc;
-    Alloc ptcl_alloc;
-    Alloc pathseg_alloc;
-    Alloc anno_alloc;
-    Alloc trans_alloc;
-};
-
-// Fill modes.
-#define MODE_NONZERO 0
-#define MODE_STROKE 1
-
-// Size of kernel4 clip state, in words.
-#define CLIP_STATE_SIZE 2
-
-// fill_mode_from_flags extracts the fill mode from tag flags.
-uint fill_mode_from_flags(uint flags) {
-    return flags & 0x1;
-}
@@ -1,73 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
-
-// Code auto-generated by piet-gpu-derive
-
-struct StateRef {
-    uint offset;
-};
-
-struct State {
-    vec4 mat;
-    vec2 translate;
-    vec4 bbox;
-    float linewidth;
-    uint flags;
-    uint path_count;
-    uint pathseg_count;
-    uint trans_count;
-};
-
-#define State_size 60
-
-StateRef State_index(StateRef ref, uint index) {
-    return StateRef(ref.offset + index * State_size);
-}
-
-State State_read(StateRef ref) {
-    uint ix = ref.offset >> 2;
-    uint raw0 = state[ix + 0];
-    uint raw1 = state[ix + 1];
-    uint raw2 = state[ix + 2];
-    uint raw3 = state[ix + 3];
-    uint raw4 = state[ix + 4];
-    uint raw5 = state[ix + 5];
-    uint raw6 = state[ix + 6];
-    uint raw7 = state[ix + 7];
-    uint raw8 = state[ix + 8];
-    uint raw9 = state[ix + 9];
-    uint raw10 = state[ix + 10];
-    uint raw11 = state[ix + 11];
-    uint raw12 = state[ix + 12];
-    uint raw13 = state[ix + 13];
-    uint raw14 = state[ix + 14];
-    State s;
-    s.mat = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
-    s.translate = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
-    s.bbox = vec4(uintBitsToFloat(raw6), uintBitsToFloat(raw7), uintBitsToFloat(raw8), uintBitsToFloat(raw9));
-    s.linewidth = uintBitsToFloat(raw10);
-    s.flags = raw11;
-    s.path_count = raw12;
-    s.pathseg_count = raw13;
-    s.trans_count = raw14;
-    return s;
-}
-
-void State_write(StateRef ref, State s) {
-    uint ix = ref.offset >> 2;
-    state[ix + 0] = floatBitsToUint(s.mat.x);
-    state[ix + 1] = floatBitsToUint(s.mat.y);
-    state[ix + 2] = floatBitsToUint(s.mat.z);
-    state[ix + 3] = floatBitsToUint(s.mat.w);
-    state[ix + 4] = floatBitsToUint(s.translate.x);
-    state[ix + 5] = floatBitsToUint(s.translate.y);
-    state[ix + 6] = floatBitsToUint(s.bbox.x);
-    state[ix + 7] = floatBitsToUint(s.bbox.y);
-    state[ix + 8] = floatBitsToUint(s.bbox.z);
-    state[ix + 9] = floatBitsToUint(s.bbox.w);
-    state[ix + 10] = floatBitsToUint(s.linewidth);
-    state[ix + 11] = s.flags;
-    state[ix + 12] = s.path_count;
-    state[ix + 13] = s.pathseg_count;
-    state[ix + 14] = s.trans_count;
-}
-
@@ -1,81 +0,0 @@
-#version 310 es
-
-// SPDX-License-Identifier: Unlicense OR MIT
-
-precision mediump float;
-
-layout(location=0) in vec2 vFrom;
-layout(location=1) in vec2 vCtrl;
-layout(location=2) in vec2 vTo;
-
-layout(location = 0) out vec4 fragCover;
-
-void main() {
-	float dx = vTo.x - vFrom.x;
-	// Sort from and to in increasing order so the root below
-	// is always the positive square root, if any.
-	// We need the direction of the curve below, so this can't be
-	// done from the vertex shader.
-	bool increasing = vTo.x >= vFrom.x;
-	vec2 left = increasing ? vFrom : vTo;
-	vec2 right = increasing ? vTo : vFrom;
-
-	// The signed horizontal extent of the fragment.
-	vec2 extent = clamp(vec2(vFrom.x, vTo.x), -0.5, 0.5);
-	// Find the t where the curve crosses the middle of the
-	// extent, x₀.
-	// Given the Bézier curve with x coordinates P₀, P₁, P₂
-	// where P₀ is at the origin, its x coordinate in t
-	// is given by:
-	//
-	// x(t) = 2(1-t)tP₁ + t²P₂
-	// 
-	// Rearranging:
-	//
-	// x(t) = (P₂ - 2P₁)t² + 2P₁t
-	//
-	// Setting x(t) = x₀ and using Muller's quadratic formula ("Citardauq")
-	// for robustnesss,
-	//
-	// t = 2x₀/(2P₁±√(4P₁²+4(P₂-2P₁)x₀))
-	//
-	// which simplifies to
-	//
-	// t = x₀/(P₁±√(P₁²+(P₂-2P₁)x₀))
-	//
-	// Setting v = P₂-P₁,
-	//
-	// t = x₀/(P₁±√(P₁²+(v-P₁)x₀))
-	//
-	// t lie in [0; 1]; P₂ ≥ P₁ and P₁ ≥ 0 since we split curves where
-	// the control point lies before the start point or after the end point.
-	// It can then be shown that only the positive square root is valid.
-	float midx = mix(extent.x, extent.y, 0.5);
-	float x0 = midx - left.x;
-	vec2 p1 = vCtrl - left;
-	vec2 v = right - vCtrl;
-	float t = x0/(p1.x+sqrt(p1.x*p1.x+(v.x-p1.x)*x0));
-	// Find y(t) on the curve.
-	float y = mix(mix(left.y, vCtrl.y, t), mix(vCtrl.y, right.y, t), t);
-	// And the slope.
-	vec2 d_half = mix(p1, v, t);
-	float dy = d_half.y/d_half.x;
-	// Together, y and dy form a line approximation.
-
-	// Compute the fragment area above the line.
-	// The area is symmetric around dy = 0. Scale slope with extent width.
-	float width = extent.y - extent.x;
-	dy = abs(dy*width);
-
-	vec4 sides = vec4(dy*+0.5 + y, dy*-0.5 + y, (+0.5-y)/dy, (-0.5-y)/dy);
-	sides = clamp(sides+0.5, 0.0, 1.0);
-
-	float area = 0.5*(sides.z - sides.z*sides.y + 1.0 - sides.x+sides.x*sides.w);
-	area *= width;
-
-	// Work around issue #13.
-	if (width == 0.0)
-		area = 0.0;
-
-	fragCover.r = area;
-}
@@ -1,53 +0,0 @@
-#version 310 es
-
-// SPDX-License-Identifier: Unlicense OR MIT
-
-precision highp float;
-
-layout(binding = 0) uniform Block {
-	vec4 transform;
-	vec2 pathOffset;
-} _block;
-
-layout(location=0) in float corner;
-layout(location=1) in float maxy;
-layout(location=2) in vec2 from;
-layout(location=3) in vec2 ctrl;
-layout(location=4) in vec2 to;
-
-layout(location=0) out vec2 vFrom;
-layout(location=1) out vec2 vCtrl;
-layout(location=2) out vec2 vTo;
-
-void main() {
-	// Add a one pixel overlap so curve quads cover their
-	// entire curves. Could use conservative rasterization
-	// if available.
-	vec2 from = from + _block.pathOffset;
-	vec2 ctrl = ctrl + _block.pathOffset;
-	vec2 to = to + _block.pathOffset;
-	float maxy = maxy + _block.pathOffset.y;
-	vec2 pos;
-	float c = corner;
-	if (c >= 0.375) {
-		// North.
-		c -= 0.5;
-		pos.y = maxy + 1.0;
-	} else {
-		// South.
-		pos.y = min(min(from.y, ctrl.y), to.y) - 1.0;
-	}
-	if (c >= 0.125) {
-		// East.
-		pos.x = max(max(from.x, ctrl.x), to.x)+1.0;
-	} else {
-		// West.
-		pos.x = min(min(from.x, ctrl.x), to.x)-1.0;
-	}
-	vFrom = from-pos;
-	vCtrl = ctrl-pos;
-	vTo = to-pos;
-	pos = pos*_block.transform.xy + _block.transform.zw;
-	gl_Position = vec4(pos, 1, 1);
-}
-
@@ -1,150 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
-
-// Code auto-generated by piet-gpu-derive
-
-struct PathRef {
-    uint offset;
-};
-
-struct TileRef {
-    uint offset;
-};
-
-struct TileSegRef {
-    uint offset;
-};
-
-struct TransformSegRef {
-    uint offset;
-};
-
-struct Path {
-    uvec4 bbox;
-    TileRef tiles;
-};
-
-#define Path_size 12
-
-PathRef Path_index(PathRef ref, uint index) {
-    return PathRef(ref.offset + index * Path_size);
-}
-
-struct Tile {
-    TileSegRef tile;
-    int backdrop;
-};
-
-#define Tile_size 8
-
-TileRef Tile_index(TileRef ref, uint index) {
-    return TileRef(ref.offset + index * Tile_size);
-}
-
-struct TileSeg {
-    vec2 origin;
-    vec2 vector;
-    float y_edge;
-    TileSegRef next;
-};
-
-#define TileSeg_size 24
-
-TileSegRef TileSeg_index(TileSegRef ref, uint index) {
-    return TileSegRef(ref.offset + index * TileSeg_size);
-}
-
-struct TransformSeg {
-    vec4 mat;
-    vec2 translate;
-};
-
-#define TransformSeg_size 24
-
-TransformSegRef TransformSeg_index(TransformSegRef ref, uint index) {
-    return TransformSegRef(ref.offset + index * TransformSeg_size);
-}
-
-Path Path_read(Alloc a, PathRef ref) {
-    uint ix = ref.offset >> 2;
-    uint raw0 = read_mem(a, ix + 0);
-    uint raw1 = read_mem(a, ix + 1);
-    uint raw2 = read_mem(a, ix + 2);
-    Path s;
-    s.bbox = uvec4(raw0 & 0xffff, raw0 >> 16, raw1 & 0xffff, raw1 >> 16);
-    s.tiles = TileRef(raw2);
-    return s;
-}
-
-void Path_write(Alloc a, PathRef ref, Path s) {
-    uint ix = ref.offset >> 2;
-    write_mem(a, ix + 0, s.bbox.x | (s.bbox.y << 16));
-    write_mem(a, ix + 1, s.bbox.z | (s.bbox.w << 16));
-    write_mem(a, ix + 2, s.tiles.offset);
-}
-
-Tile Tile_read(Alloc a, TileRef ref) {
-    uint ix = ref.offset >> 2;
-    uint raw0 = read_mem(a, ix + 0);
-    uint raw1 = read_mem(a, ix + 1);
-    Tile s;
-    s.tile = TileSegRef(raw0);
-    s.backdrop = int(raw1);
-    return s;
-}
-
-void Tile_write(Alloc a, TileRef ref, Tile s) {
-    uint ix = ref.offset >> 2;
-    write_mem(a, ix + 0, s.tile.offset);
-    write_mem(a, ix + 1, uint(s.backdrop));
-}
-
-TileSeg TileSeg_read(Alloc a, TileSegRef ref) {
-    uint ix = ref.offset >> 2;
-    uint raw0 = read_mem(a, ix + 0);
-    uint raw1 = read_mem(a, ix + 1);
-    uint raw2 = read_mem(a, ix + 2);
-    uint raw3 = read_mem(a, ix + 3);
-    uint raw4 = read_mem(a, ix + 4);
-    uint raw5 = read_mem(a, ix + 5);
-    TileSeg s;
-    s.origin = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
-    s.vector = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
-    s.y_edge = uintBitsToFloat(raw4);
-    s.next = TileSegRef(raw5);
-    return s;
-}
-
-void TileSeg_write(Alloc a, TileSegRef ref, TileSeg s) {
-    uint ix = ref.offset >> 2;
-    write_mem(a, ix + 0, floatBitsToUint(s.origin.x));
-    write_mem(a, ix + 1, floatBitsToUint(s.origin.y));
-    write_mem(a, ix + 2, floatBitsToUint(s.vector.x));
-    write_mem(a, ix + 3, floatBitsToUint(s.vector.y));
-    write_mem(a, ix + 4, floatBitsToUint(s.y_edge));
-    write_mem(a, ix + 5, s.next.offset);
-}
-
-TransformSeg TransformSeg_read(Alloc a, TransformSegRef ref) {
-    uint ix = ref.offset >> 2;
-    uint raw0 = read_mem(a, ix + 0);
-    uint raw1 = read_mem(a, ix + 1);
-    uint raw2 = read_mem(a, ix + 2);
-    uint raw3 = read_mem(a, ix + 3);
-    uint raw4 = read_mem(a, ix + 4);
-    uint raw5 = read_mem(a, ix + 5);
-    TransformSeg s;
-    s.mat = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
-    s.translate = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
-    return s;
-}
-
-void TransformSeg_write(Alloc a, TransformSegRef ref, TransformSeg s) {
-    uint ix = ref.offset >> 2;
-    write_mem(a, ix + 0, floatBitsToUint(s.mat.x));
-    write_mem(a, ix + 1, floatBitsToUint(s.mat.y));
-    write_mem(a, ix + 2, floatBitsToUint(s.mat.z));
-    write_mem(a, ix + 3, floatBitsToUint(s.mat.w));
-    write_mem(a, ix + 4, floatBitsToUint(s.translate.x));
-    write_mem(a, ix + 5, floatBitsToUint(s.translate.y));
-}
-
@@ -1,104 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
-
-// Allocation and initialization of tiles for paths.
-
-#version 450
-#extension GL_GOOGLE_include_directive : enable
-
-#include "mem.h"
-#include "setup.h"
-
-#define LG_TILE_ALLOC_WG (7 + LG_WG_FACTOR)
-#define TILE_ALLOC_WG (1 << LG_TILE_ALLOC_WG)
-
-layout(local_size_x = TILE_ALLOC_WG, local_size_y = 1) in;
-
-layout(set = 0, binding = 1) readonly buffer ConfigBuf {
-    Config conf;
-};
-
-#include "annotated.h"
-#include "tile.h"
-
-// scale factors useful for converting coordinates to tiles
-#define SX (1.0 / float(TILE_WIDTH_PX))
-#define SY (1.0 / float(TILE_HEIGHT_PX))
-
-shared uint sh_tile_count[TILE_ALLOC_WG];
-shared MallocResult sh_tile_alloc;
-
-void main() {
-    uint th_ix = gl_LocalInvocationID.x;
-    uint element_ix = gl_GlobalInvocationID.x;
-    PathRef path_ref = PathRef(conf.tile_alloc.offset + element_ix * Path_size);
-    AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
-
-    uint tag = Annotated_Nop;
-    if (element_ix < conf.n_elements) {
-        tag = Annotated_tag(conf.anno_alloc, ref).tag;
-    }
-    int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
-    switch (tag) {
-    case Annotated_Color:
-    case Annotated_Image:
-    case Annotated_BeginClip:
-    case Annotated_EndClip:
-        // Note: we take advantage of the fact that fills, strokes, and
-        // clips have compatible layout.
-        AnnoEndClip clip = Annotated_EndClip_read(conf.anno_alloc, ref);
-        x0 = int(floor(clip.bbox.x * SX));
-        y0 = int(floor(clip.bbox.y * SY));
-        x1 = int(ceil(clip.bbox.z * SX));
-        y1 = int(ceil(clip.bbox.w * SY));
-        break;
-    }
-    x0 = clamp(x0, 0, int(conf.width_in_tiles));
-    y0 = clamp(y0, 0, int(conf.height_in_tiles));
-    x1 = clamp(x1, 0, int(conf.width_in_tiles));
-    y1 = clamp(y1, 0, int(conf.height_in_tiles));
-
-    Path path;
-    path.bbox = uvec4(x0, y0, x1, y1);
-    uint tile_count = (x1 - x0) * (y1 - y0);
-    if (tag == Annotated_EndClip) {
-        // Don't actually allocate tiles for an end clip, but we do want
-        // the path structure (especially bbox) allocated for it.
-        tile_count = 0;
-    }
-
-    sh_tile_count[th_ix] = tile_count;
-    uint total_tile_count = tile_count;
-    // Prefix sum of sh_tile_count
-    for (uint i = 0; i < LG_TILE_ALLOC_WG; i++) {
-        barrier();
-        if (th_ix >= (1 << i)) {
-            total_tile_count += sh_tile_count[th_ix - (1 << i)];
-        }
-        barrier();
-        sh_tile_count[th_ix] = total_tile_count;
-    }
-    if (th_ix == TILE_ALLOC_WG - 1) {
-        sh_tile_alloc = malloc(total_tile_count * Tile_size);
-    }
-    barrier();
-    MallocResult alloc_start = sh_tile_alloc;
-    if (alloc_start.failed || mem_error != NO_ERROR) {
-        return;
-    }
-
-    if (element_ix < conf.n_elements) {
-        uint tile_subix = th_ix > 0 ? sh_tile_count[th_ix - 1] : 0;
-        Alloc tiles_alloc = slice_mem(alloc_start.alloc, Tile_size * tile_subix, Tile_size * tile_count);
-        path.tiles = TileRef(tiles_alloc.offset);
-        Path_write(conf.tile_alloc, path_ref, path);
-    }
-
-    // Zero out allocated tiles efficiently
-    uint total_count = sh_tile_count[TILE_ALLOC_WG - 1] * (Tile_size / 4);
-    uint start_ix = alloc_start.alloc.offset >> 2;
-    for (uint i = th_ix; i < total_count; i += TILE_ALLOC_WG) {
-        // Note: this interleaving is faster than using Tile_write
-        // by a significant amount.
-        write_mem(alloc_start.alloc, start_ix + i, 0);
-    }
-}