mirror of
https://git.sr.ht/~eliasnaur/gio
synced 2026-07-01 15:45:38 +00:00
0218546161
The piet-gpu project is dual licensed under the Apache 2.0 and MIT, and the shaders themselves are also offered under the UNLICENSE terms. See https://github.com/linebender/piet-gpu#license-and-contributions, as of commit 72e2dfab3da8ae1adf7a0fb056b71ccbc4cfa29a: "The piet-gpu project is dual-licensed under both Apache 2.0 and MIT licenses. In addition, the shaders are provided under the terms of the Unlicense. The intent is for this research to be used in as broad a context as possible." Signed-off-by: Elias Naur <mail@eliasnaur.com>
153 lines
5.0 KiB
Plaintext
153 lines
5.0 KiB
Plaintext
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
|
|
|
|
// The binning stage of the pipeline.
|
|
//
|
|
// Each workgroup processes N_TILE paths.
|
|
// Each thread processes one path and calculates a N_TILE_X x N_TILE_Y coverage mask
|
|
// based on the path bounding box to bin the paths.
|
|
|
|
#version 450
|
|
#extension GL_GOOGLE_include_directive : enable
|
|
|
|
#include "mem.h"
|
|
#include "setup.h"
|
|
|
|
// Workgroup shape: N_TILE invocations along x. main() maps each invocation to
// one element (element_ix = workgroup id * N_TILE + local id).
layout(local_size_x = N_TILE, local_size_y = 1) in;
|
|
|
|
// Read-only configuration block at set 0, binding 1. The Config type is
// declared outside this file (presumably in setup.h -- confirm). main() reads
// n_elements, anno_alloc, bin_alloc, width_in_tiles and height_in_tiles from it.
layout(set = 0, binding = 1) readonly buffer ConfigBuf {
|
|
Config conf;
|
|
};
|
|
|
|
#include "annotated.h"
|
|
#include "bins.h"
|
|
|
|
// Scale factors for converting coordinates to bins: main() multiplies the
// bounding box coordinates by SX / SY and rounds outwards to get bin indices.
// A bin spans N_TILE_X x N_TILE_Y tiles, so SX and SY are the reciprocals of
// one bin's extent (N_TILE_X * TILE_WIDTH_PX by N_TILE_Y * TILE_HEIGHT_PX).
|
|
#define SX (1.0 / float(N_TILE_X * TILE_WIDTH_PX))
|
|
#define SY (1.0 / float(N_TILE_Y * TILE_HEIGHT_PX))
|
|
|
|
// Positive infinity; GLSL has no built-in constant for it. An alternative that
// avoids the division by zero is uintBitsToFloat(0x7f800000).
// Not referenced by main() in this file.
|
|
#define INFINITY (1.0 / 0.0)
|
|
|
|
// Note: cudaraster has N_TILE + 1 to cut down on bank conflicts.
|
|
// Coverage bitmaps, indexed [slice][bin]. Element e of the workgroup owns bit
// (e & 31) of slice (e / 32); the bit is set when the element's bounding box
// touches the bin.
// Bitmaps are sliced (256bit into 8 (N_SLICE) 32bit submaps)
|
|
shared uint bitmaps[N_SLICE][N_TILE];
|
|
// count[i][bin] is the running total of set bits in slices 0..i of that bin,
// filled in by the invocation whose local id equals the bin index.
shared uint count[N_SLICE][N_TILE];
|
|
// Output chunk allocated for each bin; only written for bins that received at
// least one element.
shared Alloc sh_chunk_alloc[N_TILE];
|
|
// Workgroup-wide abort flag. It is only tested right after a barrier(), so all
// invocations of the workgroup take the same decision.
shared bool sh_alloc_failed;
|
|
|
|
// Binning entry point.
//
// Each invocation handles one annotated element (element_ix) and each
// workgroup handles one partition of N_TILE consecutive elements. The work is
// done in three phases separated by barriers:
//   1. every invocation marks the bins touched by its element's bounding box
//      in the shared coverage bitmaps;
//   2. every invocation switches roles and acts for the bin whose index equals
//      its local id: it counts the elements covering that bin, allocates an
//      output chunk and writes (count, chunk offset) to conf.bin_alloc;
//   3. every invocation goes back to its element and writes a BinInstance
//      into the chunk of every bin it covers.
//
// NOTE(review): bin indices (y * width_in_bins + x) index arrays of N_TILE
// entries; this relies on width_in_bins * height_in_bins <= N_TILE, which is
// not enforced here -- confirm against the host-side limits.
void main() {
    uint my_n_elements = conf.n_elements;
    uint my_partition = gl_WorkGroupID.x;

    // Clear this invocation's column of the coverage bitmaps.
    for (uint i = 0; i < N_SLICE; i++) {
        bitmaps[i][gl_LocalInvocationID.x] = 0;
    }

    // barrier() must be reached by all invocations of the workgroup or by
    // none. mem_error is a buffer value (declared outside this file) that
    // presumably can be raised concurrently by other invocations, so letting
    // each invocation test it on its own could split the workgroup in front
    // of the barriers below. Invocation 0 samples it once and publishes the
    // result through the shared abort flag instead.
    if (gl_LocalInvocationID.x == 0) {
        sh_alloc_failed = mem_error != NO_ERROR;
    }
    barrier();
    if (sh_alloc_failed) {
        // A previous stage already failed; skip all work, as a whole workgroup.
        return;
    }

    // Read inputs and determine coverage of bins
    uint element_ix = my_partition * N_TILE + gl_LocalInvocationID.x;
    AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
    // Invocations past the end of the element list behave like a Nop element.
    uint tag = Annotated_Nop;
    if (element_ix < my_n_elements) {
        tag = Annotated_tag(conf.anno_alloc, ref);
    }
    // Bounding box in bin coordinates, half-open: [x0, x1) x [y0, y1).
    int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
    switch (tag) {
    case Annotated_Fill:
    case Annotated_FillTexture:
    case Annotated_Stroke:
    case Annotated_BeginClip:
    case Annotated_EndClip:
        // Note: we take advantage of the fact that these drawing elements
        // have the bbox at the same place in their layout.
        AnnoFill fill = Annotated_Fill_read(conf.anno_alloc, ref);
        x0 = int(floor(fill.bbox.x * SX));
        y0 = int(floor(fill.bbox.y * SY));
        x1 = int(ceil(fill.bbox.z * SX));
        y1 = int(ceil(fill.bbox.w * SY));
        break;
    }

    // At this point, we run an iterator over the coverage area,
    // trying to keep divergence low.
    // Right now, it's just a bbox, but we'll get finer with
    // segments.
    uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1)/N_TILE_X;
    uint height_in_bins = (conf.height_in_tiles + N_TILE_Y - 1)/N_TILE_Y;
    // Clip the box to the bin grid; the lower bound of x1/y1 keeps the
    // ranges well-formed (never x1 < x0 or y1 < y0).
    x0 = clamp(x0, 0, int(width_in_bins));
    x1 = clamp(x1, x0, int(width_in_bins));
    y0 = clamp(y0, 0, int(height_in_bins));
    y1 = clamp(y1, y0, int(height_in_bins));
    // An empty column range must also empty the row range: the loops below
    // step x before comparing it with x1 and would otherwise never wrap.
    if (x0 == x1) {
        y1 = y0;
    }
    int x = x0, y = y0;
    // This element's bit position inside the sliced bitmaps.
    uint my_slice = gl_LocalInvocationID.x / 32;
    // Unsigned literal so that bit 31 is produced without going through a
    // signed int.
    uint my_mask = 1u << (gl_LocalInvocationID.x & 31);
    while (y < y1) {
        atomicOr(bitmaps[my_slice][y * width_in_bins + x], my_mask);
        x++;
        if (x == x1) {
            x = x0;
            y++;
        }
    }

    barrier();
    // Allocate output segments.
    // From here until the next barrier the invocation acts for the bin whose
    // index equals its local id. count[i][bin] receives the running total of
    // covering elements in slices 0..i.
    uint element_count = 0;
    for (uint i = 0; i < N_SLICE; i++) {
        element_count += bitCount(bitmaps[i][gl_LocalInvocationID.x]);
        count[i][gl_LocalInvocationID.x] = element_count;
    }
    // element_count is number of elements covering bin for this invocation.
    Alloc chunk_alloc = new_alloc(0, 0);
    if (element_count != 0) {
        // TODO: aggregate atomic adds (subgroup is probably fastest)
        MallocResult chunk = malloc(element_count * BinInstance_size);
        chunk_alloc = chunk.alloc;
        sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc;
        if (chunk.failed) {
            sh_alloc_failed = true;
        }
    }
    // Note: it might be more efficient for reading to do this in the
    // other order (each bin is a contiguous sequence of partitions)
    // Two words per (partition, bin) pair: element count, then chunk offset.
    uint out_ix = (conf.bin_alloc.offset >> 2) + (my_partition * N_TILE + gl_LocalInvocationID.x) * 2;
    write_mem(conf.bin_alloc, out_ix, element_count);
    write_mem(conf.bin_alloc, out_ix + 1, chunk_alloc.offset);

    barrier();
    // If any bin's allocation failed, nobody writes instances.
    if (sh_alloc_failed) {
        return;
    }

    // Use similar strategy as Laine & Karras paper; loop over bbox of bins
    // touched by this element
    x = x0;
    y = y0;
    while (y < y1) {
        uint bin_ix = y * width_in_bins + x;
        uint out_mask = bitmaps[my_slice][bin_ix];
        if ((out_mask & my_mask) != 0) {
            // Slot inside the bin's chunk = number of lower-numbered elements
            // of this workgroup covering the same bin: the bits below ours in
            // our slice plus the total of all previous slices.
            uint idx = bitCount(out_mask & (my_mask - 1));
            if (my_slice > 0) {
                idx += count[my_slice - 1][bin_ix];
            }
            Alloc out_alloc = sh_chunk_alloc[bin_ix];
            uint out_offset = out_alloc.offset + idx * BinInstance_size;
            BinInstance_write(out_alloc, BinInstanceRef(out_offset), BinInstance(element_ix));
        }
        x++;
        if (x == x1) {
            x = x0;
            y++;
        }
    }
}
|