gpu/shaders: update piet-gpu

Changes:
- faster implementation of RGBA output
- fix stroked clips and images

Signed-off-by: Elias Naur <mail@eliasnaur.com>
This commit is contained in:
Elias Naur
2021-03-31 19:55:29 +02:00
parent 5e1a662b94
commit ebf2dcea50
6 changed files with 106 additions and 128 deletions
+37 -44
View File
@@ -91,6 +91,23 @@ bool alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit
return true;
}
void write_fill(Alloc alloc, inout CmdRef cmd_ref, uint flags, Tile tile, float linewidth) {
if (fill_mode_from_flags(flags) == MODE_NONZERO) {
if (tile.tile.offset != 0) {
CmdFill cmd_fill = CmdFill(tile.tile.offset, tile.backdrop);
Cmd_Fill_write(alloc, cmd_ref, cmd_fill);
cmd_ref.offset += 4 + CmdFill_size;
} else {
Cmd_Solid_write(alloc, cmd_ref);
cmd_ref.offset += 4;
}
} else {
CmdStroke cmd_stroke = CmdStroke(tile.tile.offset, 0.5 * linewidth);
Cmd_Stroke_write(alloc, cmd_ref, cmd_stroke);
cmd_ref.offset += 4 + CmdStroke_size;
}
}
void main() {
if (mem_error != NO_ERROR) {
return;
@@ -135,6 +152,12 @@ void main() {
uint part_start_ix = 0;
uint ready_ix = 0;
// Leave room for the fine rasterizer scratch allocation.
Alloc scratch_alloc = slice_mem(cmd_alloc, 0, Alloc_size);
cmd_ref.offset += Alloc_size;
uint num_begin_slots = 0;
uint begin_slot = 0;
while (true) {
for (uint i = 0; i < N_SLICE; i++) {
sh_bitmaps[i][th_ix] = 0;
@@ -320,20 +343,7 @@ void main() {
if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
break;
}
if (fill_mode_from_flags(tag.flags) == MODE_NONZERO) {
if (tile.tile.offset != 0) {
CmdFill cmd_fill = CmdFill(tile.tile.offset, tile.backdrop);
Cmd_Fill_write(cmd_alloc, cmd_ref, cmd_fill);
cmd_ref.offset += 4 + CmdFill_size;
} else {
Cmd_Solid_write(cmd_alloc, cmd_ref);
cmd_ref.offset += 4;
}
} else {
CmdStroke cmd_stroke = CmdStroke(tile.tile.offset, 0.5 * fill.linewidth);
Cmd_Stroke_write(cmd_alloc, cmd_ref, cmd_stroke);
cmd_ref.offset += 4 + CmdStroke_size;
}
write_fill(cmd_alloc, cmd_ref, tag.flags, tile, fill.linewidth);
Cmd_Color_write(cmd_alloc, cmd_ref, CmdColor(fill.rgba_color));
cmd_ref.offset += 4 + CmdColor_size;
break;
@@ -344,20 +354,7 @@ void main() {
if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
break;
}
if (fill_mode_from_flags(tag.flags) == MODE_NONZERO) {
if (tile.tile.offset != 0) {
CmdFill cmd_fill = CmdFill(tile.tile.offset, tile.backdrop);
Cmd_Fill_write(cmd_alloc, cmd_ref, cmd_fill);
cmd_ref.offset += 4 + CmdFill_size;
} else {
Cmd_Solid_write(cmd_alloc, cmd_ref);
cmd_ref.offset += 4;
}
} else {
CmdStroke cmd_stroke = CmdStroke(tile.tile.offset, 0.5 * fill_img.linewidth);
Cmd_Stroke_write(cmd_alloc, cmd_ref, cmd_stroke);
cmd_ref.offset += 4 + CmdStroke_size;
}
write_fill(cmd_alloc, cmd_ref, tag.flags, tile, fill_img.linewidth);
Cmd_Image_write(cmd_alloc, cmd_ref, CmdImage(fill_img.index, fill_img.offset));
cmd_ref.offset += 4 + CmdImage_size;
break;
@@ -373,27 +370,14 @@ void main() {
if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
break;
}
if (fill_mode_from_flags(tag.flags) == MODE_NONZERO) {
if (tile.tile.offset != 0) {
CmdFill cmd_fill = CmdFill(tile.tile.offset, tile.backdrop);
Cmd_Fill_write(cmd_alloc, cmd_ref, cmd_fill);
cmd_ref.offset += 4 + CmdFill_size;
} else {
// TODO: here is where a bunch of optimization magic should happen
float alpha = tile.backdrop == 0 ? 0.0 : 1.0;
Cmd_Alpha_write(cmd_alloc, cmd_ref, CmdAlpha(alpha));
cmd_ref.offset += 4 + CmdAlpha_size;
}
} else {
CmdStroke cmd_stroke = CmdStroke(tile.tile.offset, 0.5 * begin_clip.linewidth);
Cmd_Stroke_write(cmd_alloc, cmd_ref, cmd_stroke);
cmd_ref.offset += 4 + CmdStroke_size;
}
write_fill(cmd_alloc, cmd_ref, tag.flags, tile, begin_clip.linewidth);
Cmd_BeginClip_write(cmd_alloc, cmd_ref);
cmd_ref.offset += 4;
if (clip_depth < 32) {
clip_one_mask &= ~(1 << clip_depth);
}
begin_slot++;
num_begin_slots = max(num_begin_slots, begin_slot);
}
clip_depth++;
break;
@@ -405,6 +389,7 @@ void main() {
}
Cmd_Solid_write(cmd_alloc, cmd_ref);
cmd_ref.offset += 4;
begin_slot--;
Cmd_EndClip_write(cmd_alloc, cmd_ref);
cmd_ref.offset += 4;
}
@@ -432,5 +417,13 @@ void main() {
}
if (bin_tile_x + tile_x < conf.width_in_tiles && bin_tile_y + tile_y < conf.height_in_tiles) {
Cmd_End_write(cmd_alloc, cmd_ref);
if (num_begin_slots > 0) {
// Write scratch allocation: one state per BeginClip per rasterizer chunk.
uint scratch_size = num_begin_slots * TILE_WIDTH_PX * TILE_HEIGHT_PX * CLIP_STATE_SIZE * 4;
MallocResult scratch = malloc(scratch_size);
// Ignore scratch.failed; we don't use the allocation and kernel4
// checks for memory overflow before using it.
alloc_write(scratch_alloc, scratch_alloc.offset, scratch.alloc);
}
}
}