gpu: [compute] add CPU fallback

This change adds a CPU fallback for devices that don't support the old
renderer nor have GPU support for compute programs.

Most of the hard work is implemented in the gioui.org/cpu module. It
uses the SwiftShader project with light modification to output
statically compiled CPU .o files for each compute program.

The CPU fallback only covers Linux and Android on arm, arm64, amd64
architectures. There is no fundamental reason support can't be extended
to other platforms:

- macOS and iOS are probably easy, but it's likely that virtually every
  device has GPU support for compute shaders.
- Windows needs a Cgo-less port, or a build constraint to require a C
  compiler (Gio core doesn't).
- FreeBSD and OpenBSD are probably also easy to do because they're so
  similar to Linux.
- The 386 binaries didn't work properly in my tests, so fixes to
  SwiftShader is probably needed. However, I expect virtually every
  Intel device can run amd64 binaries.

Updates gio#49
Fixes gio#228

Signed-off-by: Elias Naur <mail@eliasnaur.com>
This commit is contained in:
Elias Naur
2021-02-04 18:19:51 +01:00
parent f14c151883
commit ea38195e2e
5 changed files with 424 additions and 108 deletions
+1 -1
View File
@@ -8,4 +8,4 @@ require (
golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c
)
require golang.org/x/text v0.3.6 // indirect
require gioui.org/cpu v0.0.0-20210727122813-41509bcd3462
+2
View File
@@ -1,6 +1,8 @@
cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
dmitri.shuralyov.com/gpu/mtl v0.0.0-20201218220906-28db891af037/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU=
gioui.org/cpu v0.0.0-20210727122813-41509bcd3462 h1:JZyB+d8tPExZHNZwMiGKeeAVd0mkFTc3Zsmegdn178M=
gioui.org/cpu v0.0.0-20210727122813-41509bcd3462/go.mod h1:DkhBDuHokSMOUxX5LZQ7IcxyJJzs3OON8Z5ojaXUXxo=
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo=
github.com/Knetic/govaluate v3.0.1-0.20171022003610-9aa49832a739+incompatible/go.mod h1:r7JcOSlj0wfOMncg0iLm8Leh48TZaKVeNIfJntJ2wa0=
+288 -102
View File
@@ -14,6 +14,7 @@ import (
"io/ioutil"
"math"
"math/bits"
"runtime"
"sort"
"time"
"unsafe"
@@ -28,6 +29,9 @@ import (
"gioui.org/layout"
"gioui.org/op"
"gioui.org/op/clip"
"gioui.org/cpu"
"gioui.org/cpu/piet"
)
type compute struct {
@@ -40,16 +44,16 @@ type compute struct {
maxTextureDim int
programs struct {
elements driver.Program
tileAlloc driver.Program
pathCoarse driver.Program
backdrop driver.Program
binning driver.Program
coarse driver.Program
kernel4 driver.Program
elements computeProgram
tileAlloc computeProgram
pathCoarse computeProgram
backdrop computeProgram
binning computeProgram
coarse computeProgram
kernel4 computeProgram
}
buffers struct {
config driver.Buffer
config sizedBuffer
scene sizedBuffer
state sizedBuffer
memory sizedBuffer
@@ -66,6 +70,8 @@ type compute struct {
layerVertices []layerVertex
layerAtlases []*layerAtlas
packer packer
descriptors *piet.Kernel4DescriptorSetLayout
}
// images contains ImageOp images packed into a texture atlas.
images struct {
@@ -94,6 +100,12 @@ type compute struct {
uniforms *materialUniforms
uniBuf driver.Buffer
// CPU fields
cpuTex cpu.ImageDescriptor
// regions track new materials in tex, so they can be transferred to cpuTex.
regions []image.Rectangle
scratch []byte
}
timers struct {
profile string
@@ -103,6 +115,10 @@ type compute struct {
blit *timer
}
// CPU fallback fields.
useCPU bool
dispatcher *dispatcher
// The following fields hold scratch space to avoid garbage.
zeroSlice []byte
memHeader *memoryHeader
@@ -124,10 +140,11 @@ type layerPlace struct {
type layerAtlas struct {
// image is the layer atlas texture. Note that it is in RGBA format,
// but contains data in sRGB. See blitLayers for more detail.
image driver.Texture
fbo driver.Framebuffer
size image.Point
layers int
image driver.Texture
fbo driver.Framebuffer
cpuImage cpu.ImageDescriptor
size image.Point
layers int
}
type copyUniforms struct {
@@ -274,9 +291,22 @@ type encodeState struct {
clip f32.Rectangle
}
// sizedBuffer holds a GPU buffer, or its equivalent CPU memory.
type sizedBuffer struct {
size int
buffer driver.Buffer
// cpuBuf is initialized when useCPU is true.
cpuBuf cpu.BufferDescriptor
}
// computeProgram holds a compute program, or its equivalent CPU implementation.
type computeProgram struct {
prog driver.Program
// CPU fields.
progInfo *cpu.ProgramInfo
descriptors unsafe.Pointer
buffers []*cpu.BufferDescriptor
}
// config matches Config in setup.h
@@ -332,7 +362,8 @@ const (
)
func newCompute(ctx driver.Device) (*compute, error) {
maxDim := ctx.Caps().MaxTextureSize
caps := ctx.Caps()
maxDim := caps.MaxTextureSize
// Large atlas textures cause artifacts due to precision loss in
// shaders.
if cap := 8192; maxDim > cap {
@@ -344,6 +375,35 @@ func newCompute(ctx driver.Device) (*compute, error) {
conf: new(config),
memHeader: new(memoryHeader),
}
shaders := []struct {
prog *computeProgram
src driver.ShaderSources
info *cpu.ProgramInfo
hash string
}{
{&g.programs.elements, shader_elements_comp, piet.ElementsProgramInfo, piet.ElementsHash},
{&g.programs.tileAlloc, shader_tile_alloc_comp, piet.Tile_allocProgramInfo, piet.Tile_allocHash},
{&g.programs.pathCoarse, shader_path_coarse_comp, piet.Path_coarseProgramInfo, piet.Path_coarseHash},
{&g.programs.backdrop, shader_backdrop_comp, piet.BackdropProgramInfo, piet.BackdropHash},
{&g.programs.binning, shader_binning_comp, piet.BinningProgramInfo, piet.BinningHash},
{&g.programs.coarse, shader_coarse_comp, piet.CoarseProgramInfo, piet.CoarseHash},
{&g.programs.kernel4, shader_kernel4_comp, piet.Kernel4ProgramInfo, piet.Kernel4Hash},
}
if !caps.Features.Has(driver.FeatureCompute) {
g.useCPU = supportsCPUCompute
for _, s := range shaders {
if s.src.Hash != s.hash {
g.useCPU = false
break
}
}
if !g.useCPU {
return nil, errors.New("gpu: missing support for compute programs")
}
}
if g.useCPU {
g.dispatcher = newDispatcher(runtime.NumCPU())
}
// Large enough for reasonable fill sizes, yet still spannable by the compute programs.
g.output.packer.maxDim = 4096
@@ -397,32 +457,55 @@ func newCompute(ctx driver.Device) (*compute, error) {
g.materials.uniBuf = buf
g.materials.prog.SetVertexUniforms(buf)
buf, err = ctx.NewBuffer(driver.BufferBindingShaderStorage, int(unsafe.Sizeof(config{})))
if err != nil {
g.Release()
return nil, err
}
g.buffers.config = buf
shaders := []struct {
prog *driver.Program
src driver.ShaderSources
}{
{&g.programs.elements, shader_elements_comp},
{&g.programs.tileAlloc, shader_tile_alloc_comp},
{&g.programs.pathCoarse, shader_path_coarse_comp},
{&g.programs.backdrop, shader_backdrop_comp},
{&g.programs.binning, shader_binning_comp},
{&g.programs.coarse, shader_coarse_comp},
{&g.programs.kernel4, shader_kernel4_comp},
}
for _, shader := range shaders {
p, err := ctx.NewComputeProgram(shader.src)
if err != nil {
g.Release()
return nil, err
if !g.useCPU {
p, err := ctx.NewComputeProgram(shader.src)
if err != nil {
g.Release()
return nil, err
}
shader.prog.prog = p
} else {
shader.prog.progInfo = shader.info
}
}
if g.useCPU {
{
desc := new(piet.ElementsDescriptorSetLayout)
g.programs.elements.descriptors = unsafe.Pointer(desc)
g.programs.elements.buffers = []*cpu.BufferDescriptor{desc.Binding0(), desc.Binding1(), desc.Binding2(), desc.Binding3()}
}
{
desc := new(piet.Tile_allocDescriptorSetLayout)
g.programs.tileAlloc.descriptors = unsafe.Pointer(desc)
g.programs.tileAlloc.buffers = []*cpu.BufferDescriptor{desc.Binding0(), desc.Binding1()}
}
{
desc := new(piet.Path_coarseDescriptorSetLayout)
g.programs.pathCoarse.descriptors = unsafe.Pointer(desc)
g.programs.pathCoarse.buffers = []*cpu.BufferDescriptor{desc.Binding0(), desc.Binding1()}
}
{
desc := new(piet.BackdropDescriptorSetLayout)
g.programs.backdrop.descriptors = unsafe.Pointer(desc)
g.programs.backdrop.buffers = []*cpu.BufferDescriptor{desc.Binding0(), desc.Binding1()}
}
{
desc := new(piet.BinningDescriptorSetLayout)
g.programs.binning.descriptors = unsafe.Pointer(desc)
g.programs.binning.buffers = []*cpu.BufferDescriptor{desc.Binding0(), desc.Binding1()}
}
{
desc := new(piet.CoarseDescriptorSetLayout)
g.programs.coarse.descriptors = unsafe.Pointer(desc)
g.programs.coarse.buffers = []*cpu.BufferDescriptor{desc.Binding0(), desc.Binding1()}
}
{
desc := new(piet.Kernel4DescriptorSetLayout)
g.programs.kernel4.descriptors = unsafe.Pointer(desc)
g.programs.kernel4.buffers = []*cpu.BufferDescriptor{desc.Binding0(), desc.Binding1()}
g.output.descriptors = desc
}
*shader.prog = p
}
return g, nil
}
@@ -553,7 +636,7 @@ func (g *compute) compactLayers() error {
continue
}
outputSize := g.output.packer.sizes[0]
atlas.ensureSize(g.ctx, outputSize)
atlas.ensureSize(g.useCPU, g.ctx, outputSize)
for i, l := range layers[:end] {
if l.newPlace == l.place {
continue
@@ -629,10 +712,10 @@ func (g *compute) renderLayers(viewport image.Point) error {
Y: (outputSize.Y + tileHeightPx - 1) / tileHeightPx,
}
w, h := tileDims.X*tileWidthPx, tileDims.Y*tileHeightPx
if err := atlas.ensureSize(g.ctx, image.Pt(w, h)); err != nil {
if err := atlas.ensureSize(g.useCPU, g.ctx, image.Pt(w, h)); err != nil {
return err
}
if err := g.render(atlas.image, tileDims); err != nil {
if err := g.render(atlas.image, atlas.cpuImage, tileDims, atlas.size.X*4); err != nil {
return err
}
}
@@ -694,7 +777,7 @@ func (g *compute) blitLayers(viewport image.Point) {
g.output.uniforms.uvScale = [2]float32{1 / float32(atlas.size.X), 1 / float32(atlas.size.Y)}
g.output.uniBuf.Upload(byteslice.Struct(g.output.uniforms))
vertexData := byteslice.Slice(g.output.layerVertices)
g.output.buffer.ensureCapacity(g.ctx, driver.BufferBindingVertices, len(vertexData))
g.output.buffer.ensureCapacity(false, g.ctx, driver.BufferBindingVertices, len(vertexData))
g.output.buffer.buffer.Upload(vertexData)
g.ctx.BindVertexBuffer(g.output.buffer.buffer, int(unsafe.Sizeof(g.output.layerVertices[0])), 0)
g.ctx.BindTexture(0, atlas.image)
@@ -705,6 +788,7 @@ func (g *compute) blitLayers(viewport image.Point) {
func (g *compute) renderMaterials() error {
m := &g.materials
m.quads = m.quads[:0]
m.regions = m.regions[:0]
resize := false
reclaimed := false
restart:
@@ -752,6 +836,10 @@ restart:
}
m.offsets[op.key] = offset
g.enc.setFillImageOffset(op.sceneIdx, offset.Sub(op.off))
m.regions = append(m.regions, image.Rectangle{
Min: place.Pos,
Max: place.Pos.Add(size),
})
}
break
}
@@ -768,6 +856,7 @@ restart:
m.tex.Release()
m.tex = nil
}
m.cpuTex.Free()
handle, err := g.ctx.NewTexture(driver.TextureFormatRGBA8, texSize, texSize,
driver.FilterNearest, driver.FilterNearest,
driver.BufferBindingShaderStorage|driver.BufferBindingFramebuffer)
@@ -781,6 +870,9 @@ restart:
}
m.tex = handle
m.fbo = fbo
if g.useCPU {
m.cpuTex = cpu.NewImageRGBA(texSize, texSize)
}
}
// Transform to clip space: [-1, -1] - [1, 1].
g.materials.uniforms.scale = [2]float32{2 / float32(texSize), 2 / float32(texSize)}
@@ -788,7 +880,7 @@ restart:
g.materials.uniBuf.Upload(byteslice.Struct(g.materials.uniforms))
vertexData := byteslice.Slice(m.quads)
n := pow2Ceil(len(vertexData))
m.buffer.ensureCapacity(g.ctx, driver.BufferBindingVertices, n)
m.buffer.ensureCapacity(false, g.ctx, driver.BufferBindingVertices, n)
m.buffer.buffer.Upload(vertexData)
g.ctx.BindTexture(0, g.images.tex)
g.ctx.BindFramebuffer(m.fbo)
@@ -962,7 +1054,7 @@ func (enc *encoder) encodePath(verts []byte) {
}
}
func (g *compute) render(dst driver.Texture, tileDims image.Point) error {
func (g *compute) render(dst driver.Texture, cpuDst cpu.ImageDescriptor, tileDims image.Point, stride int) error {
const (
// wgSize is the largest and most common workgroup size.
wgSize = 128
@@ -985,16 +1077,11 @@ func (g *compute) render(dst driver.Texture, tileDims image.Point) error {
if s := len(scene); s > g.buffers.scene.size {
realloced = true
paddedCap := s * 11 / 10
if err := g.buffers.scene.ensureCapacity(g.ctx, driver.BufferBindingShaderStorage, paddedCap); err != nil {
if err := g.buffers.scene.ensureCapacity(g.useCPU, g.ctx, driver.BufferBindingShaderStorage, paddedCap); err != nil {
return err
}
}
g.buffers.scene.buffer.Upload(scene)
g.ctx.BindImageTexture(kernel4OutputUnit, dst, driver.AccessWrite, driver.TextureFormatRGBA8)
if t := g.materials.tex; t != nil {
g.ctx.BindImageTexture(kernel4AtlasUnit, t, driver.AccessRead, driver.TextureFormatRGBA8)
}
g.buffers.scene.upload(scene)
// alloc is the number of allocated bytes for static buffers.
var alloc uint32
@@ -1027,12 +1114,14 @@ func (g *compute) render(dst driver.Texture, tileDims image.Point) error {
if clearSize > g.buffers.state.size {
realloced = true
paddedCap := clearSize * 11 / 10
if err := g.buffers.state.ensureCapacity(g.ctx, driver.BufferBindingShaderStorage, paddedCap); err != nil {
if err := g.buffers.state.ensureCapacity(g.useCPU, g.ctx, driver.BufferBindingShaderStorage, paddedCap); err != nil {
return err
}
}
g.buffers.config.Upload(byteslice.Struct(g.conf))
confData := byteslice.Struct(g.conf)
g.buffers.config.ensureCapacity(g.useCPU, g.ctx, driver.BufferBindingShaderStorage, len(confData))
g.buffers.config.upload(confData)
minSize := int(unsafe.Sizeof(memoryHeader{})) + int(alloc)
if minSize > g.buffers.memory.size {
@@ -1040,45 +1129,53 @@ func (g *compute) render(dst driver.Texture, tileDims image.Point) error {
// Add space for dynamic GPU allocations.
const sizeBump = 4 * 1024 * 1024
minSize += sizeBump
if err := g.buffers.memory.ensureCapacity(g.ctx, driver.BufferBindingShaderStorage, minSize); err != nil {
if err := g.buffers.memory.ensureCapacity(g.useCPU, g.ctx, driver.BufferBindingShaderStorage, minSize); err != nil {
return err
}
}
if !g.useCPU {
g.ctx.BindImageTexture(kernel4OutputUnit, dst, driver.AccessWrite, driver.TextureFormatRGBA8)
if t := g.materials.tex; t != nil {
g.ctx.BindImageTexture(kernel4AtlasUnit, t, driver.AccessRead, driver.TextureFormatRGBA8)
}
} else {
*g.output.descriptors.Binding2() = cpuDst
*g.output.descriptors.Binding3() = g.materials.cpuTex
}
for {
*g.memHeader = memoryHeader{
mem_offset: alloc,
}
g.buffers.memory.buffer.Upload(byteslice.Struct(g.memHeader))
g.buffers.state.buffer.Upload(g.zeros(clearSize))
g.buffers.memory.upload(byteslice.Struct(g.memHeader))
g.buffers.state.upload(g.zeros(clearSize))
if realloced {
realloced = false
g.bindBuffers()
}
g.ctx.MemoryBarrier()
g.ctx.BindProgram(g.programs.elements)
g.ctx.DispatchCompute(numPartitions, 1, 1)
g.ctx.MemoryBarrier()
g.ctx.BindProgram(g.programs.tileAlloc)
g.ctx.DispatchCompute((enc.npath+wgSize-1)/wgSize, 1, 1)
g.ctx.MemoryBarrier()
g.ctx.BindProgram(g.programs.pathCoarse)
g.ctx.DispatchCompute((enc.npathseg+31)/32, 1, 1)
g.ctx.MemoryBarrier()
g.ctx.BindProgram(g.programs.backdrop)
g.ctx.DispatchCompute((enc.npath+wgSize-1)/wgSize, 1, 1)
g.memoryBarrier()
g.dispatch(g.programs.elements, numPartitions, 1, 1)
g.memoryBarrier()
g.dispatch(g.programs.tileAlloc, (enc.npath+wgSize-1)/wgSize, 1, 1)
g.memoryBarrier()
g.dispatch(g.programs.pathCoarse, (enc.npathseg+31)/32, 1, 1)
g.memoryBarrier()
g.dispatch(g.programs.backdrop, (enc.npath+wgSize-1)/wgSize, 1, 1)
// No barrier needed between backdrop and binning.
g.ctx.BindProgram(g.programs.binning)
g.ctx.DispatchCompute((enc.npath+wgSize-1)/wgSize, 1, 1)
g.ctx.MemoryBarrier()
g.ctx.BindProgram(g.programs.coarse)
g.ctx.DispatchCompute(widthInBins, heightInBins, 1)
g.ctx.MemoryBarrier()
g.ctx.BindProgram(g.programs.kernel4)
g.ctx.DispatchCompute(tileDims.X, tileDims.Y, 1)
g.ctx.MemoryBarrier()
g.dispatch(g.programs.binning, (enc.npath+wgSize-1)/wgSize, 1, 1)
g.memoryBarrier()
g.dispatch(g.programs.coarse, widthInBins, heightInBins, 1)
g.memoryBarrier()
g.downloadMaterials()
g.dispatch(g.programs.kernel4, tileDims.X, tileDims.Y, 1)
g.memoryBarrier()
if g.useCPU {
g.dispatcher.Sync()
}
if err := g.buffers.memory.buffer.Download(byteslice.Struct(g.memHeader)); err != nil {
if err := g.buffers.memory.download(byteslice.Struct(g.memHeader)); err != nil {
if err == driver.ErrContentLost {
continue
}
@@ -1086,12 +1183,16 @@ func (g *compute) render(dst driver.Texture, tileDims image.Point) error {
}
switch errCode := g.memHeader.mem_error; errCode {
case memNoError:
if g.useCPU {
w, h := tileDims.X*tileWidthPx, tileDims.Y*tileHeightPx
dst.Upload(image.Pt(0, 0), image.Pt(w, h), cpuDst.Data(), stride)
}
return nil
case memMallocFailed:
// Resize memory and try again.
realloced = true
sz := g.buffers.memory.size * 15 / 10
if err := g.buffers.memory.ensureCapacity(g.ctx, driver.BufferBindingShaderStorage, sz); err != nil {
if err := g.buffers.memory.ensureCapacity(g.useCPU, g.ctx, driver.BufferBindingShaderStorage, sz); err != nil {
return err
}
continue
@@ -1101,6 +1202,48 @@ func (g *compute) render(dst driver.Texture, tileDims image.Point) error {
}
}
func (g *compute) downloadMaterials() {
m := &g.materials
if !g.useCPU || len(m.regions) == 0 {
return
}
copyFBO := m.fbo
data := m.cpuTex.Data()
for _, r := range m.regions {
dims := r.Size()
if n := dims.X * dims.Y * 4; n > len(m.scratch) {
m.scratch = make([]byte, n)
}
copyFBO.ReadPixels(r, m.scratch)
stride := m.packer.maxDim * 4
col := r.Min.X * 4
row := stride * r.Min.Y
off := col + row
w := dims.X * 4
for y := 0; y < dims.Y; y++ {
copy(data[off:off+w], m.scratch[y*dims.X*4:])
off += stride
}
}
}
func (g *compute) memoryBarrier() {
if !g.useCPU {
g.ctx.MemoryBarrier()
} else {
g.dispatcher.Barrier()
}
}
func (g *compute) dispatch(p computeProgram, x, y, z int) {
if !g.useCPU {
g.ctx.BindProgram(p.prog)
g.ctx.DispatchCompute(x, y, z)
} else {
g.dispatcher.Dispatch(p.progInfo, p.descriptors, x, y, z)
}
}
// zeros returns a byte slice with size bytes of zeros.
func (g *compute) zeros(size int) []byte {
if cap(g.zeroSlice) < size {
@@ -1109,7 +1252,7 @@ func (g *compute) zeros(size int) []byte {
return g.zeroSlice[:size]
}
func (a *layerAtlas) ensureSize(ctx driver.Device, size image.Point) error {
func (a *layerAtlas) ensureSize(useCPU bool, ctx driver.Device, size image.Point) error {
if a.size.X >= size.X && a.size.Y >= size.Y {
return nil
}
@@ -1122,6 +1265,8 @@ func (a *layerAtlas) ensureSize(ctx driver.Device, size image.Point) error {
a.image.Release()
a.image = nil
}
a.cpuImage.Free()
img, err := ctx.NewTexture(driver.TextureFormatRGBA8, size.X, size.Y,
driver.FilterNearest,
driver.FilterNearest,
@@ -1136,29 +1281,35 @@ func (a *layerAtlas) ensureSize(ctx driver.Device, size image.Point) error {
}
a.fbo = fbo
a.image = img
if useCPU {
a.cpuImage = cpu.NewImageRGBA(size.X, size.Y)
}
a.size = size
return nil
}
func (g *compute) Release() {
if g.useCPU {
g.dispatcher.Stop()
}
type resource interface {
Release()
}
res := []resource{
g.programs.elements,
g.programs.tileAlloc,
g.programs.pathCoarse,
g.programs.backdrop,
g.programs.binning,
g.programs.coarse,
g.programs.kernel4,
&g.programs.elements,
&g.programs.tileAlloc,
&g.programs.pathCoarse,
&g.programs.backdrop,
&g.programs.binning,
&g.programs.coarse,
&g.programs.kernel4,
g.output.blitProg,
&g.output.buffer,
g.output.uniBuf,
&g.buffers.scene,
&g.buffers.state,
&g.buffers.memory,
g.buffers.config,
&g.buffers.config,
g.images.tex,
g.materials.layout,
g.materials.prog,
@@ -1168,6 +1319,7 @@ func (g *compute) Release() {
g.materials.uniBuf,
g.timers.t,
}
g.materials.cpuTex.Free()
for _, r := range res {
if r != nil {
r.Release()
@@ -1180,48 +1332,82 @@ func (g *compute) Release() {
if a.image != nil {
a.image.Release()
}
a.cpuImage.Free()
}
*g = compute{}
}
func (g *compute) bindBuffers() {
bindStorageBuffers(g.programs.elements, g.buffers.memory.buffer, g.buffers.config, g.buffers.scene.buffer, g.buffers.state.buffer)
bindStorageBuffers(g.programs.tileAlloc, g.buffers.memory.buffer, g.buffers.config)
bindStorageBuffers(g.programs.pathCoarse, g.buffers.memory.buffer, g.buffers.config)
bindStorageBuffers(g.programs.backdrop, g.buffers.memory.buffer, g.buffers.config)
bindStorageBuffers(g.programs.binning, g.buffers.memory.buffer, g.buffers.config)
bindStorageBuffers(g.programs.coarse, g.buffers.memory.buffer, g.buffers.config)
bindStorageBuffers(g.programs.kernel4, g.buffers.memory.buffer, g.buffers.config)
g.bindStorageBuffers(g.programs.elements, g.buffers.memory, g.buffers.config, g.buffers.scene, g.buffers.state)
g.bindStorageBuffers(g.programs.tileAlloc, g.buffers.memory, g.buffers.config)
g.bindStorageBuffers(g.programs.pathCoarse, g.buffers.memory, g.buffers.config)
g.bindStorageBuffers(g.programs.backdrop, g.buffers.memory, g.buffers.config)
g.bindStorageBuffers(g.programs.binning, g.buffers.memory, g.buffers.config)
g.bindStorageBuffers(g.programs.coarse, g.buffers.memory, g.buffers.config)
g.bindStorageBuffers(g.programs.kernel4, g.buffers.memory, g.buffers.config)
}
func (p *computeProgram) Release() {
if p.prog != nil {
p.prog.Release()
}
*p = computeProgram{}
}
func (b *sizedBuffer) Release() {
if b.buffer == nil {
return
}
b.buffer.Release()
b.cpuBuf.Free()
*b = sizedBuffer{}
}
func (b *sizedBuffer) ensureCapacity(ctx driver.Device, binding driver.BufferBinding, size int) error {
func (b *sizedBuffer) ensureCapacity(useCPU bool, ctx driver.Device, binding driver.BufferBinding, size int) error {
if b.size >= size {
return nil
}
if b.buffer != nil {
b.Release()
}
buf, err := ctx.NewBuffer(binding, size)
if err != nil {
return err
b.cpuBuf.Free()
if !useCPU {
buf, err := ctx.NewBuffer(binding, size)
if err != nil {
return err
}
b.buffer = buf
} else {
b.cpuBuf = cpu.NewBuffer(size)
}
b.buffer = buf
b.size = size
return nil
}
func bindStorageBuffers(prog driver.Program, buffers ...driver.Buffer) {
func (b *sizedBuffer) download(data []byte) error {
if b.buffer != nil {
return b.buffer.Download(data)
} else {
copy(data, b.cpuBuf.Data())
return nil
}
}
func (b *sizedBuffer) upload(data []byte) {
if b.buffer != nil {
b.buffer.Upload(data)
} else {
copy(b.cpuBuf.Data(), data)
}
}
func (g *compute) bindStorageBuffers(prog computeProgram, buffers ...sizedBuffer) {
for i, buf := range buffers {
prog.SetStorageBuffer(i, buf)
if !g.useCPU {
prog.prog.SetStorageBuffer(i, buf.buffer)
} else {
*prog.buffers[i] = buf.cpuBuf
}
}
}
+132
View File
@@ -0,0 +1,132 @@
// SPDX-License-Identifier: Unlicense OR MIT
package gpu
import (
"runtime"
"unsafe"
"gioui.org/cpu"
)
const supportsCPUCompute = runtime.GOARCH == "amd64" || runtime.GOARCH == "arm64" || runtime.GOARCH == "arm"
// This file contains code specific to running compute shaders on the CPU.
// dispatcher dispatches CPU compute programs across multiple goroutines.
type dispatcher struct {
// done is notified when a worker completes its work slice.
done chan struct{}
// work receives work slice indices. It is closed when the dispatcher is released.
work chan work
// dispatch receives compute jobs, which is then split among workers.
dispatch chan dispatch
// sync receives notification when a Sync completes.
sync chan struct{}
}
type work struct {
ctx *cpu.DispatchContext
index int
}
type dispatch struct {
_type jobType
program *cpu.ProgramInfo
descSet unsafe.Pointer
x, y, z int
}
type jobType uint8
const (
jobDispatch jobType = iota
jobBarrier
jobSync
)
func newDispatcher(workers int) *dispatcher {
d := &dispatcher{
work: make(chan work, workers),
done: make(chan struct{}, workers),
// Leave some room to avoid blocking calls to Dispatch.
dispatch: make(chan dispatch, 20),
sync: make(chan struct{}),
}
for i := 0; i < workers; i++ {
go d.worker()
}
go d.dispatcher()
return d
}
func (d *dispatcher) dispatcher() {
defer close(d.work)
var free []*cpu.DispatchContext
defer func() {
for _, ctx := range free {
ctx.Free()
}
}()
var used []*cpu.DispatchContext
for job := range d.dispatch {
switch job._type {
case jobDispatch:
if len(free) == 0 {
free = append(free, cpu.NewDispatchContext())
}
ctx := free[len(free)-1]
free = free[:len(free)-1]
used = append(used, ctx)
ctx.Prepare(cap(d.work), job.program, job.descSet, job.x, job.y, job.z)
for i := 0; i < cap(d.work); i++ {
d.work <- work{
ctx: ctx,
index: i,
}
}
case jobBarrier:
// Wait for all outstanding dispatches to complete.
for i := 0; i < len(used)*cap(d.work); i++ {
<-d.done
}
free = append(free, used...)
used = used[:0]
case jobSync:
d.sync <- struct{}{}
}
}
}
func (d *dispatcher) worker() {
thread := cpu.NewThreadContext()
defer thread.Free()
for w := range d.work {
w.ctx.Dispatch(w.index, thread)
d.done <- struct{}{}
}
}
func (d *dispatcher) Barrier() {
d.dispatch <- dispatch{_type: jobBarrier}
}
func (d *dispatcher) Sync() {
d.dispatch <- dispatch{_type: jobSync}
<-d.sync
}
func (d *dispatcher) Dispatch(program *cpu.ProgramInfo, descSet unsafe.Pointer, x, y, z int) {
d.dispatch <- dispatch{
_type: jobDispatch,
program: program,
descSet: descSet,
x: x,
y: y,
z: z,
}
}
func (d *dispatcher) Stop() {
close(d.dispatch)
}
+1 -5
View File
@@ -9,7 +9,6 @@ package gpu
import (
"encoding/binary"
"errors"
"fmt"
"image"
"image/color"
@@ -364,11 +363,8 @@ func New(api API) (GPU, error) {
switch {
case !forceCompute && feats.Has(driver.FeatureFloatRenderTargets):
return newGPU(d)
case feats.Has(driver.FeatureCompute):
return newCompute(d)
default:
return nil, errors.New("gpu: no support for float render targets nor compute")
}
return newCompute(d)
}
func newGPU(ctx driver.Device) (*gpu, error) {