diff --git a/go.mod b/go.mod index 83cb118a..bff4905f 100644 --- a/go.mod +++ b/go.mod @@ -8,4 +8,4 @@ require ( golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c ) -require golang.org/x/text v0.3.6 // indirect +require gioui.org/cpu v0.0.0-20210727122813-41509bcd3462 diff --git a/go.sum b/go.sum index db2ff02a..91727ead 100644 --- a/go.sum +++ b/go.sum @@ -1,6 +1,8 @@ cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= dmitri.shuralyov.com/gpu/mtl v0.0.0-20201218220906-28db891af037/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= +gioui.org/cpu v0.0.0-20210727122813-41509bcd3462 h1:JZyB+d8tPExZHNZwMiGKeeAVd0mkFTc3Zsmegdn178M= +gioui.org/cpu v0.0.0-20210727122813-41509bcd3462/go.mod h1:DkhBDuHokSMOUxX5LZQ7IcxyJJzs3OON8Z5ojaXUXxo= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= github.com/Knetic/govaluate v3.0.1-0.20171022003610-9aa49832a739+incompatible/go.mod h1:r7JcOSlj0wfOMncg0iLm8Leh48TZaKVeNIfJntJ2wa0= diff --git a/gpu/compute.go b/gpu/compute.go index a80c217c..e328f0f3 100644 --- a/gpu/compute.go +++ b/gpu/compute.go @@ -14,6 +14,7 @@ import ( "io/ioutil" "math" "math/bits" + "runtime" "sort" "time" "unsafe" @@ -28,6 +29,9 @@ import ( "gioui.org/layout" "gioui.org/op" "gioui.org/op/clip" + + "gioui.org/cpu" + "gioui.org/cpu/piet" ) type compute struct { @@ -40,16 +44,16 @@ type compute struct { maxTextureDim int programs struct { - elements driver.Program - tileAlloc driver.Program - pathCoarse driver.Program - backdrop driver.Program - binning driver.Program - coarse driver.Program - kernel4 driver.Program + elements computeProgram + tileAlloc computeProgram + pathCoarse computeProgram + backdrop computeProgram + binning computeProgram + coarse computeProgram + kernel4 computeProgram } buffers struct { - config driver.Buffer + config sizedBuffer scene sizedBuffer state sizedBuffer memory sizedBuffer @@ -66,6 +70,8 @@ type compute struct { layerVertices []layerVertex layerAtlases []*layerAtlas packer packer + + descriptors *piet.Kernel4DescriptorSetLayout } // images contains ImageOp images packed into a texture atlas. images struct { @@ -94,6 +100,12 @@ type compute struct { uniforms *materialUniforms uniBuf driver.Buffer + + // CPU fields + cpuTex cpu.ImageDescriptor + // regions track new materials in tex, so they can be transferred to cpuTex. + regions []image.Rectangle + scratch []byte } timers struct { profile string @@ -103,6 +115,10 @@ type compute struct { blit *timer } + // CPU fallback fields. + useCPU bool + dispatcher *dispatcher + // The following fields hold scratch space to avoid garbage. zeroSlice []byte memHeader *memoryHeader @@ -124,10 +140,11 @@ type layerPlace struct { type layerAtlas struct { // image is the layer atlas texture. Note that it is in RGBA format, // but contains data in sRGB. See blitLayers for more detail. - image driver.Texture - fbo driver.Framebuffer - size image.Point - layers int + image driver.Texture + fbo driver.Framebuffer + cpuImage cpu.ImageDescriptor + size image.Point + layers int } type copyUniforms struct { @@ -274,9 +291,22 @@ type encodeState struct { clip f32.Rectangle } +// sizedBuffer holds a GPU buffer, or its equivalent CPU memory. type sizedBuffer struct { size int buffer driver.Buffer + // cpuBuf is initialized when useCPU is true. + cpuBuf cpu.BufferDescriptor +} + +// computeProgram holds a compute program, or its equivalent CPU implementation. +type computeProgram struct { + prog driver.Program + + // CPU fields. + progInfo *cpu.ProgramInfo + descriptors unsafe.Pointer + buffers []*cpu.BufferDescriptor } // config matches Config in setup.h @@ -332,7 +362,8 @@ const ( ) func newCompute(ctx driver.Device) (*compute, error) { - maxDim := ctx.Caps().MaxTextureSize + caps := ctx.Caps() + maxDim := caps.MaxTextureSize // Large atlas textures cause artifacts due to precision loss in // shaders. if cap := 8192; maxDim > cap { @@ -344,6 +375,35 @@ func newCompute(ctx driver.Device) (*compute, error) { conf: new(config), memHeader: new(memoryHeader), } + shaders := []struct { + prog *computeProgram + src driver.ShaderSources + info *cpu.ProgramInfo + hash string + }{ + {&g.programs.elements, shader_elements_comp, piet.ElementsProgramInfo, piet.ElementsHash}, + {&g.programs.tileAlloc, shader_tile_alloc_comp, piet.Tile_allocProgramInfo, piet.Tile_allocHash}, + {&g.programs.pathCoarse, shader_path_coarse_comp, piet.Path_coarseProgramInfo, piet.Path_coarseHash}, + {&g.programs.backdrop, shader_backdrop_comp, piet.BackdropProgramInfo, piet.BackdropHash}, + {&g.programs.binning, shader_binning_comp, piet.BinningProgramInfo, piet.BinningHash}, + {&g.programs.coarse, shader_coarse_comp, piet.CoarseProgramInfo, piet.CoarseHash}, + {&g.programs.kernel4, shader_kernel4_comp, piet.Kernel4ProgramInfo, piet.Kernel4Hash}, + } + if !caps.Features.Has(driver.FeatureCompute) { + g.useCPU = supportsCPUCompute + for _, s := range shaders { + if s.src.Hash != s.hash { + g.useCPU = false + break + } + } + if !g.useCPU { + return nil, errors.New("gpu: missing support for compute programs") + } + } + if g.useCPU { + g.dispatcher = newDispatcher(runtime.NumCPU()) + } // Large enough for reasonable fill sizes, yet still spannable by the compute programs. g.output.packer.maxDim = 4096 @@ -397,32 +457,55 @@ func newCompute(ctx driver.Device) (*compute, error) { g.materials.uniBuf = buf g.materials.prog.SetVertexUniforms(buf) - buf, err = ctx.NewBuffer(driver.BufferBindingShaderStorage, int(unsafe.Sizeof(config{}))) - if err != nil { - g.Release() - return nil, err - } - g.buffers.config = buf - - shaders := []struct { - prog *driver.Program - src driver.ShaderSources - }{ - {&g.programs.elements, shader_elements_comp}, - {&g.programs.tileAlloc, shader_tile_alloc_comp}, - {&g.programs.pathCoarse, shader_path_coarse_comp}, - {&g.programs.backdrop, shader_backdrop_comp}, - {&g.programs.binning, shader_binning_comp}, - {&g.programs.coarse, shader_coarse_comp}, - {&g.programs.kernel4, shader_kernel4_comp}, - } for _, shader := range shaders { - p, err := ctx.NewComputeProgram(shader.src) - if err != nil { - g.Release() - return nil, err + if !g.useCPU { + p, err := ctx.NewComputeProgram(shader.src) + if err != nil { + g.Release() + return nil, err + } + shader.prog.prog = p + } else { + shader.prog.progInfo = shader.info + } + } + if g.useCPU { + { + desc := new(piet.ElementsDescriptorSetLayout) + g.programs.elements.descriptors = unsafe.Pointer(desc) + g.programs.elements.buffers = []*cpu.BufferDescriptor{desc.Binding0(), desc.Binding1(), desc.Binding2(), desc.Binding3()} + } + { + desc := new(piet.Tile_allocDescriptorSetLayout) + g.programs.tileAlloc.descriptors = unsafe.Pointer(desc) + g.programs.tileAlloc.buffers = []*cpu.BufferDescriptor{desc.Binding0(), desc.Binding1()} + } + { + desc := new(piet.Path_coarseDescriptorSetLayout) + g.programs.pathCoarse.descriptors = unsafe.Pointer(desc) + g.programs.pathCoarse.buffers = []*cpu.BufferDescriptor{desc.Binding0(), desc.Binding1()} + } + { + desc := new(piet.BackdropDescriptorSetLayout) + g.programs.backdrop.descriptors = unsafe.Pointer(desc) + g.programs.backdrop.buffers = []*cpu.BufferDescriptor{desc.Binding0(), desc.Binding1()} + } + { + desc := new(piet.BinningDescriptorSetLayout) + g.programs.binning.descriptors = unsafe.Pointer(desc) + g.programs.binning.buffers = []*cpu.BufferDescriptor{desc.Binding0(), desc.Binding1()} + } + { + desc := new(piet.CoarseDescriptorSetLayout) + g.programs.coarse.descriptors = unsafe.Pointer(desc) + g.programs.coarse.buffers = []*cpu.BufferDescriptor{desc.Binding0(), desc.Binding1()} + } + { + desc := new(piet.Kernel4DescriptorSetLayout) + g.programs.kernel4.descriptors = unsafe.Pointer(desc) + g.programs.kernel4.buffers = []*cpu.BufferDescriptor{desc.Binding0(), desc.Binding1()} + g.output.descriptors = desc } - *shader.prog = p } return g, nil } @@ -553,7 +636,7 @@ func (g *compute) compactLayers() error { continue } outputSize := g.output.packer.sizes[0] - atlas.ensureSize(g.ctx, outputSize) + atlas.ensureSize(g.useCPU, g.ctx, outputSize) for i, l := range layers[:end] { if l.newPlace == l.place { continue @@ -629,10 +712,10 @@ func (g *compute) renderLayers(viewport image.Point) error { Y: (outputSize.Y + tileHeightPx - 1) / tileHeightPx, } w, h := tileDims.X*tileWidthPx, tileDims.Y*tileHeightPx - if err := atlas.ensureSize(g.ctx, image.Pt(w, h)); err != nil { + if err := atlas.ensureSize(g.useCPU, g.ctx, image.Pt(w, h)); err != nil { return err } - if err := g.render(atlas.image, tileDims); err != nil { + if err := g.render(atlas.image, atlas.cpuImage, tileDims, atlas.size.X*4); err != nil { return err } } @@ -694,7 +777,7 @@ func (g *compute) blitLayers(viewport image.Point) { g.output.uniforms.uvScale = [2]float32{1 / float32(atlas.size.X), 1 / float32(atlas.size.Y)} g.output.uniBuf.Upload(byteslice.Struct(g.output.uniforms)) vertexData := byteslice.Slice(g.output.layerVertices) - g.output.buffer.ensureCapacity(g.ctx, driver.BufferBindingVertices, len(vertexData)) + g.output.buffer.ensureCapacity(false, g.ctx, driver.BufferBindingVertices, len(vertexData)) g.output.buffer.buffer.Upload(vertexData) g.ctx.BindVertexBuffer(g.output.buffer.buffer, int(unsafe.Sizeof(g.output.layerVertices[0])), 0) g.ctx.BindTexture(0, atlas.image) @@ -705,6 +788,7 @@ func (g *compute) blitLayers(viewport image.Point) { func (g *compute) renderMaterials() error { m := &g.materials m.quads = m.quads[:0] + m.regions = m.regions[:0] resize := false reclaimed := false restart: @@ -752,6 +836,10 @@ restart: } m.offsets[op.key] = offset g.enc.setFillImageOffset(op.sceneIdx, offset.Sub(op.off)) + m.regions = append(m.regions, image.Rectangle{ + Min: place.Pos, + Max: place.Pos.Add(size), + }) } break } @@ -768,6 +856,7 @@ restart: m.tex.Release() m.tex = nil } + m.cpuTex.Free() handle, err := g.ctx.NewTexture(driver.TextureFormatRGBA8, texSize, texSize, driver.FilterNearest, driver.FilterNearest, driver.BufferBindingShaderStorage|driver.BufferBindingFramebuffer) @@ -781,6 +870,9 @@ restart: } m.tex = handle m.fbo = fbo + if g.useCPU { + m.cpuTex = cpu.NewImageRGBA(texSize, texSize) + } } // Transform to clip space: [-1, -1] - [1, 1]. g.materials.uniforms.scale = [2]float32{2 / float32(texSize), 2 / float32(texSize)} @@ -788,7 +880,7 @@ restart: g.materials.uniBuf.Upload(byteslice.Struct(g.materials.uniforms)) vertexData := byteslice.Slice(m.quads) n := pow2Ceil(len(vertexData)) - m.buffer.ensureCapacity(g.ctx, driver.BufferBindingVertices, n) + m.buffer.ensureCapacity(false, g.ctx, driver.BufferBindingVertices, n) m.buffer.buffer.Upload(vertexData) g.ctx.BindTexture(0, g.images.tex) g.ctx.BindFramebuffer(m.fbo) @@ -962,7 +1054,7 @@ func (enc *encoder) encodePath(verts []byte) { } } -func (g *compute) render(dst driver.Texture, tileDims image.Point) error { +func (g *compute) render(dst driver.Texture, cpuDst cpu.ImageDescriptor, tileDims image.Point, stride int) error { const ( // wgSize is the largest and most common workgroup size. wgSize = 128 @@ -985,16 +1077,11 @@ func (g *compute) render(dst driver.Texture, tileDims image.Point) error { if s := len(scene); s > g.buffers.scene.size { realloced = true paddedCap := s * 11 / 10 - if err := g.buffers.scene.ensureCapacity(g.ctx, driver.BufferBindingShaderStorage, paddedCap); err != nil { + if err := g.buffers.scene.ensureCapacity(g.useCPU, g.ctx, driver.BufferBindingShaderStorage, paddedCap); err != nil { return err } } - g.buffers.scene.buffer.Upload(scene) - - g.ctx.BindImageTexture(kernel4OutputUnit, dst, driver.AccessWrite, driver.TextureFormatRGBA8) - if t := g.materials.tex; t != nil { - g.ctx.BindImageTexture(kernel4AtlasUnit, t, driver.AccessRead, driver.TextureFormatRGBA8) - } + g.buffers.scene.upload(scene) // alloc is the number of allocated bytes for static buffers. var alloc uint32 @@ -1027,12 +1114,14 @@ func (g *compute) render(dst driver.Texture, tileDims image.Point) error { if clearSize > g.buffers.state.size { realloced = true paddedCap := clearSize * 11 / 10 - if err := g.buffers.state.ensureCapacity(g.ctx, driver.BufferBindingShaderStorage, paddedCap); err != nil { + if err := g.buffers.state.ensureCapacity(g.useCPU, g.ctx, driver.BufferBindingShaderStorage, paddedCap); err != nil { return err } } - g.buffers.config.Upload(byteslice.Struct(g.conf)) + confData := byteslice.Struct(g.conf) + g.buffers.config.ensureCapacity(g.useCPU, g.ctx, driver.BufferBindingShaderStorage, len(confData)) + g.buffers.config.upload(confData) minSize := int(unsafe.Sizeof(memoryHeader{})) + int(alloc) if minSize > g.buffers.memory.size { @@ -1040,45 +1129,53 @@ func (g *compute) render(dst driver.Texture, tileDims image.Point) error { // Add space for dynamic GPU allocations. const sizeBump = 4 * 1024 * 1024 minSize += sizeBump - if err := g.buffers.memory.ensureCapacity(g.ctx, driver.BufferBindingShaderStorage, minSize); err != nil { + if err := g.buffers.memory.ensureCapacity(g.useCPU, g.ctx, driver.BufferBindingShaderStorage, minSize); err != nil { return err } } + + if !g.useCPU { + g.ctx.BindImageTexture(kernel4OutputUnit, dst, driver.AccessWrite, driver.TextureFormatRGBA8) + if t := g.materials.tex; t != nil { + g.ctx.BindImageTexture(kernel4AtlasUnit, t, driver.AccessRead, driver.TextureFormatRGBA8) + } + } else { + *g.output.descriptors.Binding2() = cpuDst + *g.output.descriptors.Binding3() = g.materials.cpuTex + } + for { *g.memHeader = memoryHeader{ mem_offset: alloc, } - g.buffers.memory.buffer.Upload(byteslice.Struct(g.memHeader)) - g.buffers.state.buffer.Upload(g.zeros(clearSize)) + g.buffers.memory.upload(byteslice.Struct(g.memHeader)) + g.buffers.state.upload(g.zeros(clearSize)) if realloced { realloced = false g.bindBuffers() } - g.ctx.MemoryBarrier() - g.ctx.BindProgram(g.programs.elements) - g.ctx.DispatchCompute(numPartitions, 1, 1) - g.ctx.MemoryBarrier() - g.ctx.BindProgram(g.programs.tileAlloc) - g.ctx.DispatchCompute((enc.npath+wgSize-1)/wgSize, 1, 1) - g.ctx.MemoryBarrier() - g.ctx.BindProgram(g.programs.pathCoarse) - g.ctx.DispatchCompute((enc.npathseg+31)/32, 1, 1) - g.ctx.MemoryBarrier() - g.ctx.BindProgram(g.programs.backdrop) - g.ctx.DispatchCompute((enc.npath+wgSize-1)/wgSize, 1, 1) + g.memoryBarrier() + g.dispatch(g.programs.elements, numPartitions, 1, 1) + g.memoryBarrier() + g.dispatch(g.programs.tileAlloc, (enc.npath+wgSize-1)/wgSize, 1, 1) + g.memoryBarrier() + g.dispatch(g.programs.pathCoarse, (enc.npathseg+31)/32, 1, 1) + g.memoryBarrier() + g.dispatch(g.programs.backdrop, (enc.npath+wgSize-1)/wgSize, 1, 1) // No barrier needed between backdrop and binning. - g.ctx.BindProgram(g.programs.binning) - g.ctx.DispatchCompute((enc.npath+wgSize-1)/wgSize, 1, 1) - g.ctx.MemoryBarrier() - g.ctx.BindProgram(g.programs.coarse) - g.ctx.DispatchCompute(widthInBins, heightInBins, 1) - g.ctx.MemoryBarrier() - g.ctx.BindProgram(g.programs.kernel4) - g.ctx.DispatchCompute(tileDims.X, tileDims.Y, 1) - g.ctx.MemoryBarrier() + g.dispatch(g.programs.binning, (enc.npath+wgSize-1)/wgSize, 1, 1) + g.memoryBarrier() + g.dispatch(g.programs.coarse, widthInBins, heightInBins, 1) + g.memoryBarrier() + g.downloadMaterials() + g.dispatch(g.programs.kernel4, tileDims.X, tileDims.Y, 1) + g.memoryBarrier() + if g.useCPU { + g.dispatcher.Sync() + } - if err := g.buffers.memory.buffer.Download(byteslice.Struct(g.memHeader)); err != nil { + if err := g.buffers.memory.download(byteslice.Struct(g.memHeader)); err != nil { if err == driver.ErrContentLost { continue } @@ -1086,12 +1183,16 @@ func (g *compute) render(dst driver.Texture, tileDims image.Point) error { } switch errCode := g.memHeader.mem_error; errCode { case memNoError: + if g.useCPU { + w, h := tileDims.X*tileWidthPx, tileDims.Y*tileHeightPx + dst.Upload(image.Pt(0, 0), image.Pt(w, h), cpuDst.Data(), stride) + } return nil case memMallocFailed: // Resize memory and try again. realloced = true sz := g.buffers.memory.size * 15 / 10 - if err := g.buffers.memory.ensureCapacity(g.ctx, driver.BufferBindingShaderStorage, sz); err != nil { + if err := g.buffers.memory.ensureCapacity(g.useCPU, g.ctx, driver.BufferBindingShaderStorage, sz); err != nil { return err } continue @@ -1101,6 +1202,48 @@ func (g *compute) render(dst driver.Texture, tileDims image.Point) error { } } +func (g *compute) downloadMaterials() { + m := &g.materials + if !g.useCPU || len(m.regions) == 0 { + return + } + copyFBO := m.fbo + data := m.cpuTex.Data() + for _, r := range m.regions { + dims := r.Size() + if n := dims.X * dims.Y * 4; n > len(m.scratch) { + m.scratch = make([]byte, n) + } + copyFBO.ReadPixels(r, m.scratch) + stride := m.packer.maxDim * 4 + col := r.Min.X * 4 + row := stride * r.Min.Y + off := col + row + w := dims.X * 4 + for y := 0; y < dims.Y; y++ { + copy(data[off:off+w], m.scratch[y*dims.X*4:]) + off += stride + } + } +} + +func (g *compute) memoryBarrier() { + if !g.useCPU { + g.ctx.MemoryBarrier() + } else { + g.dispatcher.Barrier() + } +} + +func (g *compute) dispatch(p computeProgram, x, y, z int) { + if !g.useCPU { + g.ctx.BindProgram(p.prog) + g.ctx.DispatchCompute(x, y, z) + } else { + g.dispatcher.Dispatch(p.progInfo, p.descriptors, x, y, z) + } +} + // zeros returns a byte slice with size bytes of zeros. func (g *compute) zeros(size int) []byte { if cap(g.zeroSlice) < size { @@ -1109,7 +1252,7 @@ func (g *compute) zeros(size int) []byte { return g.zeroSlice[:size] } -func (a *layerAtlas) ensureSize(ctx driver.Device, size image.Point) error { +func (a *layerAtlas) ensureSize(useCPU bool, ctx driver.Device, size image.Point) error { if a.size.X >= size.X && a.size.Y >= size.Y { return nil } @@ -1122,6 +1265,8 @@ func (a *layerAtlas) ensureSize(ctx driver.Device, size image.Point) error { a.image.Release() a.image = nil } + a.cpuImage.Free() + img, err := ctx.NewTexture(driver.TextureFormatRGBA8, size.X, size.Y, driver.FilterNearest, driver.FilterNearest, @@ -1136,29 +1281,35 @@ func (a *layerAtlas) ensureSize(ctx driver.Device, size image.Point) error { } a.fbo = fbo a.image = img + if useCPU { + a.cpuImage = cpu.NewImageRGBA(size.X, size.Y) + } a.size = size return nil } func (g *compute) Release() { + if g.useCPU { + g.dispatcher.Stop() + } type resource interface { Release() } res := []resource{ - g.programs.elements, - g.programs.tileAlloc, - g.programs.pathCoarse, - g.programs.backdrop, - g.programs.binning, - g.programs.coarse, - g.programs.kernel4, + &g.programs.elements, + &g.programs.tileAlloc, + &g.programs.pathCoarse, + &g.programs.backdrop, + &g.programs.binning, + &g.programs.coarse, + &g.programs.kernel4, g.output.blitProg, &g.output.buffer, g.output.uniBuf, &g.buffers.scene, &g.buffers.state, &g.buffers.memory, - g.buffers.config, + &g.buffers.config, g.images.tex, g.materials.layout, g.materials.prog, @@ -1168,6 +1319,7 @@ func (g *compute) Release() { g.materials.uniBuf, g.timers.t, } + g.materials.cpuTex.Free() for _, r := range res { if r != nil { r.Release() @@ -1180,48 +1332,82 @@ func (g *compute) Release() { if a.image != nil { a.image.Release() } + a.cpuImage.Free() } *g = compute{} } func (g *compute) bindBuffers() { - bindStorageBuffers(g.programs.elements, g.buffers.memory.buffer, g.buffers.config, g.buffers.scene.buffer, g.buffers.state.buffer) - bindStorageBuffers(g.programs.tileAlloc, g.buffers.memory.buffer, g.buffers.config) - bindStorageBuffers(g.programs.pathCoarse, g.buffers.memory.buffer, g.buffers.config) - bindStorageBuffers(g.programs.backdrop, g.buffers.memory.buffer, g.buffers.config) - bindStorageBuffers(g.programs.binning, g.buffers.memory.buffer, g.buffers.config) - bindStorageBuffers(g.programs.coarse, g.buffers.memory.buffer, g.buffers.config) - bindStorageBuffers(g.programs.kernel4, g.buffers.memory.buffer, g.buffers.config) + g.bindStorageBuffers(g.programs.elements, g.buffers.memory, g.buffers.config, g.buffers.scene, g.buffers.state) + g.bindStorageBuffers(g.programs.tileAlloc, g.buffers.memory, g.buffers.config) + g.bindStorageBuffers(g.programs.pathCoarse, g.buffers.memory, g.buffers.config) + g.bindStorageBuffers(g.programs.backdrop, g.buffers.memory, g.buffers.config) + g.bindStorageBuffers(g.programs.binning, g.buffers.memory, g.buffers.config) + g.bindStorageBuffers(g.programs.coarse, g.buffers.memory, g.buffers.config) + g.bindStorageBuffers(g.programs.kernel4, g.buffers.memory, g.buffers.config) +} + +func (p *computeProgram) Release() { + if p.prog != nil { + p.prog.Release() + } + *p = computeProgram{} } func (b *sizedBuffer) Release() { if b.buffer == nil { return } - b.buffer.Release() + b.cpuBuf.Free() *b = sizedBuffer{} } -func (b *sizedBuffer) ensureCapacity(ctx driver.Device, binding driver.BufferBinding, size int) error { +func (b *sizedBuffer) ensureCapacity(useCPU bool, ctx driver.Device, binding driver.BufferBinding, size int) error { if b.size >= size { return nil } if b.buffer != nil { b.Release() } - buf, err := ctx.NewBuffer(binding, size) - if err != nil { - return err + b.cpuBuf.Free() + if !useCPU { + buf, err := ctx.NewBuffer(binding, size) + if err != nil { + return err + } + b.buffer = buf + } else { + b.cpuBuf = cpu.NewBuffer(size) } - b.buffer = buf b.size = size return nil } -func bindStorageBuffers(prog driver.Program, buffers ...driver.Buffer) { +func (b *sizedBuffer) download(data []byte) error { + if b.buffer != nil { + return b.buffer.Download(data) + } else { + copy(data, b.cpuBuf.Data()) + return nil + } +} + +func (b *sizedBuffer) upload(data []byte) { + if b.buffer != nil { + b.buffer.Upload(data) + } else { + copy(b.cpuBuf.Data(), data) + } +} + +func (g *compute) bindStorageBuffers(prog computeProgram, buffers ...sizedBuffer) { for i, buf := range buffers { - prog.SetStorageBuffer(i, buf) + if !g.useCPU { + prog.prog.SetStorageBuffer(i, buf.buffer) + } else { + *prog.buffers[i] = buf.cpuBuf + } } } diff --git a/gpu/cpu.go b/gpu/cpu.go new file mode 100644 index 00000000..3e6e714d --- /dev/null +++ b/gpu/cpu.go @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: Unlicense OR MIT + +package gpu + +import ( + "runtime" + "unsafe" + + "gioui.org/cpu" +) + +const supportsCPUCompute = runtime.GOARCH == "amd64" || runtime.GOARCH == "arm64" || runtime.GOARCH == "arm" + +// This file contains code specific to running compute shaders on the CPU. + +// dispatcher dispatches CPU compute programs across multiple goroutines. +type dispatcher struct { + // done is notified when a worker completes its work slice. + done chan struct{} + // work receives work slice indices. It is closed when the dispatcher is released. + work chan work + // dispatch receives compute jobs, which is then split among workers. + dispatch chan dispatch + // sync receives notification when a Sync completes. + sync chan struct{} +} + +type work struct { + ctx *cpu.DispatchContext + index int +} + +type dispatch struct { + _type jobType + program *cpu.ProgramInfo + descSet unsafe.Pointer + x, y, z int +} + +type jobType uint8 + +const ( + jobDispatch jobType = iota + jobBarrier + jobSync +) + +func newDispatcher(workers int) *dispatcher { + d := &dispatcher{ + work: make(chan work, workers), + done: make(chan struct{}, workers), + // Leave some room to avoid blocking calls to Dispatch. + dispatch: make(chan dispatch, 20), + sync: make(chan struct{}), + } + for i := 0; i < workers; i++ { + go d.worker() + } + go d.dispatcher() + return d +} + +func (d *dispatcher) dispatcher() { + defer close(d.work) + var free []*cpu.DispatchContext + defer func() { + for _, ctx := range free { + ctx.Free() + } + }() + var used []*cpu.DispatchContext + for job := range d.dispatch { + switch job._type { + case jobDispatch: + if len(free) == 0 { + free = append(free, cpu.NewDispatchContext()) + } + ctx := free[len(free)-1] + free = free[:len(free)-1] + used = append(used, ctx) + ctx.Prepare(cap(d.work), job.program, job.descSet, job.x, job.y, job.z) + for i := 0; i < cap(d.work); i++ { + d.work <- work{ + ctx: ctx, + index: i, + } + } + case jobBarrier: + // Wait for all outstanding dispatches to complete. + for i := 0; i < len(used)*cap(d.work); i++ { + <-d.done + } + free = append(free, used...) + used = used[:0] + case jobSync: + d.sync <- struct{}{} + } + } +} + +func (d *dispatcher) worker() { + thread := cpu.NewThreadContext() + defer thread.Free() + for w := range d.work { + w.ctx.Dispatch(w.index, thread) + d.done <- struct{}{} + } +} + +func (d *dispatcher) Barrier() { + d.dispatch <- dispatch{_type: jobBarrier} +} + +func (d *dispatcher) Sync() { + d.dispatch <- dispatch{_type: jobSync} + <-d.sync +} + +func (d *dispatcher) Dispatch(program *cpu.ProgramInfo, descSet unsafe.Pointer, x, y, z int) { + d.dispatch <- dispatch{ + _type: jobDispatch, + program: program, + descSet: descSet, + x: x, + y: y, + z: z, + } +} + +func (d *dispatcher) Stop() { + close(d.dispatch) +} diff --git a/gpu/gpu.go b/gpu/gpu.go index fbe9bebc..f7673130 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -9,7 +9,6 @@ package gpu import ( "encoding/binary" - "errors" "fmt" "image" "image/color" @@ -364,11 +363,8 @@ func New(api API) (GPU, error) { switch { case !forceCompute && feats.Has(driver.FeatureFloatRenderTargets): return newGPU(d) - case feats.Has(driver.FeatureCompute): - return newCompute(d) - default: - return nil, errors.New("gpu: no support for float render targets nor compute") } + return newCompute(d) } func newGPU(ctx driver.Device) (*gpu, error) {