text: correct arabic diacritics handling

This commit fixes the association of diacritical marks with the proper script
during text segmentation, as well as fixing the visual position of diacritical
marks. The prior code inverted the Y axis positioning for diacritics, which made
them frequently overlap the glyph they were meant to appear above or below.

Signed-off-by: runitclean <runitclean@disroot.org>
This commit is contained in:
runitclean
2026-01-06 14:57:06 +00:00
committed by Chris Waldon
parent 99647591f6
commit 3af0ebb3a8
2 changed files with 105 additions and 3 deletions
+3 -3
View File
@@ -312,7 +312,7 @@ func splitByScript(inputs []shaping.Input, documentDir di.Direction, buf []shapi
r := input.Text[i]
runeScript := language.LookupScript(r)
if runeScript == language.Common || runeScript == currentInput.Script {
if runeScript == language.Common || runeScript == language.Inherited || runeScript == currentInput.Script {
continue
}
@@ -661,7 +661,7 @@ func (s *shaperImpl) Shape(pathOps *op.Ops, gs []Glyph) clip.PathSpec {
outline := glyphData
// Move to glyph position.
pos := f32.Point{
X: fixedToFloat((g.X - x) - g.Offset.X),
X: fixedToFloat((g.X - x) + g.Offset.X),
Y: -fixedToFloat(g.Offset.Y),
}
builder.Move(pos.Sub(lastPos))
@@ -761,7 +761,7 @@ func (s *shaperImpl) Bitmaps(ops *op.Ops, gs []Glyph) op.CallOp {
imgSize = bitmapData.size
}
off := op.Affine(f32.AffineId().Offset(f32.Point{
X: fixedToFloat((g.X - x) - g.Offset.X),
X: fixedToFloat((g.X - x) + g.Offset.X),
Y: fixedToFloat(g.Offset.Y + g.Bounds.Min.Y),
})).Push(ops)
cl := clip.Rect{Max: imgSize}.Push(ops)
+102
View File
@@ -10,6 +10,8 @@ import (
nsareg "eliasnaur.com/font/noto/sans/arabic/regular"
"github.com/go-text/typesetting/font"
"github.com/go-text/typesetting/shaping"
"github.com/go-text/typesetting/language"
"github.com/go-text/typesetting/di"
"golang.org/x/image/font/gofont/goregular"
"golang.org/x/image/math/fixed"
@@ -593,3 +595,103 @@ func TestGlyphIDPacking(t *testing.T) {
})
}
}
// TestArabicDiacriticClustering verifies that Arabic diacritics (which usually have
// script 'Inherited') are correctly clustered with their base Arabic letters,
// rather than being split into a separate shaping run.
func TestArabicDiacriticClustering(t *testing.T) {
tests := []struct {
name string
input []rune
wantRuns int
wantScript language.Script
wantDirection di.Direction
}{
{
name: "Arabic Letter + Diacritic",
// \u0628 => BEH
// \u0650 => KASRA (Diacritic)
input: []rune{'\u0628', '\u0650'},
wantRuns: 1,
wantScript: language.Arabic,
wantDirection: di.DirectionRTL,
},
{
name: "Arabic Word with Multiple Diacritics",
input: []rune{
'\u0628', // BEH
'\u0650', // KASRA
'\u0633', // SEEN
'\u0652', // SUKUN
'\u0645', // MEEM
'\u0650', // KASRA
},
wantRuns: 1,
wantScript: language.Arabic,
wantDirection: di.DirectionRTL,
},
{
name: "Mixed Script (CONTROL Case) #1",
// Arabic Letter + Latin Letter
// THESE MUST SPLIT TO 2.
input: []rune{'\u0628', 'A'},
wantRuns: 2,
wantScript: language.Arabic,
wantDirection: di.DirectionRTL,
},
{
name: "Mixed Script (CONTROL Case) #2",
// Arabic Letter + Diacritic + Diacritic + Latin Letter + Arabic Letter + Diacritic
// THESE MUST SPLIT TO 3.
input: []rune{'\u0628', '\u0651', '\u0650', 'A', '\u0628', '\u0650'},
wantRuns: 3,
wantScript: language.Arabic,
wantDirection: di.DirectionRTL,
},
{
name: "Mixed Script (A little 'stress' test)",
// Latin 's' + Arabic Kasra + Latin 'r' + Arabic Fatha
// this technically valid unicode!
// the diacritics should inherit "Latin"
input: []rune{'s', '\u0651', '\u0650', 'r', '\u064E'},
wantRuns: 1,
wantScript: language.Latin,
wantDirection: di.DirectionLTR,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
inputs := []shaping.Input{{
Text: tt.input,
RunStart: 0,
RunEnd: len(tt.input),
Direction: tt.wantDirection,
Script: language.Arabic,
Face: nil, // face doesn't really matter for splitting anyway
Size: fixed.I(10),
}}
got := splitByScript(inputs, tt.wantDirection, nil)
if len(got) != tt.wantRuns {
t.Fatalf("splitByScript produced %d runs, expected %d. \nRun details: %+v", len(got), tt.wantRuns, got)
}
// this is for the single-run cases
// we need to verify the integrity of the single run
// to ensure
// - the truncation didn't happen early on (when first hitting a diacritic)
// - and the right dominant script label was used
if tt.wantRuns == 1 {
run := got[0]
if run.RunEnd != len(tt.input) {
t.Errorf("Run truncated early. End = %d, expected %d", run.RunEnd, len(tt.input))
}
if run.Script != tt.wantScript {
t.Errorf("Run assigned wrong script. Got %s, expected %s", run.Script, tt.wantScript)
}
}
})
}
}