text: add family DSL parser

This commit adds a parser for a simple domain-specific language that can express a comma-delimited list of font families within a string. I chose to encode families in this way because the string can be used as an efficient hash key in a way that a slice of families cannot. Similarly, using a slice of families would require allocations on the caller side. The particular format was chosen to allow lists to be written with as little fanfare as possible. This is why quotation marks are completely optional. It's easy to read: Times New Roman, Georgia, serif. Why force the user to type this (this will parse the same): "Times New Roman", "Georgia", "serif"? I've tried to handle edge cases exhaustively. Commas are legal within quotes. Within a quoted string, you can escape instances of the surrounding quote with a backslash, and can escape literal backslashes by adding another backslash. I wrote the lexer/parser by hand, and I hope that they're both easy to understand and (if need be) extend. A side effect of the DSL I've chosen (and part of my reasoning for allowing both single and double quoted strings) is that CSS font-family rules will generally be valid font family lists in Gio. This means the syntax is already familiar to users coming from other technologies, and that you can copy from a web-based application to get a similar font stack in Gio. Fixes: https://todo.sr.ht/~eliasnaur/gio/317 Signed-off-by: Chris Waldon <christopher.waldon.dev@gmail.com>
2026-07-01 07:35:40 +00:00 · 2023-07-03 16:37:58 -04:00
parent 43c47f0883
commit 6384ab6087
4 changed files with 471 additions and 4 deletions
@@ -0,0 +1,246 @@
+package text
+
+import (
+	"fmt"
+	"strings"
+	"unicode"
+	"unicode/utf8"
+)
+
+// tokenKind identifies the category of a lexed token.
+type tokenKind uint8
+
+const (
+	// tokenStr is a family name, either bare or quoted.
+	tokenStr tokenKind = iota
+	// tokenComma is a ',' encountered outside of a quoted string.
+	tokenComma
+	// tokenEOF marks the end of the input.
+	tokenEOF
+)
+
+// token is a single lexical element produced by the lexer.
+type token struct {
+	// kind is the category of the token.
+	kind  tokenKind
+	// value is the processed text of the token. lexText emits comma and
+	// EOF tokens with an empty value.
+	value string
+}
+
+// String returns a printable form of the token: the value of a string
+// token, "," for a comma, "EOF" for end of input, and "unknown" for any
+// other kind.
+func (t token) String() string {
+	if t.kind == tokenStr {
+		return t.value
+	}
+	if t.kind == tokenComma {
+		return ","
+	}
+	if t.kind == tokenEOF {
+		return "EOF"
+	}
+	return "unknown"
+}
+
+// lexState is one state of the lexer's state machine. It consumes input from
+// the lexer and returns the state to run next, or nil to stop lexing.
+type lexState func(*lexer) lexState
+
+// lexText is the lexer's top-level state. It passes over whitespace, emits
+// comma and EOF tokens, and hands off to the quoted or bare string states
+// when a family name begins. It returns nil once the input is exhausted.
+func lexText(l *lexer) lexState {
+	for {
+		r := l.next()
+		if r == -1 {
+			l.ignore()
+			l.emit(tokenEOF)
+			return nil
+		}
+		if unicode.IsSpace(r) {
+			// Whitespace stays pending; a later ignore or the bare
+			// string's trimming discards it.
+			continue
+		}
+		switch r {
+		case ',':
+			l.ignore()
+			l.emit(tokenComma)
+		case '"':
+			l.ignore()
+			return lexDquote
+		case '\'':
+			l.ignore()
+			return lexSquote
+		default:
+			// The rune just consumed is the first character of a bare name.
+			return lexBareStr
+		}
+	}
+}
+
+// lexBareStr lexes an unquoted family name. It consumes input up to, but not
+// including, the next comma (or to the end of the input), emits the consumed
+// text with surrounding whitespace trimmed as a string token, and resumes
+// lexText.
+func lexBareStr(l *lexer) lexState {
+	// Stop in front of the delimiter so that lexText emits the comma token.
+	for l.pos < len(l.input) && l.input[l.pos] != ',' {
+		l.next()
+	}
+	l.emitProcessed(tokenStr, func(raw string) (string, error) {
+		return strings.TrimSpace(raw), nil
+	})
+	return lexText
+}
+
+// lexDquote lexes the body of a double-quoted family name.
+func lexDquote(l *lexer) lexState {
+	return lexQuote(l, `"`)
+}
+
+// lexSquote lexes the body of a single-quoted family name.
+func lexSquote(l *lexer) lexState {
+	return lexQuote(l, `'`)
+}
+
+// unescape converts the raw contents of a quoted string into its value.
+// Leading and trailing whitespace is dropped, whitespace between other
+// characters is preserved, and the escape sequences \\ and \<quote> are
+// replaced by the character they escape. Any other escape sequence is an
+// error.
+func unescape(s string, quote rune) (string, error) {
+	var (
+		out     strings.Builder
+		gap     strings.Builder // whitespace seen since the last non-space rune
+		started bool            // whether a non-space rune has been seen yet
+	)
+	for rest := s; rest != ""; {
+		r, width := utf8.DecodeRuneInString(rest)
+		rest = rest[width:]
+		if unicode.IsSpace(r) {
+			// Whitespace is only kept once it is known to sit between
+			// two non-space runes, so buffer it for now.
+			if started {
+				gap.WriteRune(r)
+			}
+			continue
+		}
+		started = true
+		// A non-space rune follows, so the buffered whitespace is interior.
+		out.WriteString(gap.String())
+		gap.Reset()
+		if r != '\\' {
+			out.WriteRune(r)
+			continue
+		}
+		// Consume the escaped rune without any whitespace handling.
+		esc, width := utf8.DecodeRuneInString(rest)
+		rest = rest[width:]
+		if esc != '\\' && esc != quote {
+			return "", fmt.Errorf("illegal escape sequence \\%c", esc)
+		}
+		out.WriteRune(esc)
+	}
+	return out.String(), nil
+}
+
+// lexQuote lexes the body of a quoted family name whose opening quote has
+// already been consumed and discarded. mark is the quote character that
+// terminates the string. On reaching an unescaped closing quote it emits the
+// unescaped contents as a string token and resumes lexText. If the contents
+// contain an illegal escape sequence it records the error on the lexer and
+// stops lexing. If the input ends before the closing quote it records an
+// error and returns lexText.
+func lexQuote(l *lexer, mark string) lexState {
+	// escaping reports whether the previously consumed rune was an unescaped
+	// backslash, which means the next rune is escaped.
+	escaping := false
+	for {
+		if isQuote := strings.HasPrefix(l.input[l.pos:], mark); isQuote && !escaping {
+			// Unescaped closing quote: everything pending is the string body.
+			err := l.emitProcessed(tokenStr, func(s string) (string, error) {
+				return unescape(s, []rune(mark)[0])
+			})
+			if err != nil {
+				l.err = err
+				return nil
+			}
+			// Consume and discard the closing quote itself.
+			l.next()
+			l.ignore()
+			return lexText
+		}
+		// Remember whether the rune about to be consumed is escaped.
+		escaped := escaping
+		switch r := l.next(); {
+		case r == -1:
+			l.err = fmt.Errorf("unexpected EOF while parsing %s-quoted family", mark)
+			return lexText
+		case r == '\\':
+			// Only an unescaped backslash starts an escape sequence.
+			if !escaped {
+				escaping = true
+			}
+		}
+		if escaped {
+			// The escaped rune has been consumed; the escape is over.
+			escaping = false
+		}
+	}
+}
+
+// lexer splits a font family list into tokens.
+type lexer struct {
+	// input is the text that has not yet been emitted or ignored.
+	input  string
+	// pos is the byte offset into input of the next rune to decode.
+	// input[:pos] is the pending text of the token being lexed.
+	pos    int
+	// tokens accumulates the tokens emitted so far.
+	tokens []token
+	// err holds an error encountered while lexing, if any.
+	err    error
+}
+
+// ignore discards the pending input (everything before the current
+// position) so that it does not become part of any token.
+func (l *lexer) ignore() {
+	l.input = l.input[l.pos:]
+	l.pos = 0
+}
+
+// next decodes the rune at the current position, advances past it, and
+// returns it. Once the input is exhausted it returns -1 without advancing.
+func (l *lexer) next() int32 {
+	if l.pos < len(l.input) {
+		r, width := utf8.DecodeRuneInString(l.input[l.pos:])
+		l.pos += width
+		return r
+	}
+	return -1
+}
+
+// emit adds a token of the given kind whose value is the pending input,
+// unmodified.
+func (l *lexer) emit(t tokenKind) {
+	l.emitProcessed(t, func(s string) (string, error) { return s, nil })
+}
+
+// emitProcessed adds a token of the given kind, but transforms its value
+// with the provided closure first. The closure receives the pending input
+// (everything before the current position). The token is appended and the
+// pending input is discarded even if the closure returns an error; that
+// error is returned to the caller.
+func (l *lexer) emitProcessed(t tokenKind, f func(string) (string, error)) error {
+	val, err := f(l.input[:l.pos])
+	l.tokens = append(l.tokens, token{
+		kind:  t,
+		value: val,
+	})
+	l.ignore()
+	return err
+}
+
+// run executes the lexer on the given input and returns the resulting tokens
+// along with any error encountered while lexing. The lexer may be reused: all
+// state from a previous run, including a previous error, is reset first. The
+// returned slice shares its backing array with the lexer and is only valid
+// until the next call to run.
+func (l *lexer) run(input string) ([]token, error) {
+	l.input = input
+	l.tokens = l.tokens[:0]
+	l.pos = 0
+	// Clear any error left over from an earlier run; otherwise a single bad
+	// input would make every later run on this lexer fail.
+	l.err = nil
+	for state := lexText; state != nil; {
+		state = state(l)
+	}
+	return l.tokens, l.err
+}
+
+// parser implements a simple recursive descent parser for font family fallback
+// expressions.
+type parser struct {
+	// faces accumulates the family names parsed so far. Its backing array is
+	// reused across calls to parse.
+	faces  []string
+	// lexer tokenizes the rule being parsed.
+	lexer  lexer
+	// tokens holds the tokens that have not yet been consumed.
+	tokens []token
+}
+
+// parse the provided rule and return the extracted font families. The returned families
+// are valid only until the next call to parse. If parsing fails, a nil slice and an error
+// describing the failure are returned instead.
+func (p *parser) parse(rule string) ([]string, error) {
+	var err error
+	p.tokens, err = p.lexer.run(rule)
+	if err != nil {
+		return nil, err
+	}
+	p.faces = p.faces[:0]
+	// Run the parser before reading p.faces. parseList appends to p.faces, and
+	// the order in which a variable read and a function call are evaluated
+	// within a single return statement is not specified by the language.
+	if err := p.parseList(); err != nil {
+		return nil, err
+	}
+	return p.faces, nil
+}
+
+// parseList implements the production:
+//
+//	LIST ::= <FACE> <COMMA> <LIST> | <FACE>
+//
+// It consumes tokens from p.tokens and appends each family name it finds to
+// p.faces. It returns an error if the tokens do not form a valid list.
+func (p *parser) parseList() error {
+	for {
+		// Every list, and every continuation after a comma, must start
+		// with a family name.
+		if len(p.tokens) == 0 {
+			return fmt.Errorf("expected family name, got EOF")
+		}
+		head := p.tokens[0]
+		if head.kind != tokenStr {
+			return fmt.Errorf("expected family name, got %s", head)
+		}
+		p.faces = append(p.faces, head.value)
+		p.tokens = p.tokens[1:]
+
+		// The lexer terminates its output with an EOF token, but treat a
+		// token stream that simply ends here as the end of the list rather
+		// than indexing past the end of the slice.
+		if len(p.tokens) == 0 {
+			return nil
+		}
+		switch next := p.tokens[0]; next.kind {
+		case tokenEOF:
+			return nil
+		case tokenComma:
+			// Consume the comma and parse the remainder of the list.
+			p.tokens = p.tokens[1:]
+		default:
+			return fmt.Errorf("unexpected token %s", next)
+		}
+	}
+}
@@ -0,0 +1,179 @@
+package text
+
+import (
+	"testing"
+
+	"golang.org/x/exp/slices"
+)
+
+// TestParser checks the family list parser against a table of cases. Each
+// case lists several spellings of an input (for example bare, single-quoted
+// and double-quoted) that must all produce the same expected families, or
+// must all fail when shouldErr is set.
+func TestParser(t *testing.T) {
+	// scenario is one spelling of a test case's input.
+	type scenario struct {
+		variantName string
+		input       string
+	}
+	// testcase groups the scenarios that share an expected outcome.
+	type testcase struct {
+		name      string
+		inputs    []scenario
+		expected  []string
+		shouldErr bool
+	}
+
+	for _, tc := range []testcase{
+		{
+			name: "empty",
+			inputs: []scenario{
+				{
+					variantName: "",
+				},
+			},
+			shouldErr: true,
+		},
+		{
+			name: "comma failure",
+			inputs: []scenario{
+				{
+					variantName: "bare single",
+					input:       ",",
+				},
+				{
+					variantName: "bare multiple",
+					input:       ",, ,,",
+				},
+			},
+			shouldErr: true,
+		},
+		{
+			name: "comma success",
+			inputs: []scenario{
+				{
+					variantName: "squote",
+					input:       "','",
+				},
+				{
+					variantName: "dquote",
+					input:       `","`,
+				},
+			},
+			expected: []string{","},
+		},
+		{
+			name: "comma success multiple",
+			inputs: []scenario{
+				{
+					variantName: "squote",
+					input:       "',,', ',,'",
+				},
+				{
+					variantName: "dquote",
+					input:       `",,", ",,"`,
+				},
+			},
+			expected: []string{",,", ",,"},
+		},
+		{
+			name: "backslashes",
+			inputs: []scenario{
+				{
+					variantName: "bare",
+					input:       `\font\\`,
+				},
+				{
+					variantName: "dquote",
+					input:       `"\\font\\\\"`,
+				},
+				{
+					variantName: "squote",
+					input:       `'\\font\\\\'`,
+				},
+			},
+			expected: []string{`\font\\`},
+		},
+		{
+			name: "invalid backslashes",
+			inputs: []scenario{
+				{
+					variantName: "dquote",
+					input:       `"\\""`,
+				},
+				{
+					variantName: "squote",
+					input:       `'\\''`,
+				},
+			},
+			shouldErr: true,
+		},
+		{
+			name: "too many quotes",
+			inputs: []scenario{
+				{
+					variantName: "dquote",
+					input:       `"""`,
+				},
+				{
+					variantName: "squote",
+					input:       `'''`,
+				},
+			},
+			shouldErr: true,
+		},
+		{
+			name: "serif serif's serif\"s",
+			inputs: []scenario{
+				{
+					variantName: "bare",
+					input:       `serif, serif's, serif"s`,
+				},
+				{
+					variantName: "squote",
+					input:       `'serif', 'serif\'s', 'serif"s'`,
+				},
+				{
+					variantName: "dquote",
+					input:       `"serif", "serif's", "serif\"s"`,
+				},
+			},
+			expected: []string{"serif", `serif's`, `serif"s`},
+		},
+		{
+			name: "complex list",
+			inputs: []scenario{
+				{
+					variantName: "bare",
+					input:       `Times New Roman, Georgia Common, Helvetica Neue, serif`,
+				},
+				{
+					variantName: "squote",
+					input:       `'Times New Roman', 'Georgia Common', 'Helvetica Neue', 'serif'`,
+				},
+				{
+					variantName: "dquote",
+					input:       `"Times New Roman", "Georgia Common", "Helvetica Neue", "serif"`,
+				},
+				{
+					variantName: "mixed",
+					input:       `Times New Roman, "Georgia Common", 'Helvetica Neue', "serif"`,
+				},
+				{
+					variantName: "mixed with weird spacing",
+					input:       `Times New Roman  ,"Georgia Common"              , 'Helvetica Neue' ,"serif"`,
+				},
+			},
+			expected: []string{"Times New Roman", "Georgia Common", "Helvetica Neue", "serif"},
+		},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			// A single parser is shared by every variant of a case.
+			var p parser
+			for _, scen := range tc.inputs {
+				t.Run(scen.variantName, func(t *testing.T) {
+					actual, err := p.parse(scen.input)
+					if (err != nil) != tc.shouldErr {
+						t.Errorf("unexpected error state: %v", err)
+					}
+					// For failing cases expected is nil, so this requires
+					// that no families are returned.
+					if !slices.Equal(tc.expected, actual) {
+						t.Errorf("expected\n%q\ngot\n%q", tc.expected, actual)
+					}
+				})
+			}
+		})
+	}
+}
@@ -168,6 +168,7 @@ type shaperImpl struct {
 	logger       interface {
 		Printf(format string, args ...any)
 	}
+	parser parser

 	// Shaping and wrapping state.
 	shaper        shaping.HarfbuzzShaper
@@ -442,8 +443,17 @@ func (s *shaperImpl) shapeAndWrapText(params Parameters, txt []rune) (_ []shapin
 		TextContinues:      params.forceTruncate,
 		BreakPolicy:        wrapPolicyToGoText(params.WrapPolicy),
 	}
+	families := s.defaultFaces
+	if params.Font.Typeface != "" {
+		parsed, err := s.parser.parse(string(params.Font.Typeface))
+		if err != nil {
+			s.logger.Printf("Unable to parse typeface %q: %v", params.Font.Typeface, err)
+		} else {
+			families = parsed
+		}
+	}
 	s.fontMap.SetQuery(fontscan.Query{
-		Families: []string{string(params.Font.Typeface)},
+		Families: families,
 		Aspect:   opentype.FontToDescription(params.Font).Aspect,
 	})
 	if wc.TruncateAfterLines > 0 {