Files
gio-patched/text/family_parser.go
T
Chris Waldon 6384ab6087 text: add family DSL parser
This commit adds a parser for a simple domain-specific language that
can express a comma-delimited list of font families within a string.

I chose to encode families in this way because the string can be used
as an efficient hash key in a way that a slice of families cannot. Similarly,
using a slice of families would require allocations on the caller side.

The particular format was chosen to allow lists to be written with as little
fanfare as possible. This is why quotation marks are completely optional. It's
easy to read:

  Times New Roman, Georgia, serif

Why force the user to type the following, which parses to exactly the same list?

  "Times New Roman", "Georgia", "serif"

I've tried to handle edge cases exhaustively. Commas are legal within quotes.
Within a quoted string, you can escape instances of the surrounding quote with
a backslash, and can escape literal backslashes by adding another backslash.

I wrote the lexer/parser by hand, and I hope that they're both easy to understand
and (if need be) extend.

A side effect of the DSL I've chosen (and part of my reasoning for allowing both
single and double quoted strings) is that CSS font-family rules will generally be
valid font family lists in Gio. This means the syntax is already familiar to users
coming from other technologies, and that you can copy from a web-based application
to get a similar font stack in Gio.

Fixes: https://todo.sr.ht/~eliasnaur/gio/317
Signed-off-by: Chris Waldon <christopher.waldon.dev@gmail.com>
2023-07-19 10:01:51 +02:00

247 lines
4.7 KiB
Go

package text
import (
"fmt"
"strings"
"unicode"
"unicode/utf8"
)
// tokenKind identifies the category of a token produced by the lexer.
type tokenKind uint8
const (
// tokenStr is a family name, either bare or quoted. Its text is carried
// in the token's value field.
tokenStr tokenKind = iota
// tokenComma is the separator between two family names.
tokenComma
// tokenEOF marks the end of the input.
tokenEOF
)
// token is a single lexical element emitted by the lexer.
type token struct {
// kind is the category of the token.
kind tokenKind
// value is the processed text of the token. The lexer emits comma and
// EOF tokens with an empty value.
value string
}
// String returns a printable form of the token: the family text for a
// tokenStr, "," for a tokenComma, "EOF" for a tokenEOF, and "unknown" for
// any other kind.
func (t token) String() string {
	if t.kind == tokenStr {
		return t.value
	}
	if t.kind == tokenComma {
		return ","
	}
	if t.kind == tokenEOF {
		return "EOF"
	}
	return "unknown"
}
// lexState is one state of the lexer's state machine. Invoking it consumes
// input from the lexer and returns the state to run next, or nil to stop.
type lexState func(*lexer) lexState
// lexText is the top-level lexer state. It skips whitespace, emits a
// tokenComma for every comma it meets, and hands control to the quoted or
// bare string states when a family name begins. At the end of the input it
// emits tokenEOF and returns nil to stop the state machine.
func lexText(l *lexer) lexState {
	for {
		r := l.next()
		switch r {
		case -1:
			// Drop any pending whitespace so the EOF token carries no text.
			l.ignore()
			l.emit(tokenEOF)
			return nil
		case ',':
			l.ignore()
			l.emit(tokenComma)
			continue
		case '"':
			// Discard the opening quote; it is not part of the name.
			l.ignore()
			return lexDquote
		case '\'':
			l.ignore()
			return lexSquote
		}
		if !unicode.IsSpace(r) {
			// First rune of a bare name. It stays in the pending text
			// (along with any whitespace before it) for lexBareStr.
			return lexBareStr
		}
	}
}
// lexBareStr lexes an unquoted family name. It consumes input up to (but not
// including) the next comma, or up to the end of the input, emits the
// consumed text with surrounding whitespace trimmed as a tokenStr, and
// returns to lexText.
func lexBareStr(l *lexer) lexState {
	for !strings.HasPrefix(l.input[l.pos:], ",") {
		if l.next() == -1 {
			break
		}
	}
	// The trimming closure never fails, so the returned error is always nil.
	_ = l.emitProcessed(tokenStr, func(raw string) (string, error) {
		return strings.TrimSpace(raw), nil
	})
	return lexText
}
// lexDquote lexes the body of a double-quoted family name by delegating to
// lexQuote with the double quote as the closing mark.
func lexDquote(l *lexer) lexState {
	const mark = "\""
	return lexQuote(l, mark)
}
// lexSquote lexes the body of a single-quoted family name by delegating to
// lexQuote with the single quote as the closing mark.
func lexSquote(l *lexer) lexState {
	const mark = "'"
	return lexQuote(l, mark)
}
// unescape converts the raw contents of a quoted family name into its final
// form. Leading and trailing whitespace is dropped, interior whitespace is
// preserved, and the escape sequences \\ and \<quote> are replaced by the
// character they denote. Any other escape sequence yields an error.
func unescape(s string, quote rune) (string, error) {
	var (
		out     strings.Builder
		pending strings.Builder // whitespace seen since the last non-space rune
		started bool            // whether a non-space rune has been seen yet
	)
	for pos := 0; pos < len(s); {
		cur, width := utf8.DecodeRuneInString(s[pos:])
		pos += width
		if unicode.IsSpace(cur) {
			// Leading whitespace is skipped outright. Later whitespace is
			// buffered, so it only reaches the output if more text follows.
			if started {
				pending.WriteRune(cur)
			}
			continue
		}
		started = true
		// A non-space rune proves the buffered whitespace was interior.
		out.WriteString(pending.String())
		pending.Reset()
		if cur != '\\' {
			out.WriteRune(cur)
			continue
		}
		// A backslash must be followed by another backslash or the quote.
		esc, escWidth := utf8.DecodeRuneInString(s[pos:])
		pos += escWidth
		if esc != '\\' && esc != quote {
			return "", fmt.Errorf("illegal escape sequence \\%c", esc)
		}
		out.WriteRune(esc)
	}
	return out.String(), nil
}
// lexQuote lexes the body of a quoted family name whose closing delimiter is
// mark. It consumes input until it reaches an occurrence of mark that is not
// preceded by an unescaped backslash, emits the unescaped contents as a
// tokenStr, discards the closing mark, and returns to lexText. If the
// contents hold an illegal escape sequence, the error is recorded on the
// lexer and lexing stops. If the input ends before the closing mark, an error
// is recorded and control returns to lexText.
func lexQuote(l *lexer, mark string) lexState {
	quote := []rune(mark)[0]
	// afterBackslash is true when the previous rune was a backslash that
	// was not itself escaped, so the next rune is taken literally.
	afterBackslash := false
	for {
		if !afterBackslash && strings.HasPrefix(l.input[l.pos:], mark) {
			err := l.emitProcessed(tokenStr, func(raw string) (string, error) {
				return unescape(raw, quote)
			})
			if err != nil {
				l.err = err
				return nil
			}
			// Consume and drop the closing quote.
			l.next()
			l.ignore()
			return lexText
		}
		r := l.next()
		if r == -1 {
			l.err = fmt.Errorf("unexpected EOF while parsing %s-quoted family", mark)
			return lexText
		}
		// A backslash opens an escape only when it is not already the
		// target of one. Any rune following an escape closes it.
		afterBackslash = !afterBackslash && r == '\\'
	}
}
// lexer splits a family list string into tokens by running a chain of
// lexState functions over it.
type lexer struct {
// input is the portion of the text that has not yet been emitted or
// ignored.
input string
// pos is the byte offset within input of the next rune to decode;
// input[:pos] is the text pending for the next token.
pos int
// tokens accumulates the emitted tokens. run truncates it rather than
// reallocating, so its storage is reused across runs.
tokens []token
// err holds an error recorded by a lex state; run returns it.
err error
}
// ignore discards the text consumed so far, so the next token starts at the
// current position.
func (l *lexer) ignore() {
	l.input, l.pos = l.input[l.pos:], 0
}
// next decodes the rune at the current position, advances past it, and
// returns it. It returns -1 without advancing once the input is exhausted.
func (l *lexer) next() int32 {
	if remaining := len(l.input) - l.pos; remaining <= 0 {
		return -1
	}
	decoded, width := utf8.DecodeRuneInString(l.input[l.pos:])
	l.pos += width
	return decoded
}
// emit appends a token of kind t whose value is the pending text, unchanged.
func (l *lexer) emit(t tokenKind) {
	identity := func(raw string) (string, error) { return raw, nil }
	// The identity transform never fails, so the error is always nil.
	_ = l.emitProcessed(t, identity)
}
// emitProcessed appends a token of kind t whose value is the pending text
// after transformation by f, then discards the pending text. The token is
// appended even when f fails; the error from f is returned to the caller.
func (l *lexer) emitProcessed(t tokenKind, f func(string) (string, error)) error {
	raw := l.input[:l.pos]
	processed, err := f(raw)
	tok := token{kind: t, value: processed}
	l.tokens = append(l.tokens, tok)
	l.ignore()
	return err
}
// run executes the lexer on the given input and returns the resulting tokens
// along with any error recorded while lexing. The returned slice reuses the
// lexer's internal storage and is only valid until the next call to run.
func (l *lexer) run(input string) ([]token, error) {
	l.input = input
	l.tokens = l.tokens[:0]
	l.pos = 0
	// Clear any error left over from a previous run. Without this, a lexer
	// that failed once would report the stale error for every later input.
	l.err = nil
	for state := lexText; state != nil; {
		state = state(l)
	}
	return l.tokens, l.err
}
// parser implements a simple recursive descent parser for font family fallback
// expressions.
type parser struct {
// faces accumulates the family names parsed from the current rule. parse
// truncates it rather than reallocating, so its storage is reused.
faces []string
// lexer tokenizes the rule passed to parse.
lexer lexer
// tokens holds the tokens that parseList has not yet consumed.
tokens []token
}
// parse the provided rule and return the extracted font families. The returned families
// are valid only until the next call to parse. If lexing or parsing fails, a nil slice
// and an error describing the failure are returned instead.
func (p *parser) parse(rule string) ([]string, error) {
	var err error
	p.tokens, err = p.lexer.run(rule)
	if err != nil {
		return nil, err
	}
	p.faces = p.faces[:0]
	// Run parseList before reading p.faces. Go does not specify the order in
	// which a plain operand and a function call are evaluated within a single
	// return statement, and parseList appends to p.faces.
	if err := p.parseList(); err != nil {
		return nil, err
	}
	return p.faces, nil
}
// parseList implements the production:
//
// LIST ::= <FACE> <COMMA> <LIST> | <FACE>
//
// It consumes tokens from the front of p.tokens and appends each family name
// it finds to p.faces. It returns an error if the token stream does not match
// the production.
func (p *parser) parseList() error {
	if len(p.tokens) == 0 {
		return fmt.Errorf("expected family name, got EOF")
	}
	head := p.tokens[0]
	if head.kind != tokenStr {
		return fmt.Errorf("expected family name, got %s", head)
	}
	p.faces = append(p.faces, head.value)
	p.tokens = p.tokens[1:]

	// The lexer terminates its output with tokenEOF. Treat a token stream
	// that simply runs out as the end of the list rather than indexing past
	// the end of the slice.
	if len(p.tokens) == 0 {
		return nil
	}
	switch next := p.tokens[0]; next.kind {
	case tokenEOF:
		return nil
	case tokenComma:
		p.tokens = p.tokens[1:]
		return p.parseList()
	default:
		return fmt.Errorf("unexpected token %s", next)
	}
}