mirror of
https://git.sr.ht/~eliasnaur/gio
synced 2026-07-01 07:35:40 +00:00
6384ab6087
This commit adds a parser for a simple domain-specific language that can express a comma-delimited list of font families within a string. I chose to encode families in this way because the string can be used as an efficient hash key in a way that a slice of families cannot. Similarly, using a slice of families would require allocations on the caller side.

The particular format was chosen to allow lists to be written with as little fanfare as possible. This is why quotation marks are completely optional. It's easy to read:

    Times New Roman, Georgia, serif

Why force the user to type this (this will parse the same):

    "Times New Roman", "Georgia", "serif"

I've tried to handle edge cases exhaustively. Commas are legal within quotes. Within a quoted string, you can escape instances of the surrounding quote with a backslash, and can escape literal backslashes by adding another backslash.

I wrote the lexer/parser by hand, and I hope that they're both easy to understand and (if need be) extend.

A side effect of the DSL I've chosen (and part of my reasoning for allowing both single and double quoted strings) is that CSS font-family rules will generally be valid font family lists in Gio. This means the syntax is already familiar to users coming from other technologies, and that you can copy from a web-based application to get a similar font stack in Gio.

Fixes: https://todo.sr.ht/~eliasnaur/gio/317

Signed-off-by: Chris Waldon <christopher.waldon.dev@gmail.com>
247 lines
4.7 KiB
Go
247 lines
4.7 KiB
Go
package text
|
|
|
|
import (
|
|
"fmt"
|
|
"strings"
|
|
"unicode"
|
|
"unicode/utf8"
|
|
)
|
|
|
|
// tokenKind identifies the category of a lexed token.
type tokenKind uint8

// The kinds of token produced by the lexer.
const (
	// tokenStr is a font family name, written either bare or quoted.
	tokenStr tokenKind = iota
	// tokenComma is the separator between two family names.
	tokenComma
	// tokenEOF marks the end of the input.
	tokenEOF
)
|
|
|
|
type token struct {
|
|
kind tokenKind
|
|
value string
|
|
}
|
|
|
|
func (t token) String() string {
|
|
switch t.kind {
|
|
case tokenStr:
|
|
return t.value
|
|
case tokenComma:
|
|
return ","
|
|
case tokenEOF:
|
|
return "EOF"
|
|
default:
|
|
return "unknown"
|
|
}
|
|
}
|
|
|
|
// lexState is one state of the lexer's state machine. It consumes input
// from l and returns the state to run next, or nil to stop lexing.
type lexState func(l *lexer) lexState
|
|
|
|
func lexText(l *lexer) lexState {
|
|
for {
|
|
switch r := l.next(); {
|
|
case r == -1:
|
|
l.ignore()
|
|
l.emit(tokenEOF)
|
|
return nil
|
|
case unicode.IsSpace(r):
|
|
continue
|
|
case r == ',':
|
|
l.ignore()
|
|
l.emit(tokenComma)
|
|
case r == '"':
|
|
l.ignore()
|
|
return lexDquote
|
|
case r == '\'':
|
|
l.ignore()
|
|
return lexSquote
|
|
default:
|
|
return lexBareStr
|
|
}
|
|
}
|
|
}
|
|
|
|
func lexBareStr(l *lexer) lexState {
|
|
defer l.emitProcessed(tokenStr, func(s string) (string, error) {
|
|
return strings.TrimSpace(s), nil
|
|
})
|
|
for {
|
|
if strings.HasPrefix(l.input[l.pos:], `,`) {
|
|
return lexText
|
|
}
|
|
switch r := l.next(); {
|
|
case r == -1:
|
|
return lexText
|
|
}
|
|
}
|
|
}
|
|
|
|
func lexDquote(l *lexer) lexState {
|
|
return lexQuote(l, `"`)
|
|
}
|
|
|
|
func lexSquote(l *lexer) lexState {
|
|
return lexQuote(l, `'`)
|
|
}
|
|
|
|
// unescape converts the raw contents of a quoted family name (without the
// surrounding quotes) into the family it denotes.
//
// Leading and trailing whitespace is discarded, while whitespace between
// words is kept verbatim. A backslash must be followed by another
// backslash or by quote; the pair is replaced by the escaped character.
// Any other escape sequence, including a backslash at the very end of s,
// produces an error.
func unescape(s string, quote rune) (string, error) {
	var b strings.Builder
	b.Grow(len(s))

	seenText := false
	// gap is the byte offset at which the pending run of inter-word
	// whitespace starts, or -1 when there is none. Tracking an offset
	// avoids buffering the whitespace in a second builder.
	gap := -1
	for i := 0; i < len(s); {
		start := i
		r, sz := utf8.DecodeRuneInString(s[i:])
		i += sz
		if unicode.IsSpace(r) {
			// Leading whitespace is dropped; later whitespace is held
			// back until we know more text follows it.
			if seenText && gap < 0 {
				gap = start
			}
			continue
		}
		seenText = true
		if gap >= 0 {
			b.WriteString(s[gap:start])
			gap = -1
		}
		if r != '\\' {
			b.WriteRune(r)
			continue
		}

		esc, escSz := utf8.DecodeRuneInString(s[i:])
		if escSz == 0 {
			// Nothing follows the backslash; report that explicitly
			// instead of formatting the decoder's error rune.
			return "", fmt.Errorf("dangling backslash at end of family name")
		}
		i += escSz
		if esc != '\\' && esc != quote {
			return "", fmt.Errorf("illegal escape sequence \\%c", esc)
		}
		b.WriteRune(esc)
	}
	return b.String(), nil
}
|
|
|
|
func lexQuote(l *lexer, mark string) lexState {
|
|
escaping := false
|
|
for {
|
|
if isQuote := strings.HasPrefix(l.input[l.pos:], mark); isQuote && !escaping {
|
|
err := l.emitProcessed(tokenStr, func(s string) (string, error) {
|
|
return unescape(s, []rune(mark)[0])
|
|
})
|
|
if err != nil {
|
|
l.err = err
|
|
return nil
|
|
}
|
|
l.next()
|
|
l.ignore()
|
|
return lexText
|
|
}
|
|
escaped := escaping
|
|
switch r := l.next(); {
|
|
case r == -1:
|
|
l.err = fmt.Errorf("unexpected EOF while parsing %s-quoted family", mark)
|
|
return lexText
|
|
case r == '\\':
|
|
if !escaped {
|
|
escaping = true
|
|
}
|
|
}
|
|
if escaped {
|
|
escaping = false
|
|
}
|
|
}
|
|
}
|
|
|
|
type lexer struct {
|
|
input string
|
|
pos int
|
|
tokens []token
|
|
err error
|
|
}
|
|
|
|
func (l *lexer) ignore() {
|
|
l.input = l.input[l.pos:]
|
|
l.pos = 0
|
|
}
|
|
|
|
// next decodes the next rune in the input and returns it.
|
|
func (l *lexer) next() int32 {
|
|
if l.pos >= len(l.input) {
|
|
return -1
|
|
}
|
|
r, w := utf8.DecodeRuneInString(l.input[l.pos:])
|
|
l.pos += w
|
|
return r
|
|
}
|
|
|
|
// emit adds a token of the given kind.
|
|
func (l *lexer) emit(t tokenKind) {
|
|
l.emitProcessed(t, func(s string) (string, error) { return s, nil })
|
|
}
|
|
|
|
// emitProcessed adds a token of the given kind, but transforms its value
|
|
// with the provided closure first.
|
|
func (l *lexer) emitProcessed(t tokenKind, f func(string) (string, error)) error {
|
|
val, err := f(l.input[:l.pos])
|
|
l.tokens = append(l.tokens, token{
|
|
kind: t,
|
|
value: val,
|
|
})
|
|
l.ignore()
|
|
return err
|
|
}
|
|
|
|
// run executes the lexer on the given input.
|
|
func (l *lexer) run(input string) ([]token, error) {
|
|
l.input = input
|
|
l.tokens = l.tokens[:0]
|
|
l.pos = 0
|
|
for state := lexText; state != nil; {
|
|
state = state(l)
|
|
}
|
|
return l.tokens, l.err
|
|
}
|
|
|
|
// parser implements a simple recursive descent parser for font family fallback
|
|
// expressions.
|
|
type parser struct {
|
|
faces []string
|
|
lexer lexer
|
|
tokens []token
|
|
}
|
|
|
|
// parse the provided rule and return the extracted font families. The returned families
|
|
// are valid only until the next call to parse. If parsing fails, an error describing the
|
|
// failure is returned instead.
|
|
func (p *parser) parse(rule string) ([]string, error) {
|
|
var err error
|
|
p.tokens, err = p.lexer.run(rule)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
p.faces = p.faces[:0]
|
|
return p.faces, p.parseList()
|
|
}
|
|
|
|
// parse implements the production:
|
|
//
|
|
// LIST ::= <FACE> <COMMA> <LIST> | <FACE>
|
|
func (p *parser) parseList() error {
|
|
if len(p.tokens) < 0 {
|
|
return fmt.Errorf("expected family name, got EOF")
|
|
}
|
|
if head := p.tokens[0]; head.kind != tokenStr {
|
|
return fmt.Errorf("expected family name, got %s", head)
|
|
} else {
|
|
p.faces = append(p.faces, head.value)
|
|
p.tokens = p.tokens[1:]
|
|
}
|
|
|
|
switch head := p.tokens[0]; head.kind {
|
|
case tokenEOF:
|
|
return nil
|
|
case tokenComma:
|
|
p.tokens = p.tokens[1:]
|
|
return p.parseList()
|
|
default:
|
|
return fmt.Errorf("unexpected token %s", head)
|
|
}
|
|
}
|