package text

import (
	"fmt"
	"strings"
	"unicode"
	"unicode/utf8"
)

// tokenKind identifies the type of a lexed token.
type tokenKind uint8

const (
	tokenStr tokenKind = iota
	tokenComma
	tokenEOF
)

// token is a single lexed element of a font family fallback expression.
type token struct {
	kind  tokenKind
	value string
}

func (t token) String() string {
	switch t.kind {
	case tokenStr:
		return t.value
	case tokenComma:
		return ","
	case tokenEOF:
		return "EOF"
	default:
		return "unknown"
	}
}

// lexState is one state in the lexer's state machine. Each state consumes
// input and returns the next state, or nil to terminate lexing.
type lexState func(*lexer) lexState

// lexText is the top-level state. It skips whitespace and dispatches to the
// appropriate state for commas and for quoted or bare family names.
func lexText(l *lexer) lexState {
	for {
		switch r := l.next(); {
		case r == -1:
			// Discard any trailing whitespace before emitting EOF.
			l.ignore()
			l.emit(tokenEOF)
			return nil
		case unicode.IsSpace(r):
			continue
		case r == ',':
			// The token kind carries all the information, so drop
			// the comma text before emitting.
			l.ignore()
			l.emit(tokenComma)
		case r == '"':
			l.ignore()
			return lexDquote
		case r == '\'':
			l.ignore()
			return lexSquote
		default:
			return lexBareStr
		}
	}
}

// lexBareStr lexes an unquoted family name, which runs until a comma or the
// end of the input. The deferred emit cannot fail because its closure only
// trims whitespace, so discarding the returned error is safe.
func lexBareStr(l *lexer) lexState {
	defer l.emitProcessed(tokenStr, func(s string) (string, error) {
		return strings.TrimSpace(s), nil
	})
	for {
		if strings.HasPrefix(l.input[l.pos:], `,`) {
			return lexText
		}
		if l.next() == -1 {
			return lexText
		}
	}
}

// lexDquote lexes a double-quoted family name.
func lexDquote(l *lexer) lexState {
	return lexQuote(l, `"`)
}

// lexSquote lexes a single-quoted family name.
func lexSquote(l *lexer) lexState {
	return lexQuote(l, `'`)
}

// unescape resolves the escape sequences \\ and \<quote> in s and strips
// leading and trailing whitespace. Any other escape sequence is an error.
func unescape(s string, quote rune) (string, error) {
	var b strings.Builder
	hitNonSpace := false
	// wb buffers runs of whitespace so that whitespace between words is
	// preserved while leading and trailing whitespace is dropped.
	var wb strings.Builder
	for i := 0; i < len(s); {
		r, sz := utf8.DecodeRuneInString(s[i:])
		i += sz
		if unicode.IsSpace(r) {
			if !hitNonSpace {
				continue
			}
			wb.WriteRune(r)
			continue
		}
		hitNonSpace = true
		// We're not looking at whitespace, so flush any buffered
		// whitespace characters from the gap between words.
		b.WriteString(wb.String())
		wb.Reset()
		if r == '\\' {
			r, sz := utf8.DecodeRuneInString(s[i:])
			i += sz
			switch r {
			case '\\', quote:
				b.WriteRune(r)
			default:
				return "", fmt.Errorf("illegal escape sequence \\%c", r)
			}
		} else {
			b.WriteRune(r)
		}
	}
	return b.String(), nil
}
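
// exampleUnescape is a hypothetical helper, not part of the original code:
// a minimal sketch of unescape's behavior on illustrative inputs, assuming
// it is called the way lexQuote calls it, on the text between quote marks.
func exampleUnescape() {
	s, err := unescape(`  Noto \"Sans\"  `, '"')
	fmt.Println(s, err) // Noto "Sans" <nil>: outer whitespace trimmed, \" resolved.

	_, err = unescape(`bad \n escape`, '"')
	fmt.Println(err) // illegal escape sequence \n: only \\ and the quote may be escaped.
}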

// lexQuote lexes the interior of a quoted family name until it reaches the
// closing quote mark, tracking backslash escapes so that an escaped quote
// does not terminate the string.
func lexQuote(l *lexer, mark string) lexState {
	escaping := false
	for {
		if isQuote := strings.HasPrefix(l.input[l.pos:], mark); isQuote && !escaping {
			err := l.emitProcessed(tokenStr, func(s string) (string, error) {
				return unescape(s, []rune(mark)[0])
			})
			if err != nil {
				l.err = err
				return nil
			}
			// Consume and discard the closing quote mark.
			l.next()
			l.ignore()
			return lexText
		}
		escaped := escaping
		switch r := l.next(); {
		case r == -1:
			l.err = fmt.Errorf("unexpected EOF while parsing %s-quoted family", mark)
			return lexText
		case r == '\\':
			// A backslash escapes the next rune unless it is itself
			// escaped.
			if !escaped {
				escaping = true
			}
		}
		if escaped {
			escaping = false
		}
	}
}
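
// Illustrative trace (not from the original source): in the input it\'s'
// as seen after lexSquote has consumed the opening quote, the backslash sets
// escaping, so the ' that follows it does not terminate the string; the
// final quote does, and unescape rewrites \' to ' in the emitted token.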

// lexer splits a font family fallback expression into tokens.
type lexer struct {
	input  string
	pos    int
	tokens []token
	err    error
}

// ignore discards everything consumed so far.
func (l *lexer) ignore() {
	l.input = l.input[l.pos:]
	l.pos = 0
}

// next decodes the next rune in the input, advances past it, and returns it.
// It returns -1 at the end of the input.
func (l *lexer) next() rune {
	if l.pos >= len(l.input) {
		return -1
	}
	r, w := utf8.DecodeRuneInString(l.input[l.pos:])
	l.pos += w
	return r
}

// emit adds a token of the given kind.
func (l *lexer) emit(t tokenKind) {
	l.emitProcessed(t, func(s string) (string, error) { return s, nil })
}

// emitProcessed adds a token of the given kind, but transforms its value
// with the provided closure first. The consumed input is discarded whether
// or not the closure succeeds, and the closure's error is returned.
func (l *lexer) emitProcessed(t tokenKind, f func(string) (string, error)) error {
	val, err := f(l.input[:l.pos])
	l.tokens = append(l.tokens, token{
		kind:  t,
		value: val,
	})
	l.ignore()
	return err
}

// run executes the lexer on the given input and returns the resulting tokens.
// All lexer state is reset so the lexer can be reused, and the returned slice
// is only valid until the next call to run.
func (l *lexer) run(input string) ([]token, error) {
	l.input = input
	l.tokens = l.tokens[:0]
	l.pos = 0
	l.err = nil
	for state := lexText; state != nil; {
		state = state(l)
	}
	return l.tokens, l.err
}
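
// exampleLexer is a hypothetical helper, not part of the original code: a
// minimal sketch showing the token stream for an illustrative fallback rule.
// Each family becomes a tokenStr, separators become tokenComma, and the
// stream always ends with tokenEOF.
func exampleLexer() {
	var l lexer
	tokens, err := l.run(`Helvetica, "Comic Sans", monospace`)
	if err != nil {
		panic(err)
	}
	fmt.Println(tokens) // [Helvetica , Comic Sans , monospace EOF]
}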

// parser implements a simple recursive descent parser for font family
// fallback expressions.
type parser struct {
	faces  []string
	lexer  lexer
	tokens []token
}

// parse the provided rule and return the extracted font families. The
// returned families are valid only until the next call to parse. If parsing
// fails, an error describing the failure is returned instead.
func (p *parser) parse(rule string) ([]string, error) {
	var err error
	p.tokens, err = p.lexer.run(rule)
	if err != nil {
		return nil, err
	}
	p.faces = p.faces[:0]
	if err := p.parseList(); err != nil {
		return nil, err
	}
	return p.faces, nil
}

// parseList implements the production:
//
//	LIST ::= <FACE> <COMMA> <LIST> | <FACE>
func (p *parser) parseList() error {
	if len(p.tokens) == 0 {
		return fmt.Errorf("expected family name, got EOF")
	}
	head := p.tokens[0]
	if head.kind != tokenStr {
		return fmt.Errorf("expected family name, got %s", head)
	}
	p.faces = append(p.faces, head.value)
	p.tokens = p.tokens[1:]
	// The lexer always terminates the stream with tokenEOF, so there is
	// at least one token left after consuming the family name.
	switch head := p.tokens[0]; head.kind {
	case tokenEOF:
		return nil
	case tokenComma:
		p.tokens = p.tokens[1:]
		return p.parseList()
	default:
		return fmt.Errorf("unexpected token %s", head)
	}
}
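
// exampleParse is a hypothetical helper, not part of the original code: a
// minimal usage sketch. The rule string is illustrative; parsing it yields
// the family names in priority order, with quotes and escapes resolved.
func exampleParse() {
	var p parser
	faces, err := p.parse(`"Comic Sans", 'DejaVu Serif', monospace`)
	if err != nil {
		panic(err)
	}
	fmt.Println(faces) // [Comic Sans DejaVu Serif monospace]
}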