diff --git a/README.md b/README.md index 9c52da3..1123ee4 100644 --- a/README.md +++ b/README.md @@ -88,19 +88,19 @@ that these should be used such that they do not interfere with shell quoting. Commonly found characters are mapped onto often used escaped sequences. These can be used in quoted strings mostly the same way one would use them in a TOML file though the specification for the TOML language advises against the use of -funky keys unless there is a good reason to use them. Tq does not support -Unicode escape sequences in quoted strings as of today, but there are plans to -add it in the future. +funky keys unless there is a good reason to use them. ```txt -\b - backspace -\t - tab -\n - linefeed -\f - form feed -\r - carriage return -\" - double quote -\' - single quote -\\ - backslash +\b - backspace +\t - tab +\n - linefeed +\f - form feed +\r - carriage return +\" - double quote +\' - single quote +\\ - backslash +\uhhhh - short 16-bit hexadecimal form +\Uhhhhhhhh - long 32-bit hexadecimal form ``` diff --git a/internal/lexer/lexer.go b/internal/lexer/lexer.go index e37fac5..7f81945 100644 --- a/internal/lexer/lexer.go +++ b/internal/lexer/lexer.go @@ -169,7 +169,6 @@ func (l *Lexer) scanBareString() bool { } l.setToken(String, start, l.offset) return true - } func (l *Lexer) scanString() bool { diff --git a/internal/lexer/token.go b/internal/lexer/token.go index 90e11d8..6758936 100644 --- a/internal/lexer/token.go +++ b/internal/lexer/token.go @@ -1,6 +1,7 @@ package lexer import ( + "strconv" "strings" "github.com/mdm-code/scanner" @@ -87,19 +88,31 @@ func (t Token) reprString() string { } chars := make([]string, 0, size) for head != end { - token := (*t.Buffer)[head] // NOTE: For quoted strings, check if the current token initiates an // escape sequence and there is at least a single token left to look up // followed by the terminating quote character. Bare strings may not - // contain escape sequence characters. 
- if token.Rune == '\\' && head+2 != end { + // contain escape sequence characters, because backslash is a + // disallowed character in bare strings. + token := (*t.Buffer)[head] + if token.Rune == '\\' && head+1 != end { v, ok := escapeSequenceMap[(*t.Buffer)[head+1].Rune] if ok { - token = (*t.Buffer)[head] head += 2 chars = append(chars, v) continue } + if (*t.Buffer)[head+1].Rune == 'u' && head+5 != end { + char := t.parseUnicode(head, 2, 6) + head += 6 + chars = append(chars, char) + continue + } + if (*t.Buffer)[head+1].Rune == 'U' && head+9 != end { + char := t.parseUnicode(head, 2, 10) + head += 10 + chars = append(chars, char) + continue + } } chars = append(chars, string(token.Rune)) head++ @@ -110,6 +123,18 @@ func (t Token) reprString() string { return strings.Join(chars, "") } +func (t Token) parseUnicode(head, start, end int) string { + size := end - start + rr := make([]rune, 0, size) + for _, t := range (*t.Buffer)[head+start : head+end] { + rr = append(rr, t.Rune) + } + i, _ := strconv.ParseInt(string(rr), 16, 32) + r := rune(i) // NOTE: Make sure it fits into rune/int32. 
+ result := string(r) + return result +} + func (t Token) reprDefault() string { end := t.End size := t.End - t.Start diff --git a/internal/lexer/token_test.go b/internal/lexer/token_test.go index fd5d55e..3d02cc6 100644 --- a/internal/lexer/token_test.go +++ b/internal/lexer/token_test.go @@ -143,6 +143,64 @@ func TestLexeme(t *testing.T) { }, want: "foo\"", }, + { + name: "escaped-unicode-short", + token: Token{ + Buffer: &[]scanner.Token{ + {Pos: scanner.Pos{Rune: '"'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '\\'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: 'u'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '3'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '0'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: 'B'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: 'F'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '\\'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: 'u'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '3'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '0'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: 'c'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: 'f'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '"'}, Buffer: nil}, + }, + Type: String, + Start: 0, + End: 14, + }, + want: "タハ", + }, + { + name: "escaped-unicode-long", + token: Token{ + Buffer: &[]scanner.Token{ + {Pos: scanner.Pos{Rune: '"'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '\\'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: 'U'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '0'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '0'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '0'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '1'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: 'F'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '6'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '3'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '1'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '\\'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: 'U'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '0'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '0'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '0'}, Buffer: nil}, + {Pos: 
scanner.Pos{Rune: '1'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: 'f'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '6'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '4'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: 'f'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '"'}, Buffer: nil}, + }, + Type: String, + Start: 0, + End: 22, + }, + want: "😱🙏", + }, } for _, c := range cases { t.Run(c.name, func(t *testing.T) { diff --git a/internal/parser/parser.go b/internal/parser/parser.go index fe3d248..af5be84 100644 --- a/internal/parser/parser.go +++ b/internal/parser/parser.go @@ -171,10 +171,7 @@ func (p *Parser) advance() lexer.Token { } func (p *Parser) isAtEnd() bool { - if p.current > len(p.buffer)-1 { - return true - } - return false + return p.current > len(p.buffer)-1 } func (p *Parser) previous() lexer.Token {