From 92cd9d13a4da2109aef2a2fb3eead5bfe9c0439a Mon Sep 17 00:00:00 2001 From: mdm-code Date: Mon, 9 Dec 2024 23:01:58 +0100 Subject: [PATCH 1/5] Simplify parser isAtEnd pointer rec. function --- internal/parser/parser.go | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/internal/parser/parser.go b/internal/parser/parser.go index fe3d248..af5be84 100644 --- a/internal/parser/parser.go +++ b/internal/parser/parser.go @@ -171,10 +171,7 @@ func (p *Parser) advance() lexer.Token { } func (p *Parser) isAtEnd() bool { - if p.current > len(p.buffer)-1 { - return true - } - return false + return p.current > len(p.buffer)-1 } func (p *Parser) previous() lexer.Token { From c3c1413c9539c4e7ecb0960fae44fecf21eb4aec Mon Sep 17 00:00:00 2001 From: mdm-code Date: Tue, 17 Dec 2024 00:15:29 +0100 Subject: [PATCH 2/5] Supported short and long unicode notation This is a rough implementation that works. It will be iterated over before adding to main. --- internal/lexer/lexer.go | 1 - internal/lexer/token.go | 31 +++++++++++++++++++++++++++---- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/internal/lexer/lexer.go b/internal/lexer/lexer.go index e37fac5..7f81945 100644 --- a/internal/lexer/lexer.go +++ b/internal/lexer/lexer.go @@ -169,7 +169,6 @@ func (l *Lexer) scanBareString() bool { } l.setToken(String, start, l.offset) return true - } func (l *Lexer) scanString() bool { diff --git a/internal/lexer/token.go b/internal/lexer/token.go index 90e11d8..25f9402 100644 --- a/internal/lexer/token.go +++ b/internal/lexer/token.go @@ -1,6 +1,7 @@ package lexer import ( + "strconv" "strings" "github.com/mdm-code/scanner" @@ -87,19 +88,41 @@ func (t Token) reprString() string { } chars := make([]string, 0, size) for head != end { - token := (*t.Buffer)[head] // NOTE: For quoted strings, check if the current token initiates an // escape sequence and there is at least a single token left to look up // followed by the terminating quote character. 
Bare strings may not - // contain escape sequence characters. - if token.Rune == '\\' && head+2 != end { + // contain escape sequence characters, because backslash is a + // disallowed character in bare strings. + token := (*t.Buffer)[head] + if token.Rune == '\\' && head+1 != end { v, ok := escapeSequenceMap[(*t.Buffer)[head+1].Rune] if ok { - token = (*t.Buffer)[head] head += 2 chars = append(chars, v) continue } + if (*t.Buffer)[head+1].Rune == 'u' && head+5 != end { + rr := []rune{} + for _, t := range (*t.Buffer)[head+2 : head+6] { + rr = append(rr, t.Rune) + } + i, _ := strconv.ParseInt(string(rr), 16, 32) + r := rune(i) + head += 6 + chars = append(chars, string(r)) + continue + } + if (*t.Buffer)[head+1].Rune == 'U' && head+9 != end { + rr := []rune{} + for _, t := range (*t.Buffer)[head+2 : head+10] { + rr = append(rr, t.Rune) + } + i, _ := strconv.ParseInt(string(rr), 16, 64) + r := rune(i) + head += 10 + chars = append(chars, string(r)) + continue + } } chars = append(chars, string(token.Rune)) head++ From f34b0e645a7971b776313492aa6e30f53b4fe31e Mon Sep 17 00:00:00 2001 From: mdm-code Date: Wed, 18 Dec 2024 00:06:42 +0100 Subject: [PATCH 3/5] Implemented unicode escape chars in user input --- internal/lexer/token.go | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/internal/lexer/token.go b/internal/lexer/token.go index 25f9402..6758936 100644 --- a/internal/lexer/token.go +++ b/internal/lexer/token.go @@ -102,25 +102,15 @@ func (t Token) reprString() string { continue } if (*t.Buffer)[head+1].Rune == 'u' && head+5 != end { - rr := []rune{} - for _, t := range (*t.Buffer)[head+2 : head+6] { - rr = append(rr, t.Rune) - } - i, _ := strconv.ParseInt(string(rr), 16, 32) - r := rune(i) + char := t.parseUnicode(head, 2, 6) head += 6 - chars = append(chars, string(r)) + chars = append(chars, char) continue } if (*t.Buffer)[head+1].Rune == 'U' && head+9 != end { - rr := []rune{} - for _, t := range 
(*t.Buffer)[head+2 : head+10] { - rr = append(rr, t.Rune) - } - i, _ := strconv.ParseInt(string(rr), 16, 64) - r := rune(i) + char := t.parseUnicode(head, 2, 10) head += 10 - chars = append(chars, string(r)) + chars = append(chars, char) continue } } @@ -133,6 +123,18 @@ func (t Token) reprString() string { return strings.Join(chars, "") } +func (t Token) parseUnicode(head, start, end int) string { + size := end - start + rr := make([]rune, 0, size) + for _, t := range (*t.Buffer)[head+start : head+end] { + rr = append(rr, t.Rune) + } + i, _ := strconv.ParseInt(string(rr), 16, 32) + r := rune(i) // NOTE: Make sure it fits into rune/int32. + result := string(r) + return result +} + func (t Token) reprDefault() string { end := t.End size := t.End - t.Start From 8aac0e6b43ba8fbd81928e2a2209c9d7125f6809 Mon Sep 17 00:00:00 2001 From: mdm-code Date: Wed, 18 Dec 2024 00:07:50 +0100 Subject: [PATCH 4/5] Added tests covering unicode support --- internal/lexer/token_test.go | 58 ++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/internal/lexer/token_test.go b/internal/lexer/token_test.go index fd5d55e..3d02cc6 100644 --- a/internal/lexer/token_test.go +++ b/internal/lexer/token_test.go @@ -143,6 +143,64 @@ func TestLexeme(t *testing.T) { }, want: "foo\"", }, + { + name: "escaped-unicode-short", + token: Token{ + Buffer: &[]scanner.Token{ + {Pos: scanner.Pos{Rune: '"'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '\\'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: 'u'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '3'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '0'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: 'B'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: 'F'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '\\'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: 'u'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '3'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '0'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: 'c'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: 'f'}, Buffer: nil}, + {Pos: 
scanner.Pos{Rune: '"'}, Buffer: nil}, + }, + Type: String, + Start: 0, + End: 14, + }, + want: "タハ", + }, + { + name: "escaped-unicode-long", + token: Token{ + Buffer: &[]scanner.Token{ + {Pos: scanner.Pos{Rune: '"'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '\\'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: 'U'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '0'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '0'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '0'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '1'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: 'F'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '6'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '3'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '1'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '\\'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: 'U'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '0'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '0'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '0'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '1'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: 'f'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '6'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '4'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: 'f'}, Buffer: nil}, + {Pos: scanner.Pos{Rune: '"'}, Buffer: nil}, + }, + Type: String, + Start: 0, + End: 22, + }, + want: "😱🙏", + }, } for _, c := range cases { t.Run(c.name, func(t *testing.T) { From 83cf329069022613d1aeb8605845e0c4b992b323 Mon Sep 17 00:00:00 2001 From: mdm-code Date: Wed, 18 Dec 2024 00:12:57 +0100 Subject: [PATCH 5/5] Included unicode escape seq explanation in readme --- README.md | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 9c52da3..1123ee4 100644 --- a/README.md +++ b/README.md @@ -88,19 +88,19 @@ that these should be used such that they do not interfere with shell quoting. Commonly found characters are mapped onto often used escaped sequences. 
These can be used in quoted strings mostly the same way one would use them in a TOML file though the specification for the TOML language advises against the use of -funky keys unless there is a good reason to use them. Tq does not support -Unicode escape sequences in quoted strings as of today, but there are plans to -add it in the future. +funky keys unless there is a good reason to use them. ```txt -\b - backspace -\t - tab -\n - linefeed -\f - form feed -\r - carriage return -\" - double quote -\' - single quote -\\ - backslash +\b - backspace +\t - tab +\n - linefeed +\f - form feed +\r - carriage return +\" - double quote +\' - single quote +\\ - backslash +\uhhhh - short 16-bit hexadecimal form +\Uhhhhhhhh - long 32-bit hexadecimal form ```