diff --git a/go.mod b/go.mod index 73352a4..ccc5cb4 100644 --- a/go.mod +++ b/go.mod @@ -5,7 +5,8 @@ go 1.18 require ( github.com/koykov/bytealg v1.0.7 github.com/koykov/byteconv v1.0.1 - github.com/koykov/vector v1.2.7 + github.com/koykov/simd v0.0.9 + github.com/koykov/vector v1.2.9 ) require ( @@ -13,5 +14,5 @@ require ( github.com/koykov/byteseq v1.0.2 // indirect github.com/koykov/entry v1.0.2 // indirect github.com/koykov/indirect v1.0.1 // indirect - github.com/koykov/openrt v0.0.0-20240728195600-ccc3242946f9 // indirect + golang.org/x/sys v0.30.0 // indirect ) diff --git a/go.sum b/go.sum index cc93de1..5d9690f 100644 --- a/go.sum +++ b/go.sum @@ -10,7 +10,9 @@ github.com/koykov/entry v1.0.2 h1:6mZJUt4POGQHRPJ9Iw4GyIZJi9wj2lews3yRRNHfTY4= github.com/koykov/entry v1.0.2/go.mod h1:WmCy/YM0sPb4ETL9wYY0OJwO89KJ7qeQPGOwLUG4rZU= github.com/koykov/indirect v1.0.1 h1:1veVipIWBeklFHMvzuwhL82X5eDaJzN+hPeVGRvu22Y= github.com/koykov/indirect v1.0.1/go.mod h1:2qWC0hrIHIexlKaqPA0VWEa0s2V/qxxNJv7XPncnh2I= -github.com/koykov/openrt v0.0.0-20240728195600-ccc3242946f9 h1:BZvoBH5eeWfqd74OnY4G3u81zPl52mYg2rF9I2dM0cE= -github.com/koykov/openrt v0.0.0-20240728195600-ccc3242946f9/go.mod h1:y8Xa99HTBmthCilUUW36IZJd5SP9Rb+W8S9CJaauyU8= -github.com/koykov/vector v1.2.7 h1:DmZYbqY+6pslQLuRxdn8W6XiAnWriQEqIbwAq8F5QLU= -github.com/koykov/vector v1.2.7/go.mod h1:z5OScTfTaFdv6tHFoNqyuXslw5YIkKkGORc5RiAlDKk= +github.com/koykov/simd v0.0.9 h1:ooXO/cEcIcDGmcHnPSnWRCe+8dS9z9kMUM4cc6Gwr0g= +github.com/koykov/simd v0.0.9/go.mod h1:sxZxJ0LR+ZMZ85Gg6Ujd4ABNst4bNf9ylh894fpohp8= +github.com/koykov/vector v1.2.9 h1:1yfqMvrtkjpX0Y78gRym28oTd1iu0WBdo1JCxotPlz4= +github.com/koykov/vector v1.2.9/go.mod h1:yF41B972ZGEtbHhOLza3KS5WBzHVeALV/kGThCWDTN8= +golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc= +golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= diff --git a/parser.go b/parser.go index e9264b2..f7083c3 100644 --- a/parser.go +++ b/parser.go @@ 
-5,6 +5,7 @@ import ( "errors" "github.com/koykov/bytealg" + "github.com/koykov/simd/indexbyte" "github.com/koykov/vector" ) @@ -81,31 +82,13 @@ func (vec *Vector) parseGeneric(depth, offset int, node *vector.Node) (int, erro // Save offset of string value. node.Value().SetAddr(srcp, n).SetOffset(offset + 1) // Get index of end of string value. - e := bytealg.IndexByteAtBytes(src, '"', offset+1) + e := indexbyte.IndexAtNE(src, '"', offset+1) if e < 0 { return n, vector.ErrUnexpEOS } node.Value().SetBit(flagEscape, true) // Always mark string as escaped to avoid double indexing. - if src[e-1] != '\\' { - // Good case - quote isn't escaped. - node.Value().SetLen(e - offset - 1) - offset = e + 1 - } else { - // Walk over quotas and look for unescaped one. - for i := e; i < n; { - i = bytealg.IndexByteAtBytes(src, '"', i+1) - if i < 0 { - e = n - 1 - break - } - e = i - if src[e-1] != '\\' { - break - } - } - node.Value().SetLen(e - offset - 1) - offset = e + 1 - } + node.Value().SetLen(e - offset - 1) + offset = e + 1 case isDigit(src[offset]): // Check number node. if offset < n { @@ -165,7 +148,7 @@ func (vec *Vector) parseObject(depth, offset int, node *vector.Node) (int, error offset++ break } - if offset, eof = bytealg.SkipBytesFmt4(src, offset); eof { + if offset, eof = skipfmt(src, offset); eof { return offset, vector.ErrUnexpEOF } // Parse key. @@ -178,33 +161,14 @@ func (vec *Vector) parseObject(depth, offset int, node *vector.Node) (int, error child, i := vec.AcquireChildWithType(node, depth, vector.TypeUnknown) // Fill up key's offset and length. child.Key().TakeAddr(src).SetOffset(offset) - e := bytealg.IndexByteAtBytes(src, '"', offset+1) + e := vec.parseKey(src, offset+1) if e < 0 { return n, vector.ErrUnexpEOS } - child.Key().SetBit(flagEscape, false) - if src[e-1] != '\\' { - // Key is an unescaped string, good case. - child.Key().SetLen(e - offset) - offset = e + 1 - } else { - // Key contains escaped bytes. 
- for i := e; i < n; { - i = bytealg.IndexByteAtBytes(src, '"', i+1) - if i < 0 { - e = n - 1 - break - } - e = i - if src[e-1] != '\\' { - break - } - } - child.Key().SetLen(e - offset) - child.Key().SetBit(flagEscape, true) - offset = e + 1 - } - if offset, eof = bytealg.SkipBytesFmt4(src, offset); eof { + child.Key().SetLen(e - offset) + child.Key().SetBit(flagEscape, true) + offset = e + 1 + if offset, eof = skipfmt(src, offset); eof { return offset, vector.ErrUnexpEOF } // Check division symbol. @@ -213,7 +177,7 @@ func (vec *Vector) parseObject(depth, offset int, node *vector.Node) (int, error } else { return offset, vector.ErrUnexpId } - if offset, eof = bytealg.SkipBytesFmt4(src, offset); eof { + if offset, eof = skipfmt(src, offset); eof { return offset, vector.ErrUnexpEOF } // Parse value. @@ -223,7 +187,7 @@ func (vec *Vector) parseObject(depth, offset int, node *vector.Node) (int, error } // Return updated node to the vector. vec.ReleaseNode(i, child) - if offset, eof = bytealg.SkipBytesFmt4(src, offset); eof { + if offset, eof = skipfmt(src, offset); eof { return offset, vector.ErrUnexpEOF } if src[offset] == '}' { @@ -237,7 +201,7 @@ func (vec *Vector) parseObject(depth, offset int, node *vector.Node) (int, error } else { return offset, vector.ErrUnexpId } - if offset, eof = bytealg.SkipBytesFmt4(src, offset); eof { + if offset, eof = skipfmt(src, offset); eof { return offset, vector.ErrUnexpEOF } } @@ -261,7 +225,7 @@ func (vec *Vector) parseArray(depth, offset int, node *vector.Node) (int, error) offset++ break } - if offset, eof = bytealg.SkipBytesFmt4(src, offset); eof { + if offset, eof = skipfmt(src, offset); eof { return offset, vector.ErrUnexpEOF } if src[offset] == ']' { @@ -277,7 +241,7 @@ func (vec *Vector) parseArray(depth, offset int, node *vector.Node) (int, error) } // Return updated node to the vector. 
vec.ReleaseNode(i, child)
-		if offset, eof = bytealg.SkipBytesFmt4(src, offset); eof {
+		if offset, eof = skipfmt(src, offset); eof {
 			return offset, vector.ErrUnexpEOF
 		}
 		if src[offset] == ']' {
@@ -291,9 +255,41 @@ func (vec *Vector) parseArray(depth, offset int, node *vector.Node) (int, error)
 		} else {
 			return offset, vector.ErrUnexpId
 		}
-		if offset, eof = bytealg.SkipBytesFmt4(src, offset); eof {
+		if offset, eof = skipfmt(src, offset); eof {
 			return offset, vector.ErrUnexpEOF
 		}
 	}
 	return offset, nil
 }
+
+// parseKey locates the closing quote of an object key. src is the document
+// being parsed and offset is the index the search starts from; the result is
+// the index of the closing quote, or whatever indexbyte.IndexAtNE reports
+// when none is found (the caller treats a negative value as an unexpected
+// end of string).
+//
+// Up to 8 bytes are first checked with a plain loop; if a backslash is met or
+// no quote shows up in that window, the search is handed to
+// indexbyte.IndexAtNE, restarting from offset.
+func (vec *Vector) parseKey(src []byte, offset int) int {
+	// Clamp the window to the buffer so that a document truncated inside the
+	// key cannot index past the end of src.
+	lim := imin(len(src), offset+8)
+	for i := offset; i < lim; i++ {
+		if src[i] == '"' {
+			return i
+		}
+		if src[i] == '\\' {
+			break
+		}
+	}
+	return indexbyte.IndexAtNE(src, '"', offset)
+}
+
+// imin returns the smaller of a and b.
+func imin(a, b int) int {
+	if b < a {
+		return b
+	}
+	return a
+}
diff --git a/skipfmt.go b/skipfmt.go
new file mode 100644
index 0000000..e4864a4
--- /dev/null
+++ b/skipfmt.go
@@ -0,0 +1,17 @@
+package jsonvector
+
+import "github.com/koykov/bytealg"
+
+// skipfmt is a thin wrapper over bytealg.SkipBytesFmt4 with a fast path: when
+// the byte at off is greater than 0x20 (space), off is returned unchanged
+// together with false. In every other case, including off being at or past the
+// end of p, the call is forwarded to bytealg.SkipBytesFmt4 and its result is
+// returned as is; callers interpret the boolean as "end of input reached".
+func skipfmt(p []byte, off int) (int, bool) {
+	// The length check keeps the fast path from indexing past the end of p,
+	// e.g. when a truncated document ends right after a key or a value.
+	if off < len(p) && p[off] > 0x20 {
+		return off, false
+	}
+	return bytealg.SkipBytesFmt4(p, off)
+}
diff --git a/testdata/scalarString1KB.json b/testdata/scalarString1KB.json
new file mode 100644
index 0000000..0fbab72
--- /dev/null
+++ b/testdata/scalarString1KB.json
@@ -0,0 +1 @@
+"womdERXtDhxse6X5wFEBnklbFENuhZZYawAGhu4Uxs9bFoJEEqLi8H9iJpoGP5lhOsYYvBtFemrY8yj7NT6G7Tr4XoXTyqPkylbg7rZURbIeNd8OFt8yyU8yUL25xf5n5bYhsL7OPrYakhAXnWQLZuqd5bg6r4RmZqfHXKGRNRblJvesPEdJkiArkCA5P3Kr3Z5QBUr2mAIhoLnPdKywP9k8NHJpzLuDu0cqyLsKHYE2VeL5fODu9g7eYzZV4Jd6kWM3DELxwy9twcZMbEuIiRE801Lwnlbenc2128zqtIjeVGXdIRdGEtVUzndr2By0BaGsL71mY1KQQkxonSPeHyZ4I7nK4UXg4akXxwvmSIihmHdwHtVGBqOqBcGB3WemoEvnCrNyzTfLgcy5nXurCo6g7sTgvgCy15L39VI3kHhPKkFO2pb8icZToiDzDbp6yz3XT3XOB6wud4ce38uuQjYZeHA21WXRtwR2cV0fXvAFVSSQkrcOJu6gVBtwNB2QromrUj15CaqHLxq2kRlw1Drf3QQ9CVH0A3HJhIgH3JGYCQSk6jpZaArzgwfeofLpqmGhAdPcrga9YGk4UIILOW85DH2kDM7VfVM1xNbJXnhCudVAb5S3XCYlKvQlUWD2mayQ8Blgdn5hILUVnV9146v9SmhJ25IPJZQ5hE7hmyX7kIjsfTHTHWugSeWXQONtYKJ0ZD5f85j0pysywbMMnDLthaPw67dcABUQQAD0D3yCT5wWMTKVWqY6xr3pgoZsTMmxnqCi3E4e88TRUAPpso9Q8zTvJqOrAIWhxVzBDJWHmSamIWLpDmFLv255YrI6fDQyeeF22OwNL6rMmQffJGHxMJAnHoTYyIRAzX1MZb1u2AdV4f0wmw44Eclcxm3iWDt4PBb0AaiidPdOlWia9auH65ptZCmNfjrtQTqW3KzGD5j026QxJSjeFUM3cYrFOr51E7zUxG6Q4Cvw14yBv4GnGL9hHnAu5KgCnMGIS9lAjwVyZfdYWFzSdoSD7O83" diff --git a/unescape.go b/unescape.go index e69950b..58fba8b 100644 --- a/unescape.go +++ b/unescape.go @@ -1,18 +1,17 @@ package jsonvector import ( - "strconv" "unicode/utf16" - "github.com/koykov/bytealg" - "github.com/koykov/byteconv" + "github.com/koykov/simd/indexbyte" ) // Unescape byte array using itself as a destination. func Unescape(p []byte) []byte { - l, i := len(p), 0 + l := len(p) + var i int for { - i = bytealg.IndexByteAtBytes(p, '\\', i) + i = indexbyte.IndexAt(p, '\\', i) if i < 0 || i+1 == l { break } @@ -51,11 +50,7 @@ func Unescape(p []byte) []byte { continue } x := p[i+2 : i+6] - u, err := strconv.ParseUint(byteconv.B2S(x), 16, 16) - if err != nil { - i++ - continue - } + u := xtouTable(x) r := rune(u) if !utf16.IsSurrogate(r) { // Regular utf8 symbol. 
@@ -76,11 +71,7 @@ func Unescape(p []byte) []byte {
 					continue
 				} else {
 					x = p[i+8 : i+12]
-					u1, err := strconv.ParseUint(byteconv.B2S(x), 16, 16)
-					if err != nil {
-						i++
-						continue
-					}
+					u1 := xtouTable(x)
 					r = utf16.DecodeRune(r, rune(u1))
 					s := string(r)
 					z := len(s)
diff --git a/vector_test.go b/vector_test.go
index 2bc9b9c..dadc410 100644
--- a/vector_test.go
+++ b/vector_test.go
@@ -223,6 +223,11 @@ func BenchmarkScalar(b *testing.B) {
 				assertNode(b, vec, "", `foo "bar" string`)
 			})
 		})
+		b.Run("scalarString1KB", func(b *testing.B) {
+			bench(b, func(vec *Vector) {
+				assertType(b, vec, "", vector.TypeString)
+			})
+		})
 		b.Run("scalarNumber", func(b *testing.B) {
 			bench(b, func(vec *Vector) {
 				assertType(b, vec, "", vector.TypeNumber)
diff --git a/xtou.go b/xtou.go
new file mode 100644
index 0000000..ec170b3
--- /dev/null
+++ b/xtou.go
@@ -0,0 +1,42 @@
+package jsonvector
+
+// xtouInvalid is the tableXTOU entry for bytes that are not hexadecimal digits.
+const xtouInvalid = 0xFF
+
+// xtouReplacement is returned by xtouTable for malformed input: U+FFFD, the
+// Unicode replacement character.
+const xtouReplacement = 0xFFFD
+
+// tableXTOU maps a byte to the value (0..15) of the hexadecimal digit it
+// encodes, or to xtouInvalid when the byte is not a hexadecimal digit.
+var tableXTOU [256]byte
+
+// init fills tableXTOU.
+func init() {
+	for i := range tableXTOU {
+		tableXTOU[i] = xtouInvalid
+	}
+	for c := '0'; c <= '9'; c++ {
+		tableXTOU[c] = uint8(c - '0')
+	}
+	for c := 'a'; c <= 'f'; c++ {
+		tableXTOU[c] = uint8(c - 'a' + 10)
+	}
+	for c := 'A'; c <= 'F'; c++ {
+		tableXTOU[c] = uint8(c - 'A' + 10)
+	}
+}
+
+// xtouTable decodes the four hexadecimal digits s[0], s[1], s[2], s[3]
+// (most significant first) into a 16-bit value. s must hold at least four
+// bytes. If any of the four bytes is not a hexadecimal digit the result is
+// xtouReplacement instead of a value silently built from zeroed digits.
+func xtouTable(s []byte) uint64 {
+	_ = s[3] // single bounds check for the four reads below
+	a, b, c, d := tableXTOU[s[0]], tableXTOU[s[1]], tableXTOU[s[2]], tableXTOU[s[3]]
+	// Digit values only use the low nibble; xtouInvalid sets the high one.
+	if (a|b|c|d)&0xF0 != 0 {
+		return xtouReplacement
+	}
+	return uint64(a)<<12 | uint64(b)<<8 | uint64(c)<<4 | uint64(d)
+}