commit 6e7f13f827dad9657e3e5392ce306e2f35a9dff5
parent aeeed52a7223b973bbd699b71e316db223b0a203
Author: francoispqt <francois@parquet.ninja>
Date: Mon, 14 May 2018 23:18:47 +0800
add support of unicode parsing
Diffstat:
M | decode_string.go | | | 151 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------- |
M | decode_string_test.go | | | 190 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
2 files changed, 326 insertions(+), 15 deletions(-)
diff --git a/decode_string.go b/decode_string.go
@@ -2,6 +2,8 @@ package gojay
import (
"fmt"
+ "unicode/utf16"
+ "unicode/utf8"
"unsafe"
)
@@ -77,18 +79,50 @@ func (dec *Decoder) parseEscapedString() error {
dec.length = len(dec.data)
dec.cursor -= nSlash - diff
return nil
+ case 'u':
+ if nSlash&1 == 0 {
+ diff := nSlash >> 1
+ dec.data = append(dec.data[:start+diff-1], dec.data[dec.cursor-1:]...)
+ dec.length = len(dec.data)
+ dec.cursor -= nSlash - diff
+ return nil
+ }
+ start := dec.cursor - 2 - ((nSlash - 1) >> 1)
+ str, err := dec.parseUnicode()
+ if err != nil {
+ dec.err = err
+ return err
+ }
+ diff := dec.cursor - start
+ dec.data = append(append(dec.data[:start], str...), dec.data[dec.cursor:]...)
+ dec.length = len(dec.data)
+ dec.cursor = dec.cursor - diff
+ return nil
case 'b':
// number of slash must be even
// if is odd number of slashes
// divide nSlash - 1 by 2 and leave last one
// else divide nSlash by 2 and leave the letter
+ if nSlash&1 != 0 {
+ return InvalidJSONError("Invalid JSON unescaped character")
+ }
var diff int
+ diff = nSlash >> 1
+ dec.data = append(append(dec.data[:start+diff-2], '\b'), dec.data[dec.cursor:]...)
+ dec.length = len(dec.data)
+ dec.cursor -= nSlash - diff + 1
+ return nil
+ case 'f':
+ // number of slash must be even
+ // if is odd number of slashes
+ // divide nSlash - 1 by 2 and leave last one
+ // else divide nSlash by 2 and leave the letter
if nSlash&1 != 0 {
return InvalidJSONError("Invalid JSON unescaped character")
- } else {
- diff = nSlash >> 1
- dec.data = append(append(dec.data[:start+diff-2], '\b'), dec.data[dec.cursor:]...)
}
+ var diff int
+ diff = nSlash >> 1
+ dec.data = append(append(dec.data[:start+diff-2], '\f'), dec.data[dec.cursor:]...)
dec.length = len(dec.data)
dec.cursor -= nSlash - diff + 1
return nil
@@ -97,13 +131,12 @@ func (dec *Decoder) parseEscapedString() error {
// if is odd number of slashes
// divide nSlash - 1 by 2 and leave last one
// else divide nSlash by 2 and leave the letter
- var diff int
if nSlash&1 != 0 {
return InvalidJSONError("Invalid JSON unescaped character")
- } else {
- diff = nSlash >> 1
- dec.data = append(append(dec.data[:start+diff-2], '\n'), dec.data[dec.cursor:]...)
}
+ var diff int
+ diff = nSlash >> 1
+ dec.data = append(append(dec.data[:start+diff-2], '\n'), dec.data[dec.cursor:]...)
dec.length = len(dec.data)
dec.cursor -= nSlash - diff + 1
return nil
@@ -112,13 +145,12 @@ func (dec *Decoder) parseEscapedString() error {
// if is odd number of slashes
// divide nSlash - 1 by 2 and leave last one
// else divide nSlash by 2 and leave the letter
- var diff int
if nSlash&1 != 0 {
return InvalidJSONError("Invalid JSON unescaped character")
- } else {
- diff = nSlash >> 1
- dec.data = append(append(dec.data[:start+diff-2], '\r'), dec.data[dec.cursor:]...)
}
+ var diff int
+ diff = nSlash >> 1
+ dec.data = append(append(dec.data[:start+diff-2], '\r'), dec.data[dec.cursor:]...)
dec.length = len(dec.data)
dec.cursor -= nSlash - diff + 1
return nil
@@ -127,13 +159,12 @@ func (dec *Decoder) parseEscapedString() error {
// if is odd number of slashes
// divide nSlash - 1 by 2 and leave last one
// else divide nSlash by 2 and leave the letter
- var diff int
if nSlash&1 != 0 {
return InvalidJSONError("Invalid JSON unescaped character")
- } else {
- diff = nSlash >> 1
- dec.data = append(append(dec.data[:start+diff-2], '\t'), dec.data[dec.cursor:]...)
}
+ var diff int
+ diff = nSlash >> 1
+ dec.data = append(append(dec.data[:start+diff-2], '\t'), dec.data[dec.cursor:]...)
dec.length = len(dec.data)
dec.cursor -= nSlash - diff + 1
return nil
@@ -227,3 +258,93 @@ func (dec *Decoder) skipString() error {
}
return InvalidJSONError("Invalid JSON while parsing string")
}
+
+func (dec *Decoder) getUnicode() (rune, error) {
+ i := 0
+ r := rune(0)
+ for ; (dec.cursor < dec.length || dec.read()) && i < 4; dec.cursor++ {
+ c := dec.data[dec.cursor]
+ if c >= '0' && c <= '9' {
+ r = r*16 + rune(c-'0')
+ } else if c >= 'a' && c <= 'f' {
+ r = r*16 + rune(c-'a'+10)
+ } else if c >= 'A' && c <= 'F' {
+ r = r*16 + rune(c-'A'+10)
+ } else {
+ return 0, InvalidJSONError("Invalid unicode code point")
+ }
+ i++
+ }
+ return r, nil
+}
+
+func (dec *Decoder) appendEscapeChar(str []byte, c byte) ([]byte, error) {
+ switch c {
+ case 't':
+ str = append(str, '\t')
+ case 'n':
+ str = append(str, '\n')
+ case 'r':
+ str = append(str, '\r')
+ case 'b':
+ str = append(str, '\b')
+ case 'f':
+ str = append(str, '\f')
+ case '\\':
+ str = append(str, '\\')
+ default:
+ return nil, InvalidJSONError("Invalid JSON")
+ }
+ return str, nil
+}
+
+// parseUnicode decodes the \uXXXX escape whose hex digits start at dec.cursor
+// and returns the UTF-8 bytes that replace it. dec.cursor is left just after
+// the last input byte that was consumed.
+//
+// When the value is a UTF-16 surrogate, the bytes that follow are inspected:
+//   - not a backslash, or end of input: the lone surrogate is encoded on its
+//     own (utf8.EncodeRune turns it into the replacement character U+FFFD);
+//   - backslash + a letter other than 'u': the lone surrogate is encoded and
+//     the escaped character is appended through appendEscapeChar;
+//   - backslash + 'u' + four hex digits: the two values are combined with
+//     utf16.DecodeRune; if they do not form a valid pair, both are encoded
+//     one after the other.
+//
+// An error is returned when the hex digits are invalid, when the input ends
+// right after a backslash, or when the escaped letter is not supported.
+func (dec *Decoder) parseUnicode() ([]byte, error) {
+	// hex digits following the 'u'
+	r, err := dec.getUnicode()
+	if err != nil {
+		return nil, err
+	}
+	// 16 bytes hold the longest possible output: two runes of at most
+	// utf8.UTFMax (4) bytes each.
+	str := make([]byte, 16)
+	if !utf16.IsSurrogate(r) {
+		return str[:utf8.EncodeRune(str, r)], nil
+	}
+	// r is one half of a surrogate pair. If nothing follows, or what follows
+	// is not an escape, emit the lone surrogate instead of dropping it.
+	if (dec.cursor >= dec.length && !dec.read()) || dec.data[dec.cursor] != '\\' {
+		return str[:utf8.EncodeRune(str, r)], nil
+	}
+	// skip the backslash
+	dec.cursor++
+	if dec.cursor >= dec.length && !dec.read() {
+		return nil, InvalidJSONError("Invalid JSON")
+	}
+	if c := dec.data[dec.cursor]; c != 'u' {
+		// Lone surrogate followed by an ordinary escape such as \t.
+		n := utf8.EncodeRune(str, r)
+		str, err = dec.appendEscapeChar(str[:n], c)
+		if err != nil {
+			dec.err = err
+			return nil, err
+		}
+		dec.cursor++
+		return str, nil
+	}
+	// skip the 'u' and read the second code unit
+	dec.cursor++
+	r2, err := dec.getUnicode()
+	if err != nil {
+		return nil, err
+	}
+	if combined := utf16.DecodeRune(r, r2); combined != utf8.RuneError {
+		return str[:utf8.EncodeRune(str, combined)], nil
+	}
+	// Not a valid pair: keep both values. The second one is written after the
+	// first (str[n:]) so that it does not overwrite it.
+	n := utf8.EncodeRune(str, r)
+	n += utf8.EncodeRune(str[n:], r2)
+	return str[:n], nil
+}
diff --git a/decode_string_test.go b/decode_string_test.go
@@ -10,6 +10,196 @@ import (
"github.com/stretchr/testify/assert"
)
+// TestDecoderString is a table-driven test that decodes each JSON input into a
+// string through NewDecoder(...).Decode and checks both the error outcome and
+// the decoded value.
+//
+// Every case carries a unique name so that a failing subtest can be identified
+// (and selected with -run) without relying on the "#NN" suffixes that t.Run
+// appends to duplicated names.
+func TestDecoderString(t *testing.T) {
+	testCases := []struct {
+		name           string      // subtest name, unique within the table
+		json           string      // raw input handed to the decoder
+		expectedResult string      // value expected in the target string
+		err            bool        // whether Decode must return an error
+		errType        interface{} // optional: expected concrete error type
+	}{
+		{
+			name:           "basic-string",
+			json:           `"string"`,
+			expectedResult: "string",
+			err:            false,
+		},
+		{
+			name:           "basic-string2",
+			json:           `"hello world!"`,
+			expectedResult: "hello world!",
+			err:            false,
+		},
+		{
+			name:           "escape-control-char-n-single-backslash",
+			json:           `"\n"`,
+			expectedResult: "",
+			err:            true,
+		},
+		{
+			name:           "escape-control-char-n-double-backslash",
+			json:           `"\\n"`,
+			expectedResult: "\n",
+			err:            false,
+		},
+		{
+			name:           "escape-control-char-t-single-backslash",
+			json:           `"\t"`,
+			expectedResult: "",
+			err:            true,
+		},
+		{
+			name:           "escape-control-char-t-double-backslash",
+			json:           `"\\t"`,
+			expectedResult: "\t",
+			err:            false,
+		},
+		{
+			name:           "escape-control-char-b-single-backslash",
+			json:           `"\b"`,
+			expectedResult: "",
+			err:            true,
+		},
+		{
+			name:           "escape-control-char-b-double-backslash",
+			json:           `"\\b"`,
+			expectedResult: "\b",
+			err:            false,
+		},
+		{
+			name:           "escape-control-char-f-single-backslash",
+			json:           `"\f"`,
+			expectedResult: "",
+			err:            true,
+		},
+		{
+			name:           "escape-control-char-f-double-backslash",
+			json:           `"\\f"`,
+			expectedResult: "\f",
+			err:            false,
+		},
+		{
+			name:           "escape-control-char-r-single-backslash",
+			json:           `"\r"`,
+			expectedResult: "",
+			err:            true,
+		},
+		{
+			name:           "escape-control-char-r-double-backslash",
+			json:           `"\\r"`,
+			expectedResult: "\r",
+			err:            false,
+		},
+		{
+			name:           "utf8",
+			json:           `"𠜎 𠜱 𠝹 𠱓 𠱸 𠲖 𠳏 𠳕 𠴕 𠵼 𠵿"`,
+			expectedResult: "𠜎 𠜱 𠝹 𠱓 𠱸 𠲖 𠳏 𠳕 𠴕 𠵼 𠵿",
+			err:            false,
+		},
+		{
+			name:           "utf8-code-point",
+			json:           `"\u06fc"`,
+			expectedResult: `ۼ`,
+			err:            false,
+		},
+		{
+			name:           "utf8-code-point-escaped",
+			json:           `"\\u2070"`,
+			expectedResult: `\u2070`,
+			err:            false,
+		},
+		{
+			name:           "utf8-code-point-err",
+			json:           `"\u2Z70"`,
+			expectedResult: ``,
+			err:            true,
+		},
+		{
+			name:           "utf16-surrogate-pair",
+			json:           `"\uD834\uDD1E"`,
+			expectedResult: `𝄞`,
+			err:            false,
+		},
+		{
+			name:           "utf16-surrogate-lone",
+			json:           `"\uD834"`,
+			expectedResult: `�`,
+			err:            false,
+		},
+		{
+			name:           "utf16-surrogate-err",
+			json:           `"\uD834\`,
+			expectedResult: ``,
+			err:            true,
+		},
+		{
+			name:           "utf16-surrogate-err2",
+			json:           `"\uD834\uDZ1E`,
+			expectedResult: ``,
+			err:            true,
+		},
+		{
+			name:           "utf16-surrogate-followed-by-control-char-t",
+			json:           `"\uD834\t"`,
+			expectedResult: "�\t",
+			err:            false,
+		},
+		{
+			name:           "utf16-surrogate-followed-by-control-char-n",
+			json:           `"\uD834\n"`,
+			expectedResult: "�\n",
+			err:            false,
+		},
+		{
+			name:           "utf16-surrogate-followed-by-control-char-f",
+			json:           `"\uD834\f"`,
+			expectedResult: "�\f",
+			err:            false,
+		},
+		{
+			name:           "utf16-surrogate-followed-by-control-char-b",
+			json:           `"\uD834\b"`,
+			expectedResult: "�\b",
+			err:            false,
+		},
+		{
+			name:           "utf16-surrogate-followed-by-control-char-r",
+			json:           `"\uD834\r"`,
+			expectedResult: "�\r",
+			err:            false,
+		},
+		{
+			name:           "utf16-surrogate-followed-by-invalid-escape",
+			json:           `"\uD834\h"`,
+			expectedResult: "",
+			err:            true,
+		},
+		{
+			name:           "null-err",
+			json:           `nall`,
+			expectedResult: "",
+			err:            true,
+		},
+	}
+
+	for _, testCase := range testCases {
+		t.Run(testCase.name, func(t *testing.T) {
+			str := ""
+			dec := NewDecoder(strings.NewReader(testCase.json))
+			err := dec.Decode(&str)
+			if testCase.err {
+				assert.NotNil(t, err, "err should not be nil")
+				if testCase.errType != nil {
+					assert.IsType(t, testCase.errType, err, "err should of the given type")
+				}
+			} else {
+				assert.Nil(t, err, "err should be nil")
+			}
+			assert.Equal(t, testCase.expectedResult, str, fmt.Sprintf("'%s' should be equal to expectedResult", str))
+		})
+	}
+}
+
func TestDecoderStringBasic(t *testing.T) {
json := []byte(`"string"`)
var v string