gojay

high performance JSON encoder/decoder with stream API for Golang
git clone git://git.lair.cx/gojay
Log | Files | Refs | README | LICENSE

commit 6e7f13f827dad9657e3e5392ce306e2f35a9dff5
parent aeeed52a7223b973bbd699b71e316db223b0a203
Author: francoispqt <francois@parquet.ninja>
Date:   Mon, 14 May 2018 23:18:47 +0800

add support of unicode parsing

Diffstat:
M decode_string.go | 151 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
M decode_string_test.go | 190 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 326 insertions(+), 15 deletions(-)

diff --git a/decode_string.go b/decode_string.go @@ -2,6 +2,8 @@ package gojay import ( "fmt" + "unicode/utf16" + "unicode/utf8" "unsafe" ) @@ -77,18 +79,50 @@ func (dec *Decoder) parseEscapedString() error { dec.length = len(dec.data) dec.cursor -= nSlash - diff return nil + case 'u': + if nSlash&1 == 0 { + diff := nSlash >> 1 + dec.data = append(dec.data[:start+diff-1], dec.data[dec.cursor-1:]...) + dec.length = len(dec.data) + dec.cursor -= nSlash - diff + return nil + } + start := dec.cursor - 2 - ((nSlash - 1) >> 1) + str, err := dec.parseUnicode() + if err != nil { + dec.err = err + return err + } + diff := dec.cursor - start + dec.data = append(append(dec.data[:start], str...), dec.data[dec.cursor:]...) + dec.length = len(dec.data) + dec.cursor = dec.cursor - diff + return nil case 'b': // number of slash must be even // if is odd number of slashes // divide nSlash - 1 by 2 and leave last one // else divide nSlash by 2 and leave the letter + if nSlash&1 != 0 { + return InvalidJSONError("Invalid JSON unescaped character") + } var diff int + diff = nSlash >> 1 + dec.data = append(append(dec.data[:start+diff-2], '\b'), dec.data[dec.cursor:]...) + dec.length = len(dec.data) + dec.cursor -= nSlash - diff + 1 + return nil + case 'f': + // number of slash must be even + // if is odd number of slashes + // divide nSlash - 1 by 2 and leave last one + // else divide nSlash by 2 and leave the letter if nSlash&1 != 0 { return InvalidJSONError("Invalid JSON unescaped character") - } else { - diff = nSlash >> 1 - dec.data = append(append(dec.data[:start+diff-2], '\b'), dec.data[dec.cursor:]...) } + var diff int + diff = nSlash >> 1 + dec.data = append(append(dec.data[:start+diff-2], '\f'), dec.data[dec.cursor:]...) 
dec.length = len(dec.data) dec.cursor -= nSlash - diff + 1 return nil @@ -97,13 +131,12 @@ func (dec *Decoder) parseEscapedString() error { // if is odd number of slashes // divide nSlash - 1 by 2 and leave last one // else divide nSlash by 2 and leave the letter - var diff int if nSlash&1 != 0 { return InvalidJSONError("Invalid JSON unescaped character") - } else { - diff = nSlash >> 1 - dec.data = append(append(dec.data[:start+diff-2], '\n'), dec.data[dec.cursor:]...) } + var diff int + diff = nSlash >> 1 + dec.data = append(append(dec.data[:start+diff-2], '\n'), dec.data[dec.cursor:]...) dec.length = len(dec.data) dec.cursor -= nSlash - diff + 1 return nil @@ -112,13 +145,12 @@ func (dec *Decoder) parseEscapedString() error { // if is odd number of slashes // divide nSlash - 1 by 2 and leave last one // else divide nSlash by 2 and leave the letter - var diff int if nSlash&1 != 0 { return InvalidJSONError("Invalid JSON unescaped character") - } else { - diff = nSlash >> 1 - dec.data = append(append(dec.data[:start+diff-2], '\r'), dec.data[dec.cursor:]...) } + var diff int + diff = nSlash >> 1 + dec.data = append(append(dec.data[:start+diff-2], '\r'), dec.data[dec.cursor:]...) dec.length = len(dec.data) dec.cursor -= nSlash - diff + 1 return nil @@ -127,13 +159,12 @@ func (dec *Decoder) parseEscapedString() error { // if is odd number of slashes // divide nSlash - 1 by 2 and leave last one // else divide nSlash by 2 and leave the letter - var diff int if nSlash&1 != 0 { return InvalidJSONError("Invalid JSON unescaped character") - } else { - diff = nSlash >> 1 - dec.data = append(append(dec.data[:start+diff-2], '\t'), dec.data[dec.cursor:]...) } + var diff int + diff = nSlash >> 1 + dec.data = append(append(dec.data[:start+diff-2], '\t'), dec.data[dec.cursor:]...) 
dec.length = len(dec.data) dec.cursor -= nSlash - diff + 1 return nil @@ -227,3 +258,93 @@ func (dec *Decoder) skipString() error { } return InvalidJSONError("Invalid JSON while parsing string") } + +func (dec *Decoder) getUnicode() (rune, error) { + i := 0 + r := rune(0) + for ; (dec.cursor < dec.length || dec.read()) && i < 4; dec.cursor++ { + c := dec.data[dec.cursor] + if c >= '0' && c <= '9' { + r = r*16 + rune(c-'0') + } else if c >= 'a' && c <= 'f' { + r = r*16 + rune(c-'a'+10) + } else if c >= 'A' && c <= 'F' { + r = r*16 + rune(c-'A'+10) + } else { + return 0, InvalidJSONError("Invalid unicode code point") + } + i++ + } + return r, nil +} + +func (dec *Decoder) appendEscapeChar(str []byte, c byte) ([]byte, error) { + switch c { + case 't': + str = append(str, '\t') + case 'n': + str = append(str, '\n') + case 'r': + str = append(str, '\r') + case 'b': + str = append(str, '\b') + case 'f': + str = append(str, '\f') + case '\\': + str = append(str, '\\') + default: + return nil, InvalidJSONError("Invalid JSON") + } + return str, nil +} + +func (dec *Decoder) parseUnicode() ([]byte, error) { + // get unicode after u + r, err := dec.getUnicode() + if err != nil { + return nil, err + } + // no error start making new string + str := make([]byte, 16, 16) + i := 0 + if utf16.IsSurrogate(r) { + if dec.cursor < dec.length || dec.read() { + c := dec.data[dec.cursor] + if c != '\\' { + i += utf8.EncodeRune(str, r) + return str[:i], nil + } + dec.cursor++ + if dec.cursor >= dec.length && !dec.read() { + return nil, InvalidJSONError("Invalid JSON") + } + c = dec.data[dec.cursor] + if c != 'u' { + i += utf8.EncodeRune(str, r) + str, err = dec.appendEscapeChar(str[:i], c) + if err != nil { + dec.err = err + return nil, err + } + i++ + dec.cursor++ + return str[:i], nil + } + dec.cursor++ + r2, err := dec.getUnicode() + if err != nil { + return nil, err + } + combined := utf16.DecodeRune(r, r2) + if combined == '\uFFFD' { + i += utf8.EncodeRune(str, r) + i += 
utf8.EncodeRune(str, r2) + } else { + i += utf8.EncodeRune(str, combined) + } + } + return str[:i], nil + } + i += utf8.EncodeRune(str, r) + return str[:i], nil +} diff --git a/decode_string_test.go b/decode_string_test.go @@ -10,6 +10,196 @@ import ( "github.com/stretchr/testify/assert" ) +func TestDecoderString(t *testing.T) { + testCases := []struct { + name string + json string + expectedResult string + err bool + errType interface{} + }{ + { + name: "basic-string", + json: `"string"`, + expectedResult: "string", + err: false, + }, + { + name: "basic-string2", + json: `"hello world!"`, + expectedResult: "hello world!", + err: false, + }, + { + name: "escape-control-char", + json: `"\n"`, + expectedResult: "", + err: true, + }, + { + name: "escape-control-char", + json: `"\\n"`, + expectedResult: "\n", + err: false, + }, + { + name: "escape-control-char", + json: `"\t"`, + expectedResult: "", + err: true, + }, + { + name: "escape-control-char", + json: `"\\t"`, + expectedResult: "\t", + err: false, + }, + { + name: "escape-control-char", + json: `"\b"`, + expectedResult: "", + err: true, + }, + { + name: "escape-control-char", + json: `"\\b"`, + expectedResult: "\b", + err: false, + }, + { + name: "escape-control-char", + json: `"\f"`, + expectedResult: "", + err: true, + }, + { + name: "escape-control-char", + json: `"\\f"`, + expectedResult: "\f", + err: false, + }, + { + name: "escape-control-char", + json: `"\r"`, + expectedResult: "", + err: true, + }, + { + name: "escape-control-char", + json: `"\\r"`, + expectedResult: "\r", + err: false, + }, + { + name: "utf8", + json: `"𠜎 𠜱 𠝹 𠱓 𠱸 𠲖 𠳏 𠳕 𠴕 𠵼 𠵿"`, + expectedResult: "𠜎 𠜱 𠝹 𠱓 𠱸 𠲖 𠳏 𠳕 𠴕 𠵼 𠵿", + err: false, + }, + { + name: "utf8-code-point", + json: `"\u06fc"`, + expectedResult: `ۼ`, + err: false, + }, + { + name: "utf8-code-point-escaped", + json: `"\\u2070"`, + expectedResult: `\u2070`, + err: false, + }, + { + name: "utf8-code-point-err", + json: `"\u2Z70"`, + expectedResult: ``, + err: true, + }, + { + 
name: "utf16-surrogate", + json: `"\uD834\uDD1E"`, + expectedResult: `𝄞`, + err: false, + }, + { + name: "utf16-surrogate", + json: `"\uD834"`, + expectedResult: `�`, + err: false, + }, + { + name: "utf16-surrogate-err", + json: `"\uD834\`, + expectedResult: ``, + err: true, + }, + { + name: "utf16-surrogate-err2", + json: `"\uD834\uDZ1E`, + expectedResult: ``, + err: true, + }, + { + name: "utf16-surrogate-followed-by-control-char", + json: `"\uD834\t"`, + expectedResult: "�\t", + err: false, + }, + { + name: "utf16-surrogate-followed-by-control-char", + json: `"\uD834\n"`, + expectedResult: "�\n", + err: false, + }, + { + name: "utf16-surrogate-followed-by-control-char", + json: `"\uD834\f"`, + expectedResult: "�\f", + err: false, + }, + { + name: "utf16-surrogate-followed-by-control-char", + json: `"\uD834\b"`, + expectedResult: "�\b", + err: false, + }, + { + name: "utf16-surrogate-followed-by-control-char", + json: `"\uD834\r"`, + expectedResult: "�\r", + err: false, + }, + { + name: "utf16-surrogate-followed-by-control-char", + json: `"\uD834\h"`, + expectedResult: "", + err: true, + }, + { + name: "null-err", + json: `nall`, + expectedResult: "", + err: true, + }, + } + + for _, testCase := range testCases { + t.Run(testCase.name, func(t *testing.T) { + str := "" + dec := NewDecoder(strings.NewReader(testCase.json)) + err := dec.Decode(&str) + if testCase.err { + assert.NotNil(t, err, "err should not be nil") + if testCase.errType != nil { + assert.IsType(t, testCase.errType, err, "err should of the given type") + } + } else { + assert.Nil(t, err, "err should be nil") + } + assert.Equal(t, testCase.expectedResult, str, fmt.Sprintf("'%s' should be equal to expectedResult", str)) + }) + } +} + func TestDecoderStringBasic(t *testing.T) { json := []byte(`"string"`) var v string