diff --git a/test/tool/net/jsontestsuite_okay_test.lua b/test/tool/net/jsontestsuite_okay_test.lua index 6446a0a09..0de2f8d4c 100644 --- a/test/tool/net/jsontestsuite_okay_test.lua +++ b/test/tool/net/jsontestsuite_okay_test.lua @@ -64,107 +64,95 @@ assert(not DecodeJson([[ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ ] ] ] ] ] ] ] ] ] ] ] ] ] ] ] ] ] ] ]])) -- https://github.com/nst/JSONTestSuite/tree/d64aefb55228d9584d3e5b2433f720ea8fd00c82/test_parsing/i_string_utf16LE_no_BOM.json -assert(not DecodeJson(" [\x00\"\x00\xe9\x00\"\x00]\x00 ")) +val, err = DecodeJson(" [\x00\"\x00\xe9\x00\"\x00]\x00 ") +assert(val == nil) +assert(err == 'illegal character') -- https://github.com/nst/JSONTestSuite/tree/d64aefb55228d9584d3e5b2433f720ea8fd00c82/test_parsing/i_string_utf16BE_no_BOM.json assert(not DecodeJson(" \x00[\x00\"\x00\xe9\x00\"\x00] ")) --- [jart] ljson currently doesn't validate utf-8 -- https://github.com/nst/JSONTestSuite/tree/d64aefb55228d9584d3e5b2433f720ea8fd00c82/test_parsing/i_string_truncated-utf-8.json -assert(DecodeJson(" [\"\xe0\xff\"] ")) +val, err = DecodeJson(" [\"\xe0\xff\"] ") +assert(val == nil) +assert(err == 'malformed utf-8') --- [jart] ljson currently doesn't validate utf-8 -- https://github.com/nst/JSONTestSuite/tree/d64aefb55228d9584d3e5b2433f720ea8fd00c82/test_parsing/i_string_overlong_sequence_6_bytes_null.json -assert(DecodeJson(" [\"\xfc\x80\x80\x80\x80\x80\"] ")) +val, err = DecodeJson(" [\"\xfc\x80\x80\x80\x80\x80\"] ") +assert(val == nil) +assert(err == 'illegal utf-8 character') --- [jart] ljson currently doesn't validate utf-8 -- https://github.com/nst/JSONTestSuite/tree/d64aefb55228d9584d3e5b2433f720ea8fd00c82/test_parsing/i_string_overlong_sequence_6_bytes.json -assert(DecodeJson(" [\"\xfc\x83\xbf\xbf\xbf\xbf\"] ")) +val, err = DecodeJson(" [\"\xfc\x83\xbf\xbf\xbf\xbf\"] ") +assert(val == nil) +assert(err == "illegal utf-8 character") --- [jart] ljson currently doesn't validate utf-8 -- 
https://github.com/nst/JSONTestSuite/tree/d64aefb55228d9584d3e5b2433f720ea8fd00c82/test_parsing/i_string_overlong_sequence_2_bytes.json --- (converted to binary for safety) -assert(DecodeJson(' \x5b\x22\xc0\xaf\x22\x5d ')) +val, err = DecodeJson(" [\"\xc0\xaf\"] ") +assert(val == nil) +assert(err == "overlong ascii") --- [jart] ljson currently doesn't validate utf-8 -- https://github.com/nst/JSONTestSuite/tree/d64aefb55228d9584d3e5b2433f720ea8fd00c82/test_parsing/i_string_not_in_unicode_range.json --- (converted to binary for safety) -assert(DecodeJson(" [\"\xf4\xbf\xbf\xbf\"] ")) +val, err = DecodeJson(" [\"\xf4\xbf\xbf\xbf\"] ") +assert(val == nil) +assert(err == "utf-8 exceeds utf-16 range") -- [jart] ljson currently doesn't validate utf-8 -- https://github.com/nst/JSONTestSuite/tree/d64aefb55228d9584d3e5b2433f720ea8fd00c82/test_parsing/i_string_lone_utf8_continuation_byte.json --- (converted to binary for safety) -assert(DecodeJson(' \x5b\x22\x81\x22\x5d ')) +val, err = DecodeJson(" [\"\x81\"] ") +assert(val == nil) +assert(err == "c1 control code in string") -- [jart] our behavior here is consistent with v8 -- https://github.com/nst/JSONTestSuite/tree/d64aefb55228d9584d3e5b2433f720ea8fd00c82/test_parsing/i_string_lone_second_surrogate.json assert(DecodeJson(" [\"\\uDFAA\"] ")) assert(EncodeJson(DecodeJson(" [\"\\uDFAA\"] ")) == "[\"\\\\uDFAA\"]") --- [jart] ljson currently doesn't validate utf-8 -- https://github.com/nst/JSONTestSuite/tree/d64aefb55228d9584d3e5b2433f720ea8fd00c82/test_parsing/i_string_iso_latin_1.json --- (converted to binary for safety) -assert(DecodeJson(' \x5b\x22\xe9\x22\x5d ')) +val, err = DecodeJson(" [\"\xe9\"] ") +assert(val == nil) +assert(err == "malformed utf-8") --- [jart] ljson currently doesn't validate utf-8 -- https://github.com/nst/JSONTestSuite/tree/d64aefb55228d9584d3e5b2433f720ea8fd00c82/test_parsing/i_string_inverted_surrogates_U+1D11E.json --- (converted to binary for safety) -assert(DecodeJson(' 
\x5b\x22\x5c\x75\x44\x64\x31\x65\x5c\x75\x44\x38\x33\x34\x22\x5d ')) +assert(DecodeJson(" [\"\\uDd1e\\uD834\"] ")) --- [jart] ljson currently doesn't validate utf-8 -- https://github.com/nst/JSONTestSuite/tree/d64aefb55228d9584d3e5b2433f720ea8fd00c82/test_parsing/i_string_invalid_utf-8.json --- (converted to binary for safety) -assert(DecodeJson(' \x5b\x22\xff\x22\x5d ')) +val, err = DecodeJson(" [\"\xff\"] ") +assert(val == nil) +assert(err == "illegal utf-8 character") --- [jart] ljson currently doesn't validate utf-8 -- https://github.com/nst/JSONTestSuite/tree/d64aefb55228d9584d3e5b2433f720ea8fd00c82/test_parsing/i_string_invalid_surrogate.json --- (converted to binary for safety) -assert(DecodeJson(' \x5b\x22\x5c\x75\x64\x38\x30\x30\x61\x62\x63\x22\x5d ')) +assert(EncodeJson(DecodeJson(" [\"\\ud800abc\"] ")) == "[\"\\\\ud800abc\"]") --- [jart] ljson currently doesn't validate utf-8 -- https://github.com/nst/JSONTestSuite/tree/d64aefb55228d9584d3e5b2433f720ea8fd00c82/test_parsing/i_string_invalid_lonely_surrogate.json --- (converted to binary for safety) -assert(DecodeJson(' \x5b\x22\x5c\x75\x64\x38\x30\x30\x22\x5d ')) +assert(DecodeJson(" [\"\\ud800\"] ")[1] == "\\ud800") --- [jart] ljson currently doesn't validate utf-8 -- https://github.com/nst/JSONTestSuite/tree/d64aefb55228d9584d3e5b2433f720ea8fd00c82/test_parsing/i_string_incomplete_surrogates_escape_valid.json --- (converted to binary for safety) -assert(DecodeJson(' \x5b\x22\x5c\x75\x44\x38\x30\x30\x5c\x75\x44\x38\x30\x30\x5c\x6e\x22\x5d ')) +assert(DecodeJson(" [\"\\uD800\\uD800\\n\"] ")) --- [jart] ljson currently doesn't validate utf-8 -- https://github.com/nst/JSONTestSuite/tree/d64aefb55228d9584d3e5b2433f720ea8fd00c82/test_parsing/i_string_incomplete_surrogate_pair.json --- (converted to binary for safety) -assert(DecodeJson(' \x5b\x22\x5c\x75\x44\x64\x31\x65\x61\x22\x5d ')) +assert(DecodeJson(" [\"\\uDd1ea\"] ")) --- [jart] ljson currently doesn't validate utf-8 -- 
https://github.com/nst/JSONTestSuite/tree/d64aefb55228d9584d3e5b2433f720ea8fd00c82/test_parsing/i_string_incomplete_surrogate_and_escape_valid.json --- (converted to binary for safety) -assert(DecodeJson(' \x5b\x22\x5c\x75\x44\x38\x30\x30\x5c\x6e\x22\x5d ')) +assert(DecodeJson(" [\"\\uD800\\n\"] ")) --- [jart] ljson currently doesn't validate utf-8 -- https://github.com/nst/JSONTestSuite/tree/d64aefb55228d9584d3e5b2433f720ea8fd00c82/test_parsing/i_string_UTF8_surrogate_U+D800.json --- (converted to binary for safety) -assert(DecodeJson(' \x5b\x22\xed\xa0\x80\x22\x5d ')) +assert(DecodeJson(" [\"\\ud800abc\"] ")[1] == "\\ud800abc") --- [jart] ljson currently doesn't validate utf-8 -- https://github.com/nst/JSONTestSuite/tree/d64aefb55228d9584d3e5b2433f720ea8fd00c82/test_parsing/i_string_UTF-8_invalid_sequence.json --- (converted to binary for safety) -assert(DecodeJson(' \x5b\x22\xe6\x97\xa5\xd1\x88\xfa\x22\x5d ')) +val, err = DecodeJson(" [\"\xe6\x97\xa5\xd1\x88\xfa\"] ") +assert(val == nil) +assert(err == "illegal utf-8 character") -- https://github.com/nst/JSONTestSuite/tree/d64aefb55228d9584d3e5b2433f720ea8fd00c82/test_parsing/i_string_UTF-16LE_with_BOM.json assert(not DecodeJson(" \xff\xfe[\x00\"\x00\xe9\x00\"\x00]\x00 ")) --- [jart] ljson currently doesn't validate utf-8 -- https://github.com/nst/JSONTestSuite/tree/d64aefb55228d9584d3e5b2433f720ea8fd00c82/test_parsing/i_string_1st_valid_surrogate_2nd_invalid.json --- (converted to binary for safety) -assert(DecodeJson(' \x5b\x22\x5c\x75\x44\x38\x38\x38\x5c\x75\x31\x32\x33\x34\x22\x5d ')) +assert(DecodeJson(" [\"\\uD888\\u1234\"] ")) --- [jart] ljson currently doesn't validate utf-8 -- https://github.com/nst/JSONTestSuite/tree/d64aefb55228d9584d3e5b2433f720ea8fd00c82/test_parsing/i_string_1st_surrogate_but_2nd_missing.json -- (converted to binary for safety) -assert(DecodeJson(' \x5b\x22\x5c\x75\x44\x41\x44\x41\x22\x5d ')) +assert(DecodeJson(" [\"\\uDADA\"] ")) --- [jart] ljson currently doesn't validate utf-8 
-- https://github.com/nst/JSONTestSuite/tree/d64aefb55228d9584d3e5b2433f720ea8fd00c82/test_parsing/i_object_key_lone_2nd_surrogate.json -- (converted to binary for safety) assert(DecodeJson(' \x7b\x22\x5c\x75\x44\x46\x41\x41\x22\x3a\x30\x7d ')) diff --git a/test/tool/net/ljson_test.lua b/test/tool/net/ljson_test.lua index dc0873db8..2b0382534 100644 --- a/test/tool/net/ljson_test.lua +++ b/test/tool/net/ljson_test.lua @@ -27,6 +27,8 @@ assert(EncodeLua(assert(DecodeJson[[ [{"heh": [1,3,2]}] ]])) == '{{heh={1, 3, 2} assert(EncodeLua(assert(DecodeJson[[ 3.14159 ]])) == '3.14159') assert(EncodeLua(assert(DecodeJson[[ 1e-12 ]])) == '1e-12') assert(assert(DecodeJson[[ "\u007f" ]]) == '\x7f') +assert(assert(DecodeJson[[ "๐€ ๐ ๐‚" ]]) == "๐€ ๐ ๐‚") +assert(assert(DecodeJson[[ "๐Ÿ˜€ ๐Ÿ˜ ๐Ÿ˜‚" ]]) == "๐Ÿ˜€ ๐Ÿ˜ ๐Ÿ˜‚") assert(EncodeJson(assert(DecodeJson[[ 1e-12 ]])) == '1e-12') assert(EncodeJson(assert(DecodeJson[[ true ]])) == 'true') @@ -42,6 +44,9 @@ assert(assert(DecodeJson[["\""]]) == '\"') -- c0 assert(assert(DecodeJson[["\u0100"]]) == 'ฤ€') -- latin-1 assert(assert(DecodeJson[["\ud800\udf30\ud800\udf30"]]) == '๐Œฐ๐Œฐ') -- utf-16 astral planes gothic assert(assert(DecodeJson[["\uD800"]]) == '\\uD800') -- utf-16 invalid (keep utf-8 well-formed) +assert(not DecodeJson('"\xc0\x80"')) +assert(not DecodeJson('"\xc1\x80"')) +assert(DecodeJson('"\xc2\x80"')) assert(EncodeJson(assert(DecodeJson[[ -9223372036854775808 ]])) == '-9223372036854775808') -- minimum 64-bit integer assert(EncodeJson(assert(DecodeJson[[ 9223372036854775807 ]])) == '9223372036854775807') -- maximum 64-bit integer @@ -97,7 +102,7 @@ assert(err == "object key must be string") res, err = DecodeJson('"\x00"') assert(res == nil) -assert(err == 'non-del c0 in string') +assert(err == 'non-del c0 control code in string') res, err = DecodeJson('"e') assert(res == nil) @@ -151,6 +156,7 @@ assert(err == "maximum depth exceeded") -- JsonEncodeInts 498 1543 -- JsonEncodeFloats 498 1543 -- 
JsonEncodeObject 1333 4129 +-- BigString 3183 9855 function JsonParseEmpty() DecodeJson[[]] @@ -168,6 +174,13 @@ function JsonParseString() DecodeJson[[ "\ud800\udf30 he๐Œฐ๐Œฐo \ud800\udf30" ]] end +function BigString() + assert(DecodeJson[[ + ["The fall of Hyperion - a DreamJohn KeatsCANTO I๐˜๐˜ข๐˜ฏ๐˜ข๐˜ต๐˜ช๐˜ค๐˜ด ๐˜ฉ๐˜ข๐˜ท๐˜ฆ ๐˜ต๐˜ฉ๐˜ฆ๐˜ช๐˜ณ dreams, ๐˜„๐—ต๐—ฒ๐—ฟ๐—ฒ๐˜„๐—ถ๐˜๐—ต ๐˜๐—ต๐—ฒ๐˜† ๐˜„๐—ฒ๐—ฎ๐˜ƒ๐—ฒA paradise for a sect; the savage tooFrom forth the loftiest fashion of his sleepGuesses at Heaven; pity these have notTrac'd upon vellum or wild Indian leafThe shadows of melodious utterance.But bare of laurel they live, dream, and die;For Poesy alone can tell her dreams,With the fine spell of words alone can saveImagination from the sable charmAnd dumb enchantment. Who alive can say,'Thou art no Poet may'st not tell thy dreams?'Since every man whose soul is not a clodHath visions, and would speak, if he had lovedAnd been well nurtured in his mother tongue.Whether the dream now purpos'd to rehearseBe poet's or fanatic's will be knownWhen this warm scribe my hand is in the grave.", + "The fall of Hyperion - a DreamJohn KeatsCANTO I๐˜๐˜ข๐˜ฏ๐˜ข๐˜ต๐˜ช๐˜ค๐˜ด ๐˜ฉ๐˜ข๐˜ท๐˜ฆ ๐˜ต๐˜ฉ๐˜ฆ๐˜ช๐˜ณ dreams, ๐˜„๐—ต๐—ฒ๐—ฟ๐—ฒ๐˜„๐—ถ๐˜๐—ต ๐˜๐—ต๐—ฒ๐˜† ๐˜„๐—ฒ๐—ฎ๐˜ƒ๐—ฒA paradise for a sect; the savage tooFrom forth the loftiest fashion of his sleepGuesses at Heaven; pity these have notTrac'd upon vellum or wild Indian leafThe shadows of melodious utterance.But bare of laurel they live, dream, and die;For Poesy alone can tell her dreams,With the fine spell of words alone can saveImagination from the sable charmAnd dumb enchantment. 
Who alive can say,'Thou art no Poet may'st not tell thy dreams?'Since every man whose soul is not a clodHath visions, and would speak, if he had lovedAnd been well nurtured in his mother tongue.Whether the dream now purpos'd to rehearseBe poet's or fanatic's will be knownWhen this warm scribe my hand is in the grave."] + ]]) +end + function JsonParseInts() DecodeJson[[ [123,456,789] ]] end @@ -192,7 +205,7 @@ function JsonEncodeObject() EncodeJson({["3"]="1", ["4"]="1", ["5"]={["3"]="1", ["4"]="1", ["5"]="9"}}) end -if nil then +function bench() print('JsonParseEmpty', Benchmark(JsonParseEmpty)) print('JsonParseInteg', Benchmark(JsonParseInteger)) print('JsonParseDouble', Benchmark(JsonParseDouble)) @@ -203,4 +216,7 @@ if nil then print('JsonEncodeInts', Benchmark(JsonEncodeInts)) print('JsonEncodeFlts', Benchmark(JsonEncodeFloats)) print('JsonEncodeObj', Benchmark(JsonEncodeObject)) + print('BigString', Benchmark(BigString)) end + +bench() diff --git a/tool/net/help.txt b/tool/net/help.txt index 8430a92b9..fe31cc59c 100644 --- a/tool/net/help.txt +++ b/tool/net/help.txt @@ -713,24 +713,14 @@ FUNCTIONS coerce it to `null` since that's what v8 does, and the same goes for underflows which, like v8, are coerced to 0.0. - This parser does not validate UTF-8 which is copied how the - JSON specifies. It may therefore contain underlong overlong - characters, trojan source and even numbers banned the IETF. - You can use VisualizeControlCodes() and Underlong(), to see - if a string round-trips, to detect these weirdo codepoints. - - This parser does some validation of UTF-16. Consistent with - v8, bad surrogate characters will be silently preserved, as - their original escape sequence text. Thereby ensuring utf-8 - output is valid. Please note that invalid utf-8 could still - happen if it's encoded as utf-8. - When objects are parsed, your Lua object can't preserve the the original ordering of fields. 
As such, they'll be sorted by EncodeJson() and may not round-trip with original intent This parser has perfect conformance with JSONTestSuite. + This parser validates utf-8 and utf-16. + EncodeJson(value[, options:table]) โ”œโ”€โ†’ json:str โ”œโ”€โ†’ true [if useoutput] diff --git a/tool/net/ljson.c b/tool/net/ljson.c index cddbbbee1..3dd5e68fe 100644 --- a/tool/net/ljson.c +++ b/tool/net/ljson.c @@ -39,6 +39,55 @@ #define OBJECT 16 #define DEPTH 64 +#define ASCII 0 +#define C0 1 +#define DQUOTE 2 +#define BACKSLASH 3 +#define UTF8_2 4 +#define UTF8_3 5 +#define UTF8_4 6 +#define C1 7 +#define UTF8_3_E0 8 +#define UTF8_3_ED 9 +#define UTF8_4_F0 10 +#define BADUTF8 11 +#define EVILUTF8 12 + +static const char kJsonStr[256] = { + 1, 1, 1, 1, 1, 1, 1, 1, // 0000 c0 (1); row labels are octal byte offsets, (n) is the class #define value + 1, 1, 1, 1, 1, 1, 1, 1, // 0010 + 1, 1, 1, 1, 1, 1, 1, 1, // 0020 c0 (1) + 1, 1, 1, 1, 1, 1, 1, 1, // 0030 + 0, 0, 2, 0, 0, 0, 0, 0, // 0040 ascii (0), dquote (2) + 0, 0, 0, 0, 0, 0, 0, 0, // 0050 + 0, 0, 0, 0, 0, 0, 0, 0, // 0060 + 0, 0, 0, 0, 0, 0, 0, 0, // 0070 + 0, 0, 0, 0, 0, 0, 0, 0, // 0100 + 0, 0, 0, 0, 0, 0, 0, 0, // 0110 + 0, 0, 0, 0, 0, 0, 0, 0, // 0120 + 0, 0, 0, 0, 3, 0, 0, 0, // 0130 backslash (3) + 0, 0, 0, 0, 0, 0, 0, 0, // 0140 + 0, 0, 0, 0, 0, 0, 0, 0, // 0150 + 0, 0, 0, 0, 0, 0, 0, 0, // 0160 + 0, 0, 0, 0, 0, 0, 0, 0, // 0170 + 7, 7, 7, 7, 7, 7, 7, 7, // 0200 c1 (7) + 7, 7, 7, 7, 7, 7, 7, 7, // 0210 + 7, 7, 7, 7, 7, 7, 7, 7, // 0220 + 7, 7, 7, 7, 7, 7, 7, 7, // 0230 + 11, 11, 11, 11, 11, 11, 11, 11, // 0240 badutf8 (11) + 11, 11, 11, 11, 11, 11, 11, 11, // 0250 + 11, 11, 11, 11, 11, 11, 11, 11, // 0260 + 11, 11, 11, 11, 11, 11, 11, 11, // 0270 + 12, 12, 4, 4, 4, 4, 4, 4, // 0300 evilutf8 (12), utf8-2 (4) + 4, 4, 4, 4, 4, 4, 4, 4, // 0310 + 4, 4, 4, 4, 4, 4, 4, 4, // 0320 utf8-2 + 4, 4, 4, 4, 4, 4, 4, 4, // 0330 + 8, 5, 5, 5, 5, 5, 5, 5, // 0340 utf8-3-e0 (8), utf8-3 (5) + 5, 5, 5, 5, 5, 9, 5, 5, // 0350 utf8-3-ed (9) + 10, 6, 6, 6, 6, 11, 11, 11, // 0360 utf8-4-f0 (10), utf8-4 (6), badutf8 (11) + 11, 11, 11, 11, 11, 11, 11, 11, // 0370 +}; + static struct 
DecodeJson Parse(struct lua_State *L, const char *p, const char *e, int context, int depth) { long x; @@ -251,129 +300,251 @@ static struct DecodeJson Parse(struct lua_State *L, const char *p, reason = "unexpected eof in string"; goto StringFailureWithReason; } - c = *p++ & 255; - if (c == '"') { - luaL_pushresult(&b); - return (struct DecodeJson){1, p}; - } else if (c == '\\') { - goto HandleEscape; - } else if (UNLIKELY(c <= 0x1F)) { - reason = "non-del c0 in string"; - goto StringFailureWithReason; - } else { - luaL_addchar(&b, c); - } - continue; - HandleEscape: - if (p >= e) { - goto UnexpectedEofString; - } - switch ((c = *p++ & 255)) { - case '"': - case '/': - case '\\': + switch (kJsonStr[(c = *p++ & 255)]) { + + case ASCII: luaL_addchar(&b, c); break; - case 'b': - luaL_addchar(&b, '\b'); - break; - case 'f': - luaL_addchar(&b, '\f'); - break; - case 'n': - luaL_addchar(&b, '\n'); - break; - case 'r': - luaL_addchar(&b, '\r'); - break; - case 't': - luaL_addchar(&b, '\t'); - break; - case 'x': - if (p + 2 <= e && // - (A = kHexToInt[p[0] & 255]) != -1 && // HEX - (B = kHexToInt[p[1] & 255]) != -1) { // - c = A << 4 | B; - if (!(0x20 <= c && c <= 0x7E)) { - reason = "hex escape not printable"; - goto StringFailureWithReason; - } - p += 2; - luaL_addchar(&b, c); - break; - } else { - reason = "invalid hex escape"; - goto StringFailureWithReason; + + case DQUOTE: + luaL_pushresult(&b); + return (struct DecodeJson){1, p}; + + case BACKSLASH: + if (p >= e) { + goto UnexpectedEofString; } - case 'u': - if (p + 4 <= e && // - (A = kHexToInt[p[0] & 255]) != -1 && // - (B = kHexToInt[p[1] & 255]) != -1 && // UCS-2 - (C = kHexToInt[p[2] & 255]) != -1 && // - (D = kHexToInt[p[3] & 255]) != -1) { // - c = A << 12 | B << 8 | C << 4 | D; - if (!IsSurrogate(c)) { - p += 4; - } else if (IsHighSurrogate(c)) { - if (p + 4 + 6 <= e && // - p[4] == '\\' && // - p[5] == 'u' && // - (A = kHexToInt[p[6] & 255]) != -1 && // UTF-16 - (B = kHexToInt[p[7] & 255]) != -1 && // - (C = 
kHexToInt[p[8] & 255]) != -1 && // - (D = kHexToInt[p[9] & 255]) != -1) { // - u = A << 12 | B << 8 | C << 4 | D; - if (IsLowSurrogate(u)) { - p += 4 + 6; - c = MergeUtf16(c, u); + switch ((c = *p++ & 255)) { + case '"': + case '/': + case '\\': + luaL_addchar(&b, c); + break; + case 'b': + luaL_addchar(&b, '\b'); + break; + case 'f': + luaL_addchar(&b, '\f'); + break; + case 'n': + luaL_addchar(&b, '\n'); + break; + case 'r': + luaL_addchar(&b, '\r'); + break; + case 't': + luaL_addchar(&b, '\t'); + break; + case 'x': + if (p + 2 <= e && // + (A = kHexToInt[p[0] & 255]) != -1 && // HEX + (B = kHexToInt[p[1] & 255]) != -1) { // + c = A << 4 | B; + if (!(0x20 <= c && c <= 0x7E)) { + reason = "hex escape not printable"; + goto StringFailureWithReason; + } + p += 2; + luaL_addchar(&b, c); + break; + } else { + reason = "invalid hex escape"; + goto StringFailureWithReason; + } + case 'u': + if (p + 4 <= e && // + (A = kHexToInt[p[0] & 255]) != -1 && // + (B = kHexToInt[p[1] & 255]) != -1 && // UCS-2 + (C = kHexToInt[p[2] & 255]) != -1 && // + (D = kHexToInt[p[3] & 255]) != -1) { // + c = A << 12 | B << 8 | C << 4 | D; + if (!IsSurrogate(c)) { + p += 4; + } else if (IsHighSurrogate(c)) { + if (p + 4 + 6 <= e && // + p[4] == '\\' && // + p[5] == 'u' && // + (A = kHexToInt[p[6] & 255]) != -1 && // UTF-16 + (B = kHexToInt[p[7] & 255]) != -1 && // + (C = kHexToInt[p[8] & 255]) != -1 && // + (D = kHexToInt[p[9] & 255]) != -1) { // + u = A << 12 | B << 8 | C << 4 | D; + if (IsLowSurrogate(u)) { + p += 4 + 6; + c = MergeUtf16(c, u); + } else { + goto BadUnicode; + } + } else { + goto BadUnicode; + } } else { goto BadUnicode; } + // UTF-8 + EncodeUtf8: + if (c <= 0x7f) { + w[0] = c; + i = 1; + } else if (c <= 0x7ff) { + w[0] = 0300 | (c >> 6); + w[1] = 0200 | (c & 077); + i = 2; + } else if (c <= 0xffff) { + if (IsSurrogate(c)) { + ReplacementCharacter: + c = 0xfffd; + } + w[0] = 0340 | (c >> 12); + w[1] = 0200 | ((c >> 6) & 077); + w[2] = 0200 | (c & 077); + i = 3; + } else if 
(~(c >> 18) & 007) { + w[0] = 0360 | (c >> 18); + w[1] = 0200 | ((c >> 12) & 077); + w[2] = 0200 | ((c >> 6) & 077); + w[3] = 0200 | (c & 077); + i = 4; + } else { + goto ReplacementCharacter; + } + luaL_addlstring(&b, w, i); } else { - goto BadUnicode; + reason = "invalid unicode escape"; + goto StringFailureWithReason; + BadUnicode: + // Echo invalid \uXXXX sequences + // Rather than corrupting UTF-8! + luaL_addstring(&b, "\\u"); } - } else { - goto BadUnicode; - } - // UTF-8 - if (c <= 0x7f) { - w[0] = c; - i = 1; - } else if (c <= 0x7ff) { - w[0] = 0300 | (c >> 6); - w[1] = 0200 | (c & 077); - i = 2; - } else if (c <= 0xffff) { - if (IsSurrogate(c)) { - ReplacementCharacter: - c = 0xfffd; - } - w[0] = 0340 | (c >> 12); - w[1] = 0200 | ((c >> 6) & 077); - w[2] = 0200 | (c & 077); - i = 3; - } else if (~(c >> 18) & 007) { - w[0] = 0360 | (c >> 18); - w[1] = 0200 | ((c >> 12) & 077); - w[2] = 0200 | ((c >> 6) & 077); - w[3] = 0200 | (c & 077); - i = 4; - } else { - goto ReplacementCharacter; - } - luaL_addlstring(&b, w, i); - } else { - reason = "invalid unicode escape"; - goto StringFailureWithReason; - BadUnicode: - // Echo invalid \uXXXX sequences - // Rather than corrupting UTF-8! 
- luaL_addstring(&b, "\\u"); + break; + default: + reason = "invalid escape character"; + goto StringFailureWithReason; } break; - default: - reason = "invalid escape character"; + + case UTF8_2: + if (p < e && // + (p[0] & 0300) == 0200) { // + c = (c & 037) << 6 | // + (p[0] & 077); // + p += 1; + goto EncodeUtf8; + } else { + reason = "malformed utf-8"; + goto StringFailureWithReason; + } + + case UTF8_3_E0: + if (p + 2 <= e && // + (p[0] & 0377) < 0240 && // + (p[0] & 0300) == 0200 && // + (p[1] & 0300) == 0200) { + reason = "overlong utf-8 0..0x7ff"; + goto StringFailureWithReason; + } + // fallthrough + case UTF8_3: + ThreeUtf8: + if (p + 2 <= e && // + (p[0] & 0300) == 0200 && // + (p[1] & 0300) == 0200) { // + c = (c & 017) << 12 | // + (p[0] & 077) << 6 | // + (p[1] & 077); // + p += 2; + goto EncodeUtf8; + } else { + reason = "malformed utf-8"; + goto StringFailureWithReason; + } + + case UTF8_3_ED: + if (p + 2 <= e && // + (p[0] & 0377) >= 0240) { // + if (p + 5 <= e && // + (p[0] & 0377) >= 0256 && // + (p[1] & 0300) == 0200 && // + (p[2] & 0377) == 0355 && // + (p[3] & 0377) >= 0260 && // + (p[4] & 0300) == 0200) { // + A = (0355 & 017) << 12 | // CESU-8 + (p[0] & 077) << 6 | // + (p[1] & 077); // + B = (0355 & 017) << 12 | // + (p[3] & 077) << 6 | // + (p[4] & 077); // + c = ((A - 0xDB80) << 10) + // + ((B - 0xDC00) + 0x10000); // + goto EncodeUtf8; // NOTE(review): unlike the UTF8_2, UTF8_3 and UTF8_4 arms, p is not advanced past the five bytes decoded here before the jump, so they appear to be re-scanned on the next iteration; confirm whether `p += 5` is missing + } else if ((p[0] & 0300) == 0200 && // + (p[1] & 0300) == 0200) { // + reason = "utf-16 surrogate in utf-8"; + goto StringFailureWithReason; + } else { + reason = "malformed utf-8"; + goto StringFailureWithReason; + } + } + goto ThreeUtf8; + + case UTF8_4_F0: + if (p + 3 <= e && (p[0] & 0377) < 0220 && + (((uint32_t)(p[+2] & 0377) << 030 | + (uint32_t)(p[+1] & 0377) << 020 | + (uint32_t)(p[+0] & 0377) << 010 | + (uint32_t)(p[-1] & 0377) << 000) & + 0xC0C0C000) == 0x80808000) { + reason = "overlong utf-8 0..0xffff"; + goto StringFailureWithReason; + } + // fallthrough + case UTF8_4: + if (p + 3 <= e && // 
+ ((A = ((uint32_t)(p[+2] & 0377) << 030 | // + (uint32_t)(p[+1] & 0377) << 020 | // + (uint32_t)(p[+0] & 0377) << 010 | // + (uint32_t)(p[-1] & 0377) << 000)) & // + 0xC0C0C000) == 0x80808000) { // + A = (A & 7) << 18 | // + (A & (077 << 010)) << (12 - 010) | // + (A & (077 << 020)) >> -(6 - 020) | // + (A & (077 << 030)) >> 030; // + if (A <= 0x10FFFF) { + c = A; + p += 3; + goto EncodeUtf8; + } else { + reason = "utf-8 exceeds utf-16 range"; + goto StringFailureWithReason; + } + } else { + reason = "malformed utf-8"; + goto StringFailureWithReason; + } + + case EVILUTF8: + if (p < e && // + (p[0] & 0300) == 0200) { // + reason = "overlong ascii"; + goto StringFailureWithReason; + } + // fallthrough + case BADUTF8: + reason = "illegal utf-8 character"; goto StringFailureWithReason; + + case C0: + reason = "non-del c0 control code in string"; + goto StringFailureWithReason; + + case C1: + reason = "c1 control code in string"; + goto StringFailureWithReason; + + default: + unreachable; } } unreachable;