Add utf-8 validation to ljson

This commit is contained in:
Justine Tunney 2022-07-15 06:18:32 -07:00
parent ccd057a85d
commit baf51a4a23
4 changed files with 342 additions and 177 deletions

View file

@ -713,24 +713,14 @@ FUNCTIONS
coerce it to `null` since that's what v8 does, and the same
goes for underflows which, like v8, are coerced to 0.0.
This parser does not validate UTF-8 which is copied how the
JSON specifies. It may therefore contain underlong overlong
characters, trojan source and even numbers banned by the IETF.
You can use VisualizeControlCodes() and Underlong(), to see
if a string round-trips, to detect these weirdo codepoints.
This parser does some validation of UTF-16. Consistent with
v8, bad surrogate characters will be silently preserved, as
their original escape sequence text. Thereby ensuring utf-8
output is valid. Please note that invalid utf-8 could still
happen if it's encoded as utf-8.
When objects are parsed, your Lua object can't preserve the
original ordering of fields. As such, they'll be sorted by
EncodeJson() and may not round-trip with original intent.
This parser has perfect conformance with JSONTestSuite.
This parser validates utf-8 and utf-16.
EncodeJson(value[, options:table])
├─→ json:str
├─→ true [if useoutput]

View file

@ -39,6 +39,55 @@
#define OBJECT 16
#define DEPTH 64

// Byte classes stored in kJsonStr, one per possible byte value.
#define ASCII 0       // 0x20..0x7F, except '"' and '\\'
#define C0 1          // 0x00..0x1F
#define DQUOTE 2      // '"'
#define BACKSLASH 3   // '\\'
#define UTF8_2 4      // 0xC2..0xDF
#define UTF8_3 5      // 0xE1..0xEC, 0xEE, 0xEF
#define UTF8_4 6      // 0xF1..0xF4
#define C1 7          // 0x80..0x9F
#define UTF8_3_E0 8   // 0xE0
#define UTF8_3_ED 9   // 0xED
#define UTF8_4_F0 10  // 0xF0
#define BADUTF8 11    // 0xA0..0xBF and 0xF5..0xFF
#define EVILUTF8 12   // 0xC0, 0xC1

// Maps a byte (index 0..255) to one of the byte classes defined above.
// Each row covers eight consecutive byte values; the trailing comment
// gives the octal value of the row's first byte.
static const char kJsonStr[256] = {
    C0, C0, C0, C0, C0, C0, C0, C0,                                   // 0000
    C0, C0, C0, C0, C0, C0, C0, C0,                                   // 0010
    C0, C0, C0, C0, C0, C0, C0, C0,                                   // 0020
    C0, C0, C0, C0, C0, C0, C0, C0,                                   // 0030
    ASCII, ASCII, DQUOTE, ASCII, ASCII, ASCII, ASCII, ASCII,          // 0040
    ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII,           // 0050
    ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII,           // 0060
    ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII,           // 0070
    ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII,           // 0100
    ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII,           // 0110
    ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII,           // 0120
    ASCII, ASCII, ASCII, ASCII, BACKSLASH, ASCII, ASCII, ASCII,       // 0130
    ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII,           // 0140
    ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII,           // 0150
    ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII,           // 0160
    ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII,           // 0170
    C1, C1, C1, C1, C1, C1, C1, C1,                                   // 0200
    C1, C1, C1, C1, C1, C1, C1, C1,                                   // 0210
    C1, C1, C1, C1, C1, C1, C1, C1,                                   // 0220
    C1, C1, C1, C1, C1, C1, C1, C1,                                   // 0230
    BADUTF8, BADUTF8, BADUTF8, BADUTF8,                               // 0240
    BADUTF8, BADUTF8, BADUTF8, BADUTF8,                               //
    BADUTF8, BADUTF8, BADUTF8, BADUTF8,                               // 0250
    BADUTF8, BADUTF8, BADUTF8, BADUTF8,                               //
    BADUTF8, BADUTF8, BADUTF8, BADUTF8,                               // 0260
    BADUTF8, BADUTF8, BADUTF8, BADUTF8,                               //
    BADUTF8, BADUTF8, BADUTF8, BADUTF8,                               // 0270
    BADUTF8, BADUTF8, BADUTF8, BADUTF8,                               //
    EVILUTF8, EVILUTF8, UTF8_2, UTF8_2, UTF8_2, UTF8_2, UTF8_2, UTF8_2,  // 0300
    UTF8_2, UTF8_2, UTF8_2, UTF8_2, UTF8_2, UTF8_2, UTF8_2, UTF8_2,   // 0310
    UTF8_2, UTF8_2, UTF8_2, UTF8_2, UTF8_2, UTF8_2, UTF8_2, UTF8_2,   // 0320
    UTF8_2, UTF8_2, UTF8_2, UTF8_2, UTF8_2, UTF8_2, UTF8_2, UTF8_2,   // 0330
    UTF8_3_E0, UTF8_3, UTF8_3, UTF8_3, UTF8_3, UTF8_3, UTF8_3, UTF8_3,  // 0340
    UTF8_3, UTF8_3, UTF8_3, UTF8_3, UTF8_3, UTF8_3_ED, UTF8_3, UTF8_3,  // 0350
    UTF8_4_F0, UTF8_4, UTF8_4, UTF8_4, UTF8_4, BADUTF8, BADUTF8, BADUTF8,  // 0360
    BADUTF8, BADUTF8, BADUTF8, BADUTF8,                               // 0370
    BADUTF8, BADUTF8, BADUTF8, BADUTF8,                               //
};
static struct DecodeJson Parse(struct lua_State *L, const char *p,
const char *e, int context, int depth) {
long x;
@ -251,129 +300,251 @@ static struct DecodeJson Parse(struct lua_State *L, const char *p,
reason = "unexpected eof in string";
goto StringFailureWithReason;
}
c = *p++ & 255;
if (c == '"') {
luaL_pushresult(&b);
return (struct DecodeJson){1, p};
} else if (c == '\\') {
goto HandleEscape;
} else if (UNLIKELY(c <= 0x1F)) {
reason = "non-del c0 in string";
goto StringFailureWithReason;
} else {
luaL_addchar(&b, c);
}
continue;
HandleEscape:
if (p >= e) {
goto UnexpectedEofString;
}
switch ((c = *p++ & 255)) {
case '"':
case '/':
case '\\':
switch (kJsonStr[(c = *p++ & 255)]) {
case ASCII:
luaL_addchar(&b, c);
break;
case 'b':
luaL_addchar(&b, '\b');
break;
case 'f':
luaL_addchar(&b, '\f');
break;
case 'n':
luaL_addchar(&b, '\n');
break;
case 'r':
luaL_addchar(&b, '\r');
break;
case 't':
luaL_addchar(&b, '\t');
break;
case 'x':
if (p + 2 <= e && //
(A = kHexToInt[p[0] & 255]) != -1 && // HEX
(B = kHexToInt[p[1] & 255]) != -1) { //
c = A << 4 | B;
if (!(0x20 <= c && c <= 0x7E)) {
reason = "hex escape not printable";
goto StringFailureWithReason;
}
p += 2;
luaL_addchar(&b, c);
break;
} else {
reason = "invalid hex escape";
goto StringFailureWithReason;
case DQUOTE:
luaL_pushresult(&b);
return (struct DecodeJson){1, p};
case BACKSLASH:
if (p >= e) {
goto UnexpectedEofString;
}
case 'u':
if (p + 4 <= e && //
(A = kHexToInt[p[0] & 255]) != -1 && //
(B = kHexToInt[p[1] & 255]) != -1 && // UCS-2
(C = kHexToInt[p[2] & 255]) != -1 && //
(D = kHexToInt[p[3] & 255]) != -1) { //
c = A << 12 | B << 8 | C << 4 | D;
if (!IsSurrogate(c)) {
p += 4;
} else if (IsHighSurrogate(c)) {
if (p + 4 + 6 <= e && //
p[4] == '\\' && //
p[5] == 'u' && //
(A = kHexToInt[p[6] & 255]) != -1 && // UTF-16
(B = kHexToInt[p[7] & 255]) != -1 && //
(C = kHexToInt[p[8] & 255]) != -1 && //
(D = kHexToInt[p[9] & 255]) != -1) { //
u = A << 12 | B << 8 | C << 4 | D;
if (IsLowSurrogate(u)) {
p += 4 + 6;
c = MergeUtf16(c, u);
switch ((c = *p++ & 255)) {
case '"':
case '/':
case '\\':
luaL_addchar(&b, c);
break;
case 'b':
luaL_addchar(&b, '\b');
break;
case 'f':
luaL_addchar(&b, '\f');
break;
case 'n':
luaL_addchar(&b, '\n');
break;
case 'r':
luaL_addchar(&b, '\r');
break;
case 't':
luaL_addchar(&b, '\t');
break;
case 'x':
if (p + 2 <= e && //
(A = kHexToInt[p[0] & 255]) != -1 && // HEX
(B = kHexToInt[p[1] & 255]) != -1) { //
c = A << 4 | B;
if (!(0x20 <= c && c <= 0x7E)) {
reason = "hex escape not printable";
goto StringFailureWithReason;
}
p += 2;
luaL_addchar(&b, c);
break;
} else {
reason = "invalid hex escape";
goto StringFailureWithReason;
}
case 'u':
if (p + 4 <= e && //
(A = kHexToInt[p[0] & 255]) != -1 && //
(B = kHexToInt[p[1] & 255]) != -1 && // UCS-2
(C = kHexToInt[p[2] & 255]) != -1 && //
(D = kHexToInt[p[3] & 255]) != -1) { //
c = A << 12 | B << 8 | C << 4 | D;
if (!IsSurrogate(c)) {
p += 4;
} else if (IsHighSurrogate(c)) {
if (p + 4 + 6 <= e && //
p[4] == '\\' && //
p[5] == 'u' && //
(A = kHexToInt[p[6] & 255]) != -1 && // UTF-16
(B = kHexToInt[p[7] & 255]) != -1 && //
(C = kHexToInt[p[8] & 255]) != -1 && //
(D = kHexToInt[p[9] & 255]) != -1) { //
u = A << 12 | B << 8 | C << 4 | D;
if (IsLowSurrogate(u)) {
p += 4 + 6;
c = MergeUtf16(c, u);
} else {
goto BadUnicode;
}
} else {
goto BadUnicode;
}
} else {
goto BadUnicode;
}
// UTF-8
EncodeUtf8:
if (c <= 0x7f) {
w[0] = c;
i = 1;
} else if (c <= 0x7ff) {
w[0] = 0300 | (c >> 6);
w[1] = 0200 | (c & 077);
i = 2;
} else if (c <= 0xffff) {
if (IsSurrogate(c)) {
ReplacementCharacter:
c = 0xfffd;
}
w[0] = 0340 | (c >> 12);
w[1] = 0200 | ((c >> 6) & 077);
w[2] = 0200 | (c & 077);
i = 3;
} else if (~(c >> 18) & 007) {
w[0] = 0360 | (c >> 18);
w[1] = 0200 | ((c >> 12) & 077);
w[2] = 0200 | ((c >> 6) & 077);
w[3] = 0200 | (c & 077);
i = 4;
} else {
goto ReplacementCharacter;
}
luaL_addlstring(&b, w, i);
} else {
goto BadUnicode;
reason = "invalid unicode escape";
goto StringFailureWithReason;
BadUnicode:
// Echo invalid \uXXXX sequences
// Rather than corrupting UTF-8!
luaL_addstring(&b, "\\u");
}
} else {
goto BadUnicode;
}
// UTF-8
if (c <= 0x7f) {
w[0] = c;
i = 1;
} else if (c <= 0x7ff) {
w[0] = 0300 | (c >> 6);
w[1] = 0200 | (c & 077);
i = 2;
} else if (c <= 0xffff) {
if (IsSurrogate(c)) {
ReplacementCharacter:
c = 0xfffd;
}
w[0] = 0340 | (c >> 12);
w[1] = 0200 | ((c >> 6) & 077);
w[2] = 0200 | (c & 077);
i = 3;
} else if (~(c >> 18) & 007) {
w[0] = 0360 | (c >> 18);
w[1] = 0200 | ((c >> 12) & 077);
w[2] = 0200 | ((c >> 6) & 077);
w[3] = 0200 | (c & 077);
i = 4;
} else {
goto ReplacementCharacter;
}
luaL_addlstring(&b, w, i);
} else {
reason = "invalid unicode escape";
goto StringFailureWithReason;
BadUnicode:
// Echo invalid \uXXXX sequences
// Rather than corrupting UTF-8!
luaL_addstring(&b, "\\u");
break;
default:
reason = "invalid escape character";
goto StringFailureWithReason;
}
break;
default:
reason = "invalid escape character";
case UTF8_2:
if (p < e && //
(p[0] & 0300) == 0200) { //
c = (c & 037) << 6 | //
(p[0] & 077); //
p += 1;
goto EncodeUtf8;
} else {
reason = "malformed utf-8";
goto StringFailureWithReason;
}
case UTF8_3_E0:
if (p + 2 <= e && //
(p[0] & 0377) < 0240 && //
(p[0] & 0300) == 0200 && //
(p[1] & 0300) == 0200) {
reason = "overlong utf-8 0..0x7ff";
goto StringFailureWithReason;
}
// fallthrough
case UTF8_3:
ThreeUtf8:
if (p + 2 <= e && //
(p[0] & 0300) == 0200 && //
(p[1] & 0300) == 0200) { //
c = (c & 017) << 12 | //
(p[0] & 077) << 6 | //
(p[1] & 077); //
p += 2;
goto EncodeUtf8;
} else {
reason = "malformed utf-8";
goto StringFailureWithReason;
}
case UTF8_3_ED:
if (p + 2 <= e && //
(p[0] & 0377) >= 0240) { //
if (p + 5 <= e && //
(p[0] & 0377) >= 0256 && //
(p[1] & 0300) == 0200 && //
(p[2] & 0377) == 0355 && //
(p[3] & 0377) >= 0260 && //
(p[4] & 0300) == 0200) { //
A = (0355 & 017) << 12 | // CESU-8
(p[0] & 077) << 6 | //
(p[1] & 077); //
B = (0355 & 017) << 12 | //
(p[3] & 077) << 6 | //
(p[4] & 077); //
c = ((A - 0xDB80) << 10) + //
((B - 0xDC00) + 0x10000); //
goto EncodeUtf8;
} else if ((p[0] & 0300) == 0200 && //
(p[1] & 0300) == 0200) { //
reason = "utf-16 surrogate in utf-8";
goto StringFailureWithReason;
} else {
reason = "malformed utf-8";
goto StringFailureWithReason;
}
}
goto ThreeUtf8;
case UTF8_4_F0:
if (p + 3 <= e && (p[0] & 0377) < 0220 &&
(((uint32_t)(p[+2] & 0377) << 030 |
(uint32_t)(p[+1] & 0377) << 020 |
(uint32_t)(p[+0] & 0377) << 010 |
(uint32_t)(p[-1] & 0377) << 000) &
0xC0C0C000) == 0x80808000) {
reason = "overlong utf-8 0..0xffff";
goto StringFailureWithReason;
}
// fallthrough
case UTF8_4:
if (p + 3 <= e && //
((A = ((uint32_t)(p[+2] & 0377) << 030 | //
(uint32_t)(p[+1] & 0377) << 020 | //
(uint32_t)(p[+0] & 0377) << 010 | //
(uint32_t)(p[-1] & 0377) << 000)) & //
0xC0C0C000) == 0x80808000) { //
A = (A & 7) << 18 | //
(A & (077 << 010)) << (12 - 010) | //
(A & (077 << 020)) >> -(6 - 020) | //
(A & (077 << 030)) >> 030; //
if (A <= 0x10FFFF) {
c = A;
p += 3;
goto EncodeUtf8;
} else {
reason = "utf-8 exceeds utf-16 range";
goto StringFailureWithReason;
}
} else {
reason = "malformed utf-8";
goto StringFailureWithReason;
}
case EVILUTF8:
if (p < e && //
(p[0] & 0300) == 0200) { //
reason = "overlong ascii";
goto StringFailureWithReason;
}
// fallthrough
case BADUTF8:
reason = "illegal utf-8 character";
goto StringFailureWithReason;
case C0:
reason = "non-del c0 control code in string";
goto StringFailureWithReason;
case C1:
reason = "c1 control code in string";
goto StringFailureWithReason;
default:
unreachable;
}
}
unreachable;