Audit every single JSON test

This commit is contained in:
Justine Tunney 2022-07-12 12:30:42 -07:00
parent 7965ed0232
commit 3f3e7e92d7
17 changed files with 473 additions and 285 deletions

View file

@ -679,21 +679,61 @@ FUNCTIONS
├─→ double
├─→ array
├─→ object
├─→ false
├─→ true
├─→ nil
└─→ nil, error:str
Turns JSON string into a Lua data structure.
This is a very permissive parser. That means it should always
parse correctly formatted JSON correctly. However it will not
complain if the `input` string is weirdly formatted. There is
currently no validation performed, other than what we need to
ensure security. For example `{3=4}` will decode as `{[3]=4}`
even though that structure won't round-trip with `EncodeJson`
since redbean won't generate invalid JSON (see Postel's Law).
This is a generally permissive parser, in the sense that like
v8, it permits scalars as top-level values. Therefore we must
note that this API can be thought of as special, in the sense
This parser permits top-level values regardless of type, with
the exception of `false`, `null`, and absent.
val = assert(DecodeJson(str))
will usually do the right thing, except in cases where false
or null is the top-level value. In those cases, you need to
check the second value too, to tell the result from an error:
val, err = DecodeJson(str)
if not val then
if err then
print('bad json', err)
elseif val == nil then
print('val is null')
elseif val == false then
print('val is false')
end
end
This parser supports 64-bit signed integers. If an overflow
happens, then the integer is silently coerced to double, as
consistent with v8. If a double overflows into Infinity, we
coerce it to `null` since that's what v8 does, and the same
goes for underflows which, like v8, are coerced to 0.0.
This parser does not validate UTF-8, which is copied exactly
as it appears in the JSON. It may therefore contain overlong
characters, trojan source and even numbers banned by the IETF.
You can use VisualizeControlCodes() and Underlong(), to see
if a string round-trips, to detect these weirdo codepoints.
This parser does some validation of UTF-16. Consistent with
v8, bad surrogate characters will be silently preserved, as
their original escape sequence text. Thereby ensuring utf-8
output is valid. Please note that invalid utf-8 could still
happen if it's encoded as utf-8.
This parser is lenient about commas and colons. For example
it's permissible to say `DecodeJson('[1 2 3 4]')`. Trailing
commas are allowed. Even prefix commas are allowed. However
it's not recommended that you rely on this behavior, and it
won't round-trip with EncodeJson() currently.
When objects are parsed, your Lua object can't preserve the
original ordering of fields. As such, they'll be sorted by
EncodeJson() and may not round-trip with original intent.
EncodeJson(value[,options:table])
├─→ json:str
@ -726,6 +766,8 @@ FUNCTIONS
When arrays and objects are serialized, entries will be sorted
in a deterministic order.
This parser does not support UTF-8
EncodeLua(value[,options:table])
├─→ luacode:str
├─→ true [if useoutput]
@ -1385,10 +1427,10 @@ FUNCTIONS
access log and message logging.
VisualizeControlCodes(str) → str
Replaces C0 control codes with their UNICODE pictures
representation. This function also canonicalizes overlong
encodings. C1 control codes are replaced with a JavaScript-like
escape sequence.
Replaces C0 control codes and trojan source characters with
descriptive UNICODE pictorial representation. This function
also canonicalizes overlong encodings. C1 control codes are
replaced with a JavaScript-like escape sequence.
Underlong(str) → str
Canonicalizes overlong encodings.

View file

@ -19,7 +19,9 @@
#include "libc/bits/bits.h"
#include "libc/bits/likely.h"
#include "libc/intrin/kprintf.h"
#include "libc/log/check.h"
#include "libc/log/log.h"
#include "libc/str/str.h"
#include "libc/str/tpenc.h"
#include "libc/str/utf16.h"
#include "third_party/double-conversion/wrapper.h"
@ -42,6 +44,7 @@ static struct DecodeJson Parse(struct lua_State *L, const char *p,
char w[4];
const char *a;
luaL_Buffer b;
const char *reason;
struct DecodeJson r;
int A, B, C, D, c, d, i, u;
if (UNLIKELY(!--depth)) {
@ -74,9 +77,6 @@ static struct DecodeJson Parse(struct lua_State *L, const char *p,
case 'n': // null
if (UNLIKELY(context == OBJECT_KEY)) goto BadObjectKey;
if (UNLIKELY(context == TOP_LEVEL)) {
return (struct DecodeJson){-1, "toplevel json can't be null"};
}
if (p + 3 <= e && READ32LE(p - 1) == READ32LE("null")) {
lua_pushnil(L);
return (struct DecodeJson){1, p + 3};
@ -86,9 +86,6 @@ static struct DecodeJson Parse(struct lua_State *L, const char *p,
case 'f': // false
if (UNLIKELY(context == OBJECT_KEY)) goto BadObjectKey;
if (UNLIKELY(context == TOP_LEVEL)) {
return (struct DecodeJson){-1, "toplevel json can't be false"};
}
if (p + 4 <= e && READ32LE(p) == READ32LE("alse")) {
lua_pushboolean(L, false);
return (struct DecodeJson){1, p + 4};
@ -105,15 +102,26 @@ static struct DecodeJson Parse(struct lua_State *L, const char *p,
goto IllegalCharacter;
}
BadObjectKey:
return (struct DecodeJson){-1, "object key must be string"};
case '-': // negative
if (UNLIKELY(context == OBJECT_KEY)) goto BadObjectKey;
d = -1;
break;
if (p < e && isdigit(*p)) {
d = -1;
break;
} else {
return (struct DecodeJson){-1, "bad negative"};
}
case '0': // zero or number
if (UNLIKELY(context == OBJECT_KEY)) goto BadObjectKey;
if (p < e && (*p == '.' || *p == 'e' || *p == 'E')) {
goto UseDubble;
if (p < e) {
if ((*p == '.' || *p == 'e' || *p == 'E')) {
goto UseDubble;
} else if (isdigit(*p)) {
return (struct DecodeJson){-1, "unexpected octal"};
}
}
lua_pushinteger(L, 0);
return (struct DecodeJson){1, p};
@ -138,6 +146,7 @@ static struct DecodeJson Parse(struct lua_State *L, const char *p,
UseDubble: // number
lua_pushnumber(L, StringToDouble(a, e - a, &c));
DCHECK(c > 0, "paranoid avoiding infinite loop");
return (struct DecodeJson){1, a + c};
case '[': // Array
@ -206,134 +215,146 @@ static struct DecodeJson Parse(struct lua_State *L, const char *p,
case '"': // string
luaL_buffinit(L, &b);
while (p < e) {
for (;;) {
if (UNLIKELY(p >= e)) {
UnexpectedEofString:
reason = "unexpected eof in string";
goto StringFailureWithReason;
}
c = *p++ & 255;
if (c == '"') {
luaL_pushresult(&b);
return (struct DecodeJson){1, p};
} else if (c == '\\') {
goto HandleEscape;
} else if (UNLIKELY(c <= 0x1F)) {
reason = "non-del c0 in string";
goto StringFailureWithReason;
} else {
luaL_addchar(&b, c);
}
continue;
HandleEscape:
if (UNLIKELY(p >= e)) {
goto UnexpectedEofString;
}
switch ((c = *p++ & 255)) {
default:
AddByte:
case '"':
case '/':
case '\\':
luaL_addchar(&b, c);
break;
case '\\':
if (p < e) {
switch ((c = *p++ & 255)) {
default:
goto InvalidEscapeCharacter;
case '"':
case '/':
case '\\':
goto AddByte;
case 'b':
c = '\b';
goto AddByte;
case 'f':
c = '\f';
goto AddByte;
case 'n':
c = '\n';
goto AddByte;
case 'r':
c = '\r';
goto AddByte;
case 't':
c = '\t';
goto AddByte;
case 'x':
if (p + 2 <= e && //
(A = kHexToInt[p[0] & 255]) != -1 && // HEX
(B = kHexToInt[p[1] & 255]) != -1) { //
c = A << 4 | B;
p += 2;
goto AddByte;
} else {
goto InvalidEscapeCharacter;
}
case 'u':
if (p + 4 <= e && //
(A = kHexToInt[p[0] & 255]) != -1 && //
(B = kHexToInt[p[1] & 255]) != -1 && // UCS-2
(C = kHexToInt[p[2] & 255]) != -1 && //
(D = kHexToInt[p[3] & 255]) != -1) { //
c = A << 12 | B << 8 | C << 4 | D;
if (!IsSurrogate(c)) {
p += 4;
} else if (IsHighSurrogate(c)) {
if (p + 4 + 6 <= e && //
p[4] == '\\' && //
p[5] == 'u' && //
(A = kHexToInt[p[6] & 255]) != -1 && // UTF-16
(B = kHexToInt[p[7] & 255]) != -1 && //
(C = kHexToInt[p[8] & 255]) != -1 && //
(D = kHexToInt[p[9] & 255]) != -1) { //
u = A << 12 | B << 8 | C << 4 | D;
if (IsLowSurrogate(u)) {
p += 4 + 6;
c = MergeUtf16(c, u);
} else {
goto BadUnicode;
}
} else {
goto BadUnicode;
}
} else {
goto BadUnicode;
}
// UTF-8
if (c < 0x7f) {
w[0] = c;
i = 1;
} else if (c <= 0x7ff) {
w[0] = 0300 | (c >> 6);
w[1] = 0200 | (c & 077);
i = 2;
} else if (c <= 0xffff) {
if (UNLIKELY(IsSurrogate(c))) {
ReplacementCharacter:
c = 0xfffd;
}
w[0] = 0340 | (c >> 12);
w[1] = 0200 | ((c >> 6) & 077);
w[2] = 0200 | (c & 077);
i = 3;
} else if (~(c >> 18) & 007) {
w[0] = 0360 | (c >> 18);
w[1] = 0200 | ((c >> 12) & 077);
w[2] = 0200 | ((c >> 6) & 077);
w[3] = 0200 | (c & 077);
i = 4;
} else {
goto ReplacementCharacter;
}
luaL_addlstring(&b, w, i);
} else {
goto InvalidEscapeCharacter;
BadUnicode:
// Echo invalid \uXXXX sequences
// Rather than corrupting UTF-8!
luaL_addstring(&b, "\\u");
}
break;
case 'b':
luaL_addchar(&b, '\b');
break;
case 'f':
luaL_addchar(&b, '\f');
break;
case 'n':
luaL_addchar(&b, '\n');
break;
case 'r':
luaL_addchar(&b, '\r');
break;
case 't':
luaL_addchar(&b, '\t');
break;
case 'x':
if (p + 2 <= e && //
(A = kHexToInt[p[0] & 255]) != -1 && // HEX
(B = kHexToInt[p[1] & 255]) != -1) { //
c = A << 4 | B;
if (!(0x20 <= c && c <= 0x7E)) {
reason = "hex escape not printable";
goto StringFailureWithReason;
}
p += 2;
luaL_addchar(&b, c);
break;
} else {
goto InvalidEscapeCharacter;
reason = "invalid hex escape";
goto StringFailureWithReason;
}
case 'u':
if (p + 4 <= e && //
(A = kHexToInt[p[0] & 255]) != -1 && //
(B = kHexToInt[p[1] & 255]) != -1 && // UCS-2
(C = kHexToInt[p[2] & 255]) != -1 && //
(D = kHexToInt[p[3] & 255]) != -1) { //
c = A << 12 | B << 8 | C << 4 | D;
if (!IsSurrogate(c)) {
p += 4;
} else if (IsHighSurrogate(c)) {
if (p + 4 + 6 <= e && //
p[4] == '\\' && //
p[5] == 'u' && //
(A = kHexToInt[p[6] & 255]) != -1 && // UTF-16
(B = kHexToInt[p[7] & 255]) != -1 && //
(C = kHexToInt[p[8] & 255]) != -1 && //
(D = kHexToInt[p[9] & 255]) != -1) { //
u = A << 12 | B << 8 | C << 4 | D;
if (IsLowSurrogate(u)) {
p += 4 + 6;
c = MergeUtf16(c, u);
} else {
goto BadUnicode;
}
} else {
goto BadUnicode;
}
} else {
goto BadUnicode;
}
// UTF-8
if (c < 0x7f) {
w[0] = c;
i = 1;
} else if (c <= 0x7ff) {
w[0] = 0300 | (c >> 6);
w[1] = 0200 | (c & 077);
i = 2;
} else if (c <= 0xffff) {
if (UNLIKELY(IsSurrogate(c))) {
ReplacementCharacter:
c = 0xfffd;
}
w[0] = 0340 | (c >> 12);
w[1] = 0200 | ((c >> 6) & 077);
w[2] = 0200 | (c & 077);
i = 3;
} else if (~(c >> 18) & 007) {
w[0] = 0360 | (c >> 18);
w[1] = 0200 | ((c >> 12) & 077);
w[2] = 0200 | ((c >> 6) & 077);
w[3] = 0200 | (c & 077);
i = 4;
} else {
goto ReplacementCharacter;
}
luaL_addlstring(&b, w, i);
} else {
reason = "invalid unicode escape";
goto StringFailureWithReason;
BadUnicode:
// Echo invalid \uXXXX sequences
// Rather than corrupting UTF-8!
luaL_addstring(&b, "\\u");
}
break;
case '"':
luaL_pushresult(&b);
return (struct DecodeJson){1, p};
default:
reason = "invalid escape character";
goto StringFailureWithReason;
}
}
break;
StringFailureWithReason:
luaL_pushresultsize(&b, 0);
lua_pop(L, 1);
return (struct DecodeJson){-1, "unexpected eof in string"};
return (struct DecodeJson){-1, reason};
default:
IllegalCharacter:
return (struct DecodeJson){-1, "illegal character"};
BadObjectKey:
return (struct DecodeJson){-1, "object key must be string"};
InvalidEscapeCharacter:
luaL_pushresultsize(&b, 0);
lua_pop(L, 1);
return (struct DecodeJson){-1, "invalid escape character"};
}
}
if (UNLIKELY(context == TOP_LEVEL)) {
@ -357,16 +378,14 @@ static struct DecodeJson Parse(struct lua_State *L, const char *p,
* converted to a floating-point number instead. Invalid surrogate
* escape sequences in strings won't be decoded.
*
* A weird case exists when parsing empty objects. In order to let Lua
* tell them apart from empty arrays, we insert a special key that's
* ignored by our JSON serializer, called `[__json_object__]=true`.
*
* @param L is Lua interpreter state
* @param p is input string
* @param n is byte length of `p` or -1 for automatic strlen()
* @return res.rc is 1 if value pushed, 0 on eof, otherwise -1
* @return res.p is is advanced `p` pointer if `rc` isn't -1
* @return res.p is string describing error if `rc` is -1
* @return r.rc is 1 if value is pushed on lua stack
* @return r.rc is 0 on eof
* @return r.rc is -1 on error
* @return r.p is advanced `p` pointer if `rc >= 0`
* @return r.p is string describing error if `rc < 0`
*/
struct DecodeJson DecodeJson(struct lua_State *L, const char *p, size_t n) {
if (n == -1) n = p ? strlen(p) : 0;