mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-07-15 23:29:56 +00:00
Audit every single JSON test
This commit is contained in:
parent
7965ed0232
commit
3f3e7e92d7
17 changed files with 473 additions and 285 deletions
|
@ -679,21 +679,61 @@ FUNCTIONS
|
|||
├─→ double
|
||||
├─→ array
|
||||
├─→ object
|
||||
├─→ false
|
||||
├─→ true
|
||||
├─→ nil
|
||||
└─→ nil, error:str
|
||||
|
||||
Turns JSON string into a Lua data structure.
|
||||
|
||||
This is a very permissive parser. That means it should always
|
||||
parse correctly formatted JSON correctly. However it will not
|
||||
complain if the `input` string is weirdly formatted. There is
|
||||
currently no validation performed, other than what we need to
|
||||
ensure security. For example `{3=4}` will decode as `{[3]=4}`
|
||||
even though that structure won't round-trip with `EncodeJson`
|
||||
since redbean won't generate invalid JSON (see Postel's Law).
|
||||
This is a generally permissive parser, in the sense that like
|
||||
v8, it permits scalars as top-level values. Therefore we must
|
||||
note that this API can be thought of as special, in the sense
|
||||
|
||||
This parser permits top-level values regardless of type, with
|
||||
the exception of `false`, `null`, and absent.
|
||||
val = assert(DecodeJson(str))
|
||||
|
||||
will usually do the right thing, except in cases where false
|
||||
or null are the top-level value. In those cases, it's needed
|
||||
to check the second value too in order to discern from error
|
||||
|
||||
val, err = DecodeJson(str)
|
||||
if not val then
|
||||
if err then
|
||||
print('bad json', err)
|
||||
elseif val == nil then
|
||||
print('val is null')
|
||||
elseif val == false then
|
||||
print('val is false')
|
||||
end
|
||||
end
|
||||
|
||||
This parser supports 64-bit signed integers. If an overflow
|
||||
happens, then the integer is silently coerced to double, as
|
||||
consistent with v8. If a double overflows into Infinity, we
|
||||
coerce it to `null` since that's what v8 does, and the same
|
||||
goes for underflows which, like v8, are coerced to 0.0.
|
||||
|
||||
This parser does not validate UTF-8, which is copied as the
|
||||
JSON specifies. It may therefore contain underlong overlong
|
||||
characters, trojan source and even numbers banned by the IETF.
|
||||
You can use VisualizeControlCodes() and Underlong() to see
|
||||
if a string round-trips, to detect these weirdo codepoints.
|
||||
|
||||
This parser does some validation of UTF-16. Consistent with
|
||||
v8, bad surrogate characters will be silently preserved, as
|
||||
their original escape sequence text. Thereby ensuring utf-8
|
||||
output is valid. Please note that invalid utf-8 could still
|
||||
happen if it's encoded as utf-8.
|
||||
|
||||
This parser is lenient about commas and colons. For example
|
||||
it's permissible to say `DecodeJson('[1 2 3 4]')`. Trailing
|
||||
commas are allowed. Even prefix commas are allowed. However
|
||||
it's not recommended that you rely on this behavior, and it
|
||||
won't round-trip with EncodeJson() currently.
|
||||
|
||||
When objects are parsed, your Lua object can't preserve the
|
||||
original ordering of fields. As such, they'll be sorted
|
||||
by EncodeJson() and may not round-trip with original intent.
|
||||
|
||||
EncodeJson(value[,options:table])
|
||||
├─→ json:str
|
||||
|
@ -726,6 +766,8 @@ FUNCTIONS
|
|||
When arrays and objects are serialized, entries will be sorted
|
||||
in a deterministic order.
|
||||
|
||||
This parser does not support UTF-8
|
||||
|
||||
EncodeLua(value[,options:table])
|
||||
├─→ luacode:str
|
||||
├─→ true [if useoutput]
|
||||
|
@ -1385,10 +1427,10 @@ FUNCTIONS
|
|||
access log and message logging.
|
||||
|
||||
VisualizeControlCodes(str) → str
|
||||
Replaces C0 control codes with their UNICODE pictures
|
||||
representation. This function also canonicalizes overlong
|
||||
encodings. C1 control codes are replaced with a JavaScript-like
|
||||
escape sequence.
|
||||
Replaces C0 control codes and trojan source characters with
|
||||
descriptive UNICODE pictorial representation. This function
|
||||
also canonicalizes overlong encodings. C1 control codes are
|
||||
replaced with a JavaScript-like escape sequence.
|
||||
|
||||
Underlong(str) → str
|
||||
Canonicalizes overlong encodings.
|
||||
|
|
279
tool/net/ljson.c
279
tool/net/ljson.c
|
@ -19,7 +19,9 @@
|
|||
#include "libc/bits/bits.h"
|
||||
#include "libc/bits/likely.h"
|
||||
#include "libc/intrin/kprintf.h"
|
||||
#include "libc/log/check.h"
|
||||
#include "libc/log/log.h"
|
||||
#include "libc/str/str.h"
|
||||
#include "libc/str/tpenc.h"
|
||||
#include "libc/str/utf16.h"
|
||||
#include "third_party/double-conversion/wrapper.h"
|
||||
|
@ -42,6 +44,7 @@ static struct DecodeJson Parse(struct lua_State *L, const char *p,
|
|||
char w[4];
|
||||
const char *a;
|
||||
luaL_Buffer b;
|
||||
const char *reason;
|
||||
struct DecodeJson r;
|
||||
int A, B, C, D, c, d, i, u;
|
||||
if (UNLIKELY(!--depth)) {
|
||||
|
@ -74,9 +77,6 @@ static struct DecodeJson Parse(struct lua_State *L, const char *p,
|
|||
|
||||
case 'n': // null
|
||||
if (UNLIKELY(context == OBJECT_KEY)) goto BadObjectKey;
|
||||
if (UNLIKELY(context == TOP_LEVEL)) {
|
||||
return (struct DecodeJson){-1, "toplevel json can't be null"};
|
||||
}
|
||||
if (p + 3 <= e && READ32LE(p - 1) == READ32LE("null")) {
|
||||
lua_pushnil(L);
|
||||
return (struct DecodeJson){1, p + 3};
|
||||
|
@ -86,9 +86,6 @@ static struct DecodeJson Parse(struct lua_State *L, const char *p,
|
|||
|
||||
case 'f': // false
|
||||
if (UNLIKELY(context == OBJECT_KEY)) goto BadObjectKey;
|
||||
if (UNLIKELY(context == TOP_LEVEL)) {
|
||||
return (struct DecodeJson){-1, "toplevel json can't be false"};
|
||||
}
|
||||
if (p + 4 <= e && READ32LE(p) == READ32LE("alse")) {
|
||||
lua_pushboolean(L, false);
|
||||
return (struct DecodeJson){1, p + 4};
|
||||
|
@ -105,15 +102,26 @@ static struct DecodeJson Parse(struct lua_State *L, const char *p,
|
|||
goto IllegalCharacter;
|
||||
}
|
||||
|
||||
BadObjectKey:
|
||||
return (struct DecodeJson){-1, "object key must be string"};
|
||||
|
||||
case '-': // negative
|
||||
if (UNLIKELY(context == OBJECT_KEY)) goto BadObjectKey;
|
||||
d = -1;
|
||||
break;
|
||||
if (p < e && isdigit(*p)) {
|
||||
d = -1;
|
||||
break;
|
||||
} else {
|
||||
return (struct DecodeJson){-1, "bad negative"};
|
||||
}
|
||||
|
||||
case '0': // zero or number
|
||||
if (UNLIKELY(context == OBJECT_KEY)) goto BadObjectKey;
|
||||
if (p < e && (*p == '.' || *p == 'e' || *p == 'E')) {
|
||||
goto UseDubble;
|
||||
if (p < e) {
|
||||
if ((*p == '.' || *p == 'e' || *p == 'E')) {
|
||||
goto UseDubble;
|
||||
} else if (isdigit(*p)) {
|
||||
return (struct DecodeJson){-1, "unexpected octal"};
|
||||
}
|
||||
}
|
||||
lua_pushinteger(L, 0);
|
||||
return (struct DecodeJson){1, p};
|
||||
|
@ -138,6 +146,7 @@ static struct DecodeJson Parse(struct lua_State *L, const char *p,
|
|||
|
||||
UseDubble: // number
|
||||
lua_pushnumber(L, StringToDouble(a, e - a, &c));
|
||||
DCHECK(c > 0, "paranoid avoiding infinite loop");
|
||||
return (struct DecodeJson){1, a + c};
|
||||
|
||||
case '[': // Array
|
||||
|
@ -206,134 +215,146 @@ static struct DecodeJson Parse(struct lua_State *L, const char *p,
|
|||
|
||||
case '"': // string
|
||||
luaL_buffinit(L, &b);
|
||||
while (p < e) {
|
||||
for (;;) {
|
||||
if (UNLIKELY(p >= e)) {
|
||||
UnexpectedEofString:
|
||||
reason = "unexpected eof in string";
|
||||
goto StringFailureWithReason;
|
||||
}
|
||||
c = *p++ & 255;
|
||||
if (c == '"') {
|
||||
luaL_pushresult(&b);
|
||||
return (struct DecodeJson){1, p};
|
||||
} else if (c == '\\') {
|
||||
goto HandleEscape;
|
||||
} else if (UNLIKELY(c <= 0x1F)) {
|
||||
reason = "non-del c0 in string";
|
||||
goto StringFailureWithReason;
|
||||
} else {
|
||||
luaL_addchar(&b, c);
|
||||
}
|
||||
continue;
|
||||
HandleEscape:
|
||||
if (UNLIKELY(p >= e)) {
|
||||
goto UnexpectedEofString;
|
||||
}
|
||||
switch ((c = *p++ & 255)) {
|
||||
default:
|
||||
AddByte:
|
||||
case '"':
|
||||
case '/':
|
||||
case '\\':
|
||||
luaL_addchar(&b, c);
|
||||
break;
|
||||
case '\\':
|
||||
if (p < e) {
|
||||
switch ((c = *p++ & 255)) {
|
||||
default:
|
||||
goto InvalidEscapeCharacter;
|
||||
case '"':
|
||||
case '/':
|
||||
case '\\':
|
||||
goto AddByte;
|
||||
case 'b':
|
||||
c = '\b';
|
||||
goto AddByte;
|
||||
case 'f':
|
||||
c = '\f';
|
||||
goto AddByte;
|
||||
case 'n':
|
||||
c = '\n';
|
||||
goto AddByte;
|
||||
case 'r':
|
||||
c = '\r';
|
||||
goto AddByte;
|
||||
case 't':
|
||||
c = '\t';
|
||||
goto AddByte;
|
||||
case 'x':
|
||||
if (p + 2 <= e && //
|
||||
(A = kHexToInt[p[0] & 255]) != -1 && // HEX
|
||||
(B = kHexToInt[p[1] & 255]) != -1) { //
|
||||
c = A << 4 | B;
|
||||
p += 2;
|
||||
goto AddByte;
|
||||
} else {
|
||||
goto InvalidEscapeCharacter;
|
||||
}
|
||||
case 'u':
|
||||
if (p + 4 <= e && //
|
||||
(A = kHexToInt[p[0] & 255]) != -1 && //
|
||||
(B = kHexToInt[p[1] & 255]) != -1 && // UCS-2
|
||||
(C = kHexToInt[p[2] & 255]) != -1 && //
|
||||
(D = kHexToInt[p[3] & 255]) != -1) { //
|
||||
c = A << 12 | B << 8 | C << 4 | D;
|
||||
if (!IsSurrogate(c)) {
|
||||
p += 4;
|
||||
} else if (IsHighSurrogate(c)) {
|
||||
if (p + 4 + 6 <= e && //
|
||||
p[4] == '\\' && //
|
||||
p[5] == 'u' && //
|
||||
(A = kHexToInt[p[6] & 255]) != -1 && // UTF-16
|
||||
(B = kHexToInt[p[7] & 255]) != -1 && //
|
||||
(C = kHexToInt[p[8] & 255]) != -1 && //
|
||||
(D = kHexToInt[p[9] & 255]) != -1) { //
|
||||
u = A << 12 | B << 8 | C << 4 | D;
|
||||
if (IsLowSurrogate(u)) {
|
||||
p += 4 + 6;
|
||||
c = MergeUtf16(c, u);
|
||||
} else {
|
||||
goto BadUnicode;
|
||||
}
|
||||
} else {
|
||||
goto BadUnicode;
|
||||
}
|
||||
} else {
|
||||
goto BadUnicode;
|
||||
}
|
||||
// UTF-8
|
||||
if (c < 0x7f) {
|
||||
w[0] = c;
|
||||
i = 1;
|
||||
} else if (c <= 0x7ff) {
|
||||
w[0] = 0300 | (c >> 6);
|
||||
w[1] = 0200 | (c & 077);
|
||||
i = 2;
|
||||
} else if (c <= 0xffff) {
|
||||
if (UNLIKELY(IsSurrogate(c))) {
|
||||
ReplacementCharacter:
|
||||
c = 0xfffd;
|
||||
}
|
||||
w[0] = 0340 | (c >> 12);
|
||||
w[1] = 0200 | ((c >> 6) & 077);
|
||||
w[2] = 0200 | (c & 077);
|
||||
i = 3;
|
||||
} else if (~(c >> 18) & 007) {
|
||||
w[0] = 0360 | (c >> 18);
|
||||
w[1] = 0200 | ((c >> 12) & 077);
|
||||
w[2] = 0200 | ((c >> 6) & 077);
|
||||
w[3] = 0200 | (c & 077);
|
||||
i = 4;
|
||||
} else {
|
||||
goto ReplacementCharacter;
|
||||
}
|
||||
luaL_addlstring(&b, w, i);
|
||||
} else {
|
||||
goto InvalidEscapeCharacter;
|
||||
BadUnicode:
|
||||
// Echo invalid \uXXXX sequences
|
||||
// Rather than corrupting UTF-8!
|
||||
luaL_addstring(&b, "\\u");
|
||||
}
|
||||
break;
|
||||
case 'b':
|
||||
luaL_addchar(&b, '\b');
|
||||
break;
|
||||
case 'f':
|
||||
luaL_addchar(&b, '\f');
|
||||
break;
|
||||
case 'n':
|
||||
luaL_addchar(&b, '\n');
|
||||
break;
|
||||
case 'r':
|
||||
luaL_addchar(&b, '\r');
|
||||
break;
|
||||
case 't':
|
||||
luaL_addchar(&b, '\t');
|
||||
break;
|
||||
case 'x':
|
||||
if (p + 2 <= e && //
|
||||
(A = kHexToInt[p[0] & 255]) != -1 && // HEX
|
||||
(B = kHexToInt[p[1] & 255]) != -1) { //
|
||||
c = A << 4 | B;
|
||||
if (!(0x20 <= c && c <= 0x7E)) {
|
||||
reason = "hex escape not printable";
|
||||
goto StringFailureWithReason;
|
||||
}
|
||||
p += 2;
|
||||
luaL_addchar(&b, c);
|
||||
break;
|
||||
} else {
|
||||
goto InvalidEscapeCharacter;
|
||||
reason = "invalid hex escape";
|
||||
goto StringFailureWithReason;
|
||||
}
|
||||
case 'u':
|
||||
if (p + 4 <= e && //
|
||||
(A = kHexToInt[p[0] & 255]) != -1 && //
|
||||
(B = kHexToInt[p[1] & 255]) != -1 && // UCS-2
|
||||
(C = kHexToInt[p[2] & 255]) != -1 && //
|
||||
(D = kHexToInt[p[3] & 255]) != -1) { //
|
||||
c = A << 12 | B << 8 | C << 4 | D;
|
||||
if (!IsSurrogate(c)) {
|
||||
p += 4;
|
||||
} else if (IsHighSurrogate(c)) {
|
||||
if (p + 4 + 6 <= e && //
|
||||
p[4] == '\\' && //
|
||||
p[5] == 'u' && //
|
||||
(A = kHexToInt[p[6] & 255]) != -1 && // UTF-16
|
||||
(B = kHexToInt[p[7] & 255]) != -1 && //
|
||||
(C = kHexToInt[p[8] & 255]) != -1 && //
|
||||
(D = kHexToInt[p[9] & 255]) != -1) { //
|
||||
u = A << 12 | B << 8 | C << 4 | D;
|
||||
if (IsLowSurrogate(u)) {
|
||||
p += 4 + 6;
|
||||
c = MergeUtf16(c, u);
|
||||
} else {
|
||||
goto BadUnicode;
|
||||
}
|
||||
} else {
|
||||
goto BadUnicode;
|
||||
}
|
||||
} else {
|
||||
goto BadUnicode;
|
||||
}
|
||||
// UTF-8
|
||||
if (c < 0x7f) {
|
||||
w[0] = c;
|
||||
i = 1;
|
||||
} else if (c <= 0x7ff) {
|
||||
w[0] = 0300 | (c >> 6);
|
||||
w[1] = 0200 | (c & 077);
|
||||
i = 2;
|
||||
} else if (c <= 0xffff) {
|
||||
if (UNLIKELY(IsSurrogate(c))) {
|
||||
ReplacementCharacter:
|
||||
c = 0xfffd;
|
||||
}
|
||||
w[0] = 0340 | (c >> 12);
|
||||
w[1] = 0200 | ((c >> 6) & 077);
|
||||
w[2] = 0200 | (c & 077);
|
||||
i = 3;
|
||||
} else if (~(c >> 18) & 007) {
|
||||
w[0] = 0360 | (c >> 18);
|
||||
w[1] = 0200 | ((c >> 12) & 077);
|
||||
w[2] = 0200 | ((c >> 6) & 077);
|
||||
w[3] = 0200 | (c & 077);
|
||||
i = 4;
|
||||
} else {
|
||||
goto ReplacementCharacter;
|
||||
}
|
||||
luaL_addlstring(&b, w, i);
|
||||
} else {
|
||||
reason = "invalid unicode escape";
|
||||
goto StringFailureWithReason;
|
||||
BadUnicode:
|
||||
// Echo invalid \uXXXX sequences
|
||||
// Rather than corrupting UTF-8!
|
||||
luaL_addstring(&b, "\\u");
|
||||
}
|
||||
break;
|
||||
case '"':
|
||||
luaL_pushresult(&b);
|
||||
return (struct DecodeJson){1, p};
|
||||
default:
|
||||
reason = "invalid escape character";
|
||||
goto StringFailureWithReason;
|
||||
}
|
||||
}
|
||||
break;
|
||||
StringFailureWithReason:
|
||||
luaL_pushresultsize(&b, 0);
|
||||
lua_pop(L, 1);
|
||||
return (struct DecodeJson){-1, "unexpected eof in string"};
|
||||
return (struct DecodeJson){-1, reason};
|
||||
|
||||
default:
|
||||
IllegalCharacter:
|
||||
return (struct DecodeJson){-1, "illegal character"};
|
||||
BadObjectKey:
|
||||
return (struct DecodeJson){-1, "object key must be string"};
|
||||
InvalidEscapeCharacter:
|
||||
luaL_pushresultsize(&b, 0);
|
||||
lua_pop(L, 1);
|
||||
return (struct DecodeJson){-1, "invalid escape character"};
|
||||
}
|
||||
}
|
||||
if (UNLIKELY(context == TOP_LEVEL)) {
|
||||
|
@ -357,16 +378,14 @@ static struct DecodeJson Parse(struct lua_State *L, const char *p,
|
|||
* converted to a floating-point number instead. Invalid surrogate
|
||||
* escape sequences in strings won't be decoded.
|
||||
*
|
||||
* A weird case exists when parsing empty objects. In order to let Lua
|
||||
* tell them apart from empty arrays, we insert a special key that's
|
||||
* ignored by our JSON serializer, called `[__json_object__]=true`.
|
||||
*
|
||||
* @param L is Lua interpreter state
|
||||
* @param p is input string
|
||||
* @param n is byte length of `p` or -1 for automatic strlen()
|
||||
* @return res.rc is 1 if value pushed, 0 on eof, otherwise -1
|
||||
* @return res.p is is advanced `p` pointer if `rc` isn't -1
|
||||
* @return res.p is string describing error if `rc` is -1
|
||||
* @return r.rc is 1 if value is pushed on lua stack
|
||||
* @return r.rc is 0 on eof
|
||||
* @return r.rc is -1 on error
|
||||
* @return r.p is advanced `p` pointer if `rc ≥ 0`
|
||||
* @return r.p is string describing error if `rc < 0`
|
||||
*/
|
||||
struct DecodeJson DecodeJson(struct lua_State *L, const char *p, size_t n) {
|
||||
if (n == -1) n = p ? strlen(p) : 0;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue