cosmopolitan/tool/net/ljson.c
2024-08-04 12:52:25 -07:00

610 lines
22 KiB
C

/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi │
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2022 Justine Alexandra Roberts Tunney │
│ │
│ Permission to use, copy, modify, and/or distribute this software for │
│ any purpose with or without fee is hereby granted, provided that the │
│ above copyright notice and this permission notice appear in all copies. │
│ │
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
│ PERFORMANCE OF THIS SOFTWARE. │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "tool/net/ljson.h"
#include "libc/assert.h"
#include "libc/ctype.h"
#include "libc/intrin/likely.h"
#include "libc/log/check.h"
#include "libc/log/log.h"
#include "libc/runtime/runtime.h"
#include "libc/runtime/stack.h"
#include "libc/serialize.h"
#include "libc/stdckdint.h"
#include "libc/str/str.h"
#include "libc/str/tab.h"
#include "libc/str/utf16.h"
#include "libc/sysv/consts/auxv.h"
#include "libc/thread/thread.h"
#include "third_party/double-conversion/wrapper.h"
#include "third_party/lua/cosmo.h"
#include "third_party/lua/lauxlib.h"
#include "third_party/lua/ltests.h"
#include "third_party/lua/lua.h"
// Bits of the `context` mask that Parse() passes to its recursive calls.
#define KEY 1      // next value is an object key and must be a string
#define COMMA 2    // a ',' must be consumed before the next value
#define COLON 4    // a ':' must be consumed before the next value
#define ARRAY 8    // a ']' is accepted and ends the enclosing array
#define OBJECT 16  // a '}' is accepted and ends the enclosing object
// Nesting budget DecodeJson() hands to Parse(); DecodeJson() also uses
// it to size its lua_checkstack() reservation.
#define DEPTH 64
// Byte classes stored in kJsonStr[]; Parse() switches on them while it
// scans the inside of a JSON string.
#define ASCII 0       // byte is copied to the output unchanged
#define C0 1          // rejected: "non-del c0 control code in string"
#define DQUOTE 2      // closing quote, ends the string
#define BACKSLASH 3   // starts an escape sequence
#define UTF8_2 4      // lead byte followed by one continuation byte
#define UTF8_3 5      // lead byte followed by two continuation bytes
#define UTF8_4 6      // lead byte followed by three continuation bytes
#define C1 7          // rejected: "c1 control code in string"
#define UTF8_3_E0 8   // lead byte 0340, checked for overlong encodings
#define UTF8_3_ED 9   // lead byte 0355, checked for encoded surrogates
#define UTF8_4_F0 10  // lead byte 0360, checked for overlong encodings
#define BADUTF8 11    // rejected: "illegal utf-8 character"
#define EVILUTF8 12   // lead bytes 0300/0301, reported as "overlong ascii"
// Classifies every possible byte found inside a JSON string literal.
// Values are the string byte classes #define'd above this table:
//   0 ASCII      1 C0          2 DQUOTE      3 BACKSLASH
//   4 UTF8_2     5 UTF8_3      6 UTF8_4      7 C1
//   8 UTF8_3_E0  9 UTF8_3_ED  10 UTF8_4_F0  11 BADUTF8  12 EVILUTF8
// Each row below covers sixteen consecutive byte values.
static const char kJsonStr[256] = {
    // 0x00-0x1f: control codes
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x00
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x10
    // 0x20-0x7f: plain bytes, except '"' (0x22) and '\\' (0x5c);
    // note that DEL (0x7f) is classified as a plain byte
    0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x20
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x30
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x40
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0,  // 0x50
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x60
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x70
    // 0x80-0x9f: class 7
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,  // 0x80
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,  // 0x90
    // 0xa0-0xbf: class 11, not valid as the first byte of a character
    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  // 0xa0
    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  // 0xb0
    // 0xc0-0xdf: two-byte lead bytes; 0xc0 and 0xc1 get class 12
    12, 12, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,  // 0xc0
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,    // 0xd0
    // 0xe0-0xef: three-byte lead bytes; 0xe0 and 0xed are special-cased
    8, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 9, 5, 5,  // 0xe0
    // 0xf0-0xf4: four-byte lead bytes (0xf0 special-cased); rest class 11
    10, 6, 6, 6, 6, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  // 0xf0
};
/**
 * Parses a single JSON value out of `[p,e)` and pushes it on the Lua stack.
 *
 * The function calls itself for array elements and object members. The
 * `context` bit mask tells each call which punctuation has to be
 * consumed before the value (`COMMA`, `COLON`), whether the value has
 * to be a string (`KEY`), and which closing bracket is allowed to end
 * the enclosing container (`ARRAY`, `OBJECT`).
 *
 * @param L is Lua interpreter state that receives the value
 * @param p is start of unparsed input
 * @param e is end of input (one past the last byte)
 * @param context is bitset of KEY, COMMA, COLON, ARRAY, OBJECT
 * @param depth is remaining nesting budget; zero is reported as an error
 * @param bsp is stack address limit; an error is returned once the
 *     stack pointer is below it
 * @return `{1, p}` if a value was pushed, with `p` advanced past it
 * @return `{0, p}` if the closing `]` or `}` allowed by `context` was
 *     consumed instead of a value (nothing is pushed)
 * @return `{0, 0}` if input ran out while `depth == DEPTH`
 * @return `{-1, reason}` on error; anything this call pushed is popped
 */
static struct DecodeJson Parse(struct lua_State *L, const char *p,
                               const char *e, int context, int depth,
                               uintptr_t bsp) {
  long x;
  char w[4];
  luaL_Buffer b;
  struct DecodeJson r;
  const char *a, *reason;
  int A, B, C, D, c, d, i, u;
  if (UNLIKELY(!depth))
    return (struct DecodeJson){-1, "maximum depth exceeded"};
  if (UNLIKELY(GetStackPointer() < bsp))
    return (struct DecodeJson){-1, "out of stack"};
  // `a` marks where the current token starts (numbers are re-read from
  // there by StringToDouble) and `d` is the sign of an integer literal.
  for (a = p, d = +1; p < e;) {
    switch ((c = *p++ & 255)) {
      case ' ':  // spaces
      case '\n':
      case '\r':
      case '\t':
        a = p;
        break;
      case ',':  // present in list and object
        if (context & COMMA) {
          // Keep KEY so a member that follows a comma still has to be
          // a string key. ARRAY and OBJECT are dropped, which is what
          // rejects a closing bracket right after the comma.
          context &= KEY;
          a = p;
          break;
        } else {
          return (struct DecodeJson){-1, "unexpected ','"};
        }
      case ':':  // present only in object after key
        if (context & COLON) {
          context = 0;
          a = p;
          break;
        } else {
          return (struct DecodeJson){-1, "unexpected ':'"};
        }
      case 'n':  // null
        if (context & (KEY | COLON | COMMA))
          goto OnColonCommaKey;
        if (p + 3 <= e && READ32LE(p - 1) == READ32LE("null")) {
          lua_pushnil(L);
          return (struct DecodeJson){1, p + 3};
        } else {
          goto IllegalCharacter;
        }
      case 'f':  // false
        if (context & (KEY | COLON | COMMA))
          goto OnColonCommaKey;
        if (p + 4 <= e && READ32LE(p) == READ32LE("alse")) {
          lua_pushboolean(L, false);
          return (struct DecodeJson){1, p + 4};
        } else {
          goto IllegalCharacter;
        }
      case 't':  // true
        if (context & (KEY | COLON | COMMA))
          goto OnColonCommaKey;
        if (p + 3 <= e && READ32LE(p - 1) == READ32LE("true")) {
          lua_pushboolean(L, true);
          return (struct DecodeJson){1, p + 3};
        } else {
          goto IllegalCharacter;
        }
      default:
      IllegalCharacter:
        return (struct DecodeJson){-1, "illegal character"};
      // A value token showed up while punctuation or a key was still
      // required; pick the message from the pending context bits.
      OnColonCommaKey:
        if (context & KEY)
          goto BadObjectKey;
      OnColonComma:
        if (context & COLON)
          goto MissingColon;
        return (struct DecodeJson){-1, "missing ','"};
      MissingColon:
        return (struct DecodeJson){-1, "missing ':'"};
      BadObjectKey:
        return (struct DecodeJson){-1, "object key must be string"};
      case '-':  // negative
        if (context & (COLON | COMMA | KEY))
          goto OnColonCommaKey;
        if (p < e && isdigit(*p & 255)) {
          // `a` still points at the '-', so the double path sees it.
          d = -1;
          break;
        } else {
          return (struct DecodeJson){-1, "bad negative"};
        }
      case '0':  // zero or number
        if (context & (COLON | COMMA | KEY))
          goto OnColonCommaKey;
        if (p < e) {
          if (*p == '.') {
            if (p + 1 == e || !isdigit(p[1] & 255)) {
              return (struct DecodeJson){-1, "bad double"};
            }
            goto UseDubble;
          } else if (*p == 'e' || *p == 'E') {
            goto UseDubble;
          } else if (isdigit(*p & 255)) {
            return (struct DecodeJson){-1, "unexpected octal"};
          }
        }
        lua_pushinteger(L, 0);
        return (struct DecodeJson){1, p};
      case '1' ... '9':  // integer
        if (context & (COLON | COMMA | KEY))
          goto OnColonCommaKey;
        for (x = (c - '0') * d; p < e; ++p) {
          c = *p & 255;
          if (isdigit(c)) {
            // On overflow the whole token is re-read as a double.
            if (ckd_mul(&x, x, 10) || ckd_add(&x, x, (c - '0') * d)) {
              goto UseDubble;
            }
          } else if (c == '.') {
            if (p + 1 == e || !isdigit(p[1] & 255)) {
              return (struct DecodeJson){-1, "bad double"};
            }
            goto UseDubble;
          } else if (c == 'e' || c == 'E') {
            goto UseDubble;
          } else {
            break;
          }
        }
        lua_pushinteger(L, x);
        return (struct DecodeJson){1, p};
      UseDubble:  // number
        // `c` receives the number of bytes StringToDouble() consumed.
        lua_pushnumber(L, StringToDouble(a, e - a, &c));
        DCHECK(c > 0, "paranoid avoiding infinite loop");
        if (a + c < e && (a[c] == 'e' || a[c] == 'E')) {
          return (struct DecodeJson){-1, "bad exponent"};
        }
        return (struct DecodeJson){1, a + c};
      case '[':  // Array
        if (context & (COLON | COMMA | KEY))
          goto OnColonCommaKey;
        lua_newtable(L);  // +1
        for (context = ARRAY, i = 0;;) {
          r = Parse(L, p, e, context, depth - 1, bsp);  // +2
          if (UNLIKELY(r.rc == -1)) {
            lua_pop(L, 1);
            return r;
          }
          p = r.p;
          if (!r.rc) {
            break;
          }
          lua_rawseti(L, -2, i++ + 1);
          context = ARRAY | COMMA;
        }
        if (!i) {
          // we need this kludge so `[]` won't round-trip as `{}`
          lua_pushboolean(L, false);  // +2
          lua_rawseti(L, -2, 0);
        }
        return (struct DecodeJson){1, p};
      case ']':
        if (context & ARRAY) {
          return (struct DecodeJson){0, p};
        } else {
          return (struct DecodeJson){-1, "unexpected ']'"};
        }
      case '}':
        if (context & OBJECT) {
          return (struct DecodeJson){0, p};
        } else {
          return (struct DecodeJson){-1, "unexpected '}'"};
        }
      case '{':  // Object
        if (context & (COLON | COMMA | KEY))
          goto OnColonCommaKey;
        lua_newtable(L);  // +1
        context = KEY | OBJECT;
        for (;;) {
          // key (or the closing brace)
          r = Parse(L, p, e, context, depth - 1, bsp);  // +2
          if (r.rc == -1) {
            lua_pop(L, 1);
            return r;
          }
          p = r.p;
          if (!r.rc) {
            return (struct DecodeJson){1, p};
          }
          // ':' followed by the member value
          r = Parse(L, p, e, COLON, depth - 1, bsp);  // +3
          if (r.rc == -1) {
            lua_pop(L, 2);
            return r;
          }
          if (!r.rc) {
            lua_pop(L, 2);
            return (struct DecodeJson){-1, "unexpected eof in object"};
          }
          p = r.p;
          lua_settable(L, -3);
          context = KEY | COMMA | OBJECT;
        }
      case '"':  // string
        if (context & (COLON | COMMA))
          goto OnColonComma;
        luaL_buffinit(L, &b);
        for (;;) {
          if (UNLIKELY(p >= e)) {
          UnexpectedEofString:
            reason = "unexpected eof in string";
            goto StringFailureWithReason;
          }
          switch (kJsonStr[(c = *p++ & 255)]) {
            case ASCII:
              luaL_addchar(&b, c);
              break;
            case DQUOTE:
              luaL_pushresult(&b);
              return (struct DecodeJson){1, p};
            case BACKSLASH:
              if (p >= e) {
                goto UnexpectedEofString;
              }
              switch ((c = *p++ & 255)) {
                case '"':
                case '/':
                case '\\':
                  luaL_addchar(&b, c);
                  break;
                case 'b':
                  luaL_addchar(&b, '\b');
                  break;
                case 'f':
                  luaL_addchar(&b, '\f');
                  break;
                case 'n':
                  luaL_addchar(&b, '\n');
                  break;
                case 'r':
                  luaL_addchar(&b, '\r');
                  break;
                case 't':
                  luaL_addchar(&b, '\t');
                  break;
                case 'x':
                  if (p + 2 <= e &&                         //
                      (A = kHexToInt[p[0] & 255]) != -1 &&  // HEX
                      (B = kHexToInt[p[1] & 255]) != -1) {  //
                    c = A << 4 | B;
                    if (!(0x20 <= c && c <= 0x7E)) {
                      reason = "hex escape not printable";
                      goto StringFailureWithReason;
                    }
                    p += 2;
                    luaL_addchar(&b, c);
                    break;
                  } else {
                    reason = "invalid hex escape";
                    goto StringFailureWithReason;
                  }
                case 'u':
                  if (p + 4 <= e &&                         //
                      (A = kHexToInt[p[0] & 255]) != -1 &&  //
                      (B = kHexToInt[p[1] & 255]) != -1 &&  // UCS-2
                      (C = kHexToInt[p[2] & 255]) != -1 &&  //
                      (D = kHexToInt[p[3] & 255]) != -1) {  //
                    c = A << 12 | B << 8 | C << 4 | D;
                    if (!IsSurrogate(c)) {
                      p += 4;
                    } else if (IsHighSurrogate(c)) {
                      // A high surrogate needs a `\uXXXX` low surrogate
                      // right behind it; `p` still points at the first
                      // escape's hex digits, hence the offsets 4..9.
                      if (p + 4 + 6 <= e &&                     //
                          p[4] == '\\' &&                       //
                          p[5] == 'u' &&                        //
                          (A = kHexToInt[p[6] & 255]) != -1 &&  // UTF-16
                          (B = kHexToInt[p[7] & 255]) != -1 &&  //
                          (C = kHexToInt[p[8] & 255]) != -1 &&  //
                          (D = kHexToInt[p[9] & 255]) != -1) {  //
                        u = A << 12 | B << 8 | C << 4 | D;
                        if (IsLowSurrogate(u)) {
                          p += 4 + 6;
                          c = MergeUtf16(c, u);
                        } else {
                          goto BadUnicode;
                        }
                      } else {
                        goto BadUnicode;
                      }
                    } else {
                      goto BadUnicode;
                    }
                    // UTF-8
                    // (the raw UTF-8 cases below also jump here to
                    // re-encode the code point they decoded into `c`)
                  EncodeUtf8:
                    if (c <= 0x7f) {
                      w[0] = c;
                      i = 1;
                    } else if (c <= 0x7ff) {
                      w[0] = 0300 | (c >> 6);
                      w[1] = 0200 | (c & 077);
                      i = 2;
                    } else if (c <= 0xffff) {
                      if (IsSurrogate(c)) {
                      ReplacementCharacter:
                        c = 0xfffd;
                      }
                      w[0] = 0340 | (c >> 12);
                      w[1] = 0200 | ((c >> 6) & 077);
                      w[2] = 0200 | (c & 077);
                      i = 3;
                    } else if (~(c >> 18) & 007) {
                      w[0] = 0360 | (c >> 18);
                      w[1] = 0200 | ((c >> 12) & 077);
                      w[2] = 0200 | ((c >> 6) & 077);
                      w[3] = 0200 | (c & 077);
                      i = 4;
                    } else {
                      goto ReplacementCharacter;
                    }
                    luaL_addlstring(&b, w, i);
                  } else {
                    reason = "invalid unicode escape";
                    goto StringFailureWithReason;
                  BadUnicode:
                    // Echo invalid \uXXXX sequences
                    // Rather than corrupting UTF-8!
                    luaL_addstring(&b, "\\u");
                  }
                  break;
                default:
                  reason = "invalid escape character";
                  goto StringFailureWithReason;
              }
              break;
            case UTF8_2:
              if (p < e &&                    //
                  (p[0] & 0300) == 0200) {    //
                c = (c & 037) << 6 |          //
                    (p[0] & 077);             //
                p += 1;
                goto EncodeUtf8;
              } else {
                reason = "malformed utf-8";
                goto StringFailureWithReason;
              }
            case UTF8_3_E0:
              if (p + 2 <= e &&               //
                  (p[0] & 0377) < 0240 &&     //
                  (p[0] & 0300) == 0200 &&    //
                  (p[1] & 0300) == 0200) {
                reason = "overlong utf-8 0..0x7ff";
                goto StringFailureWithReason;
              }
              // fallthrough
            case UTF8_3:
            ThreeUtf8:
              if (p + 2 <= e &&               //
                  (p[0] & 0300) == 0200 &&    //
                  (p[1] & 0300) == 0200) {    //
                c = (c & 017) << 12 |         //
                    (p[0] & 077) << 6 |       //
                    (p[1] & 077);             //
                p += 2;
                goto EncodeUtf8;
              } else {
                reason = "malformed utf-8";
                goto StringFailureWithReason;
              }
            case UTF8_3_ED:
              if (p + 2 <= e &&               //
                  (p[0] & 0377) >= 0240) {    //
                if (p + 5 <= e &&                 //
                    (p[0] & 0377) >= 0256 &&      //
                    (p[1] & 0300) == 0200 &&      //
                    (p[2] & 0377) == 0355 &&      //
                    (p[3] & 0377) >= 0260 &&      //
                    (p[4] & 0300) == 0200) {      //
                  A = (0355 & 017) << 12 |        // CESU-8
                      (p[0] & 077) << 6 |         //
                      (p[1] & 077);               //
                  B = (0355 & 017) << 12 |        //
                      (p[3] & 077) << 6 |         //
                      (p[4] & 077);               //
                  c = ((A - 0xDB80) << 10) +      //
                      ((B - 0xDC00) + 0x10000);   //
                  // NOTE(review): `p` is not advanced past the five
                  // bytes decoded above, so scanning resumes at p[0];
                  // confirm whether `p += 5` was intended here.
                  goto EncodeUtf8;
                } else if ((p[0] & 0300) == 0200 &&  //
                           (p[1] & 0300) == 0200) {  //
                  reason = "utf-16 surrogate in utf-8";
                  goto StringFailureWithReason;
                } else {
                  reason = "malformed utf-8";
                  goto StringFailureWithReason;
                }
              }
              goto ThreeUtf8;
            case UTF8_4_F0:
              // p[-1] is the lead byte; the four bytes are packed into
              // one word so all continuation marks are tested at once.
              if (p + 3 <= e && (p[0] & 0377) < 0220 &&
                  (((uint32_t)(p[+2] & 0377) << 030 |
                    (uint32_t)(p[+1] & 0377) << 020 |
                    (uint32_t)(p[+0] & 0377) << 010 |
                    (uint32_t)(p[-1] & 0377) << 000) &
                   0xC0C0C000) == 0x80808000) {
                reason = "overlong utf-8 0..0xffff";
                goto StringFailureWithReason;
              }
              // fallthrough
            case UTF8_4:
              if (p + 3 <= e &&                                  //
                  ((A = ((uint32_t)(p[+2] & 0377) << 030 |       //
                         (uint32_t)(p[+1] & 0377) << 020 |       //
                         (uint32_t)(p[+0] & 0377) << 010 |       //
                         (uint32_t)(p[-1] & 0377) << 000)) &     //
                   0xC0C0C000) == 0x80808000) {                  //
                A = (A & 7) << 18 |                              //
                    (A & (077 << 010)) << (12 - 010) |           //
                    (A & (077 << 020)) >> -(6 - 020) |           //
                    (A & (077 << 030)) >> 030;                   //
                if (A <= 0x10FFFF) {
                  c = A;
                  p += 3;
                  goto EncodeUtf8;
                } else {
                  reason = "utf-8 exceeds utf-16 range";
                  goto StringFailureWithReason;
                }
              } else {
                reason = "malformed utf-8";
                goto StringFailureWithReason;
              }
            case EVILUTF8:
              if (p < e &&                    //
                  (p[0] & 0300) == 0200) {    //
                reason = "overlong ascii";
                goto StringFailureWithReason;
              }
              // fallthrough
            case BADUTF8:
              reason = "illegal utf-8 character";
              goto StringFailureWithReason;
            case C0:
              reason = "non-del c0 control code in string";
              goto StringFailureWithReason;
            case C1:
              reason = "c1 control code in string";
              goto StringFailureWithReason;
            default:
              __builtin_unreachable();
          }
        }
        __builtin_unreachable();
      StringFailureWithReason:
        // Finish the buffer as an empty string and drop it, so the Lua
        // stack is left the way it was before the string began.
        luaL_pushresultsize(&b, 0);
        lua_pop(L, 1);
        return (struct DecodeJson){-1, reason};
    }
  }
  // Input ran out: that is only acceptable for the outermost call.
  if (depth == DEPTH) {
    return (struct DecodeJson){0, 0};
  } else {
    return (struct DecodeJson){-1, "unexpected eof"};
  }
}
/**
 * Decodes one JSON value from a string into a Lua value.
 *
 * On success the decoded value is pushed onto the Lua stack and the
 * returned pointer tells the caller where parsing stopped. Input that
 * holds nothing but whitespace pushes nothing and yields `rc == 0`.
 * Malformed input yields `rc == -1` together with a static message
 * describing the problem.
 *
 * Unicode escapes in JSON strings, including UTF-16 surrogate pairs,
 * are re-encoded as UTF-8, while invalid surrogate escapes are left
 * undecoded. Integer literals that fit in a `long` become Lua integers;
 * ones that overflow are parsed as floating point instead. Arrays and
 * objects may nest at most `DEPTH` levels.
 *
 * @param L is Lua interpreter state
 * @param p is input string; may be NULL when `n` is -1 or 0
 * @param n is byte length of `p` or -1 for automatic strlen()
 * @return r.rc is 1 if value is pushed on lua stack
 * @return r.rc is 0 on eof, in which case nothing is pushed
 * @return r.rc is -1 on error
 * @return r.p is advanced `p` pointer if `rc == 1`
 * @return r.p is NULL if `rc == 0`
 * @return r.p is string describing error if `rc < 0`
 */
struct DecodeJson DecodeJson(struct lua_State *L, const char *p, size_t n) {
  uintptr_t stack_limit;
  if (n == (size_t)-1) {
    n = 0;
    if (p)
      n = strlen(p);
  }
  // keep a page of headroom above the bottom of the C stack
  stack_limit = GetStackBottom() + 4096;
  // reserve Lua stack slots up front for the deepest allowed nesting
  if (!lua_checkstack(L, DEPTH * 3 + LUA_MINSTACK))
    return (struct DecodeJson){-1, "can't set stack depth"};
  return Parse(L, p, p + n, 0, DEPTH, stack_limit);
}