Audit every single JSON test

2025-09-10 10:43:48 +00:00 · 2022-07-12 12:30:42 -07:00 · 2022-07-12 12:30:42 -07:00 · 3f3e7e92d7
commit 3f3e7e92d7
parent 7965ed0232
17 changed files with 473 additions and 285 deletions
--- a/tool/net/help.txt
+++ b/tool/net/help.txt
@ -679,21 +679,61 @@ FUNCTIONS
      ├─→ double
      ├─→ array
      ├─→ object
+      ├─→ false
      ├─→ true
+      ├─→ nil
      └─→ nil, error:str

          Turns JSON string into a Lua data structure.

-          This is a very permissive parser. That means it should always
-          parse correctly formatted JSON correctly. However it will not
-          complain if the `input` string is weirdly formatted. There is
-          currently no validation performed, other than what we need to
-          ensure security. For example `{3=4}` will decode as `{[3]=4}`
-          even though that structure won't round-trip with `EncodeJson`
-          since redbean won't generate invalid JSON (see Postel's Law).
+          This is a generally permissive parser, in the sense that like
+          v8, it permits scalars as top-level values. Therefore we must
+          note that this API can be thought of as special, in the sense

-          This parser permits top-level values regardless of type, with
-          the exception of `false`, `null`, and absent.
+              val = assert(DecodeJson(str))
+
+          will usually do the right thing, except in cases where false
+          or null are the top-level value. In those cases, you must
+          check the second value too, to tell it apart from an error
+
+              val, err = DecodeJson(str)
+              if not val then
+                 if err then
+                    print('bad json', err)
+                 elseif val == nil then
+                    print('val is null')
+                 elseif val == false then
+                    print('val is false')
+                 end
+              end
+
+          This parser supports 64-bit signed integers. If an overflow
+          happens, then the integer is silently coerced to double, as
+          consistent with v8. If a double overflows into Infinity, we
+          coerce it to `null` since that's what v8 does, and the same
+          goes for underflows which, like v8, are coerced to 0.0.
+
+          This parser does not validate UTF-8 which is copied how the
+          JSON specifies. It may therefore contain underlong overlong
+          characters, trojan source and even numbers banned by the IETF.
+          You can use VisualizeControlCodes() and Underlong(), to see
+          if a string round-trips, to detect these weirdo codepoints.
+
+          This parser does some validation of UTF-16. Consistent with
+          v8, bad surrogate characters will be silently preserved, as
+          their original escape sequence text, thereby ensuring utf-8
+          output is valid. Please note that invalid utf-8 could still
+          happen if it's encoded as utf-8.
+
+          This parser is lenient about commas and colons. For example
+          it's permissible to say `DecodeJson('[1 2 3 4]')`. Trailing
+          commas are allowed. Even prefix commas are allowed. However
+          it's not recommended that you rely on this behavior, and it
+          won't round-trip with EncodeJson() currently.
+
+          When objects are parsed, your Lua object can't preserve the
+          original ordering of fields. As such, they'll be sorted by
+          EncodeJson() and may not round-trip with original intent.

  EncodeJson(value[,options:table])
      ├─→ json:str
@ -726,6 +766,8 @@ FUNCTIONS
          When arrays and objects are serialized, entries will be sorted
          in a deterministic order.

+          This parser does not support UTF-8
+
  EncodeLua(value[,options:table])
      ├─→ luacode:str
      ├─→ true [if useoutput]
@ -1385,10 +1427,10 @@ FUNCTIONS
          access log and message logging.

  VisualizeControlCodes(str) → str
-          Replaces C0 control codes with their UNICODE pictures
-          representation. This function also canonicalizes overlong
-          encodings. C1 control codes are replaced with a JavaScript-like
-          escape sequence.
+          Replaces C0 control codes and trojan source characters with
+          descriptive UNICODE pictorial representation. This function
+          also canonicalizes overlong encodings. C1 control codes are
+          replaced with a JavaScript-like escape sequence.

  Underlong(str) → str
          Canonicalizes overlong encodings.
--- a/tool/net/ljson.c
+++ b/tool/net/ljson.c
@ -19,7 +19,9 @@
 #include "libc/bits/bits.h"
 #include "libc/bits/likely.h"
 #include "libc/intrin/kprintf.h"
+#include "libc/log/check.h"
 #include "libc/log/log.h"
+#include "libc/str/str.h"
 #include "libc/str/tpenc.h"
 #include "libc/str/utf16.h"
 #include "third_party/double-conversion/wrapper.h"
@ -42,6 +44,7 @@ static struct DecodeJson Parse(struct lua_State *L, const char *p,
  char w[4];
  const char *a;
  luaL_Buffer b;
+  const char *reason;
  struct DecodeJson r;
  int A, B, C, D, c, d, i, u;
  if (UNLIKELY(!--depth)) {
@ -74,9 +77,6 @@ static struct DecodeJson Parse(struct lua_State *L, const char *p,

      case 'n':  // null
        if (UNLIKELY(context == OBJECT_KEY)) goto BadObjectKey;
-        if (UNLIKELY(context == TOP_LEVEL)) {
-          return (struct DecodeJson){-1, "toplevel json can't be null"};
-        }
        if (p + 3 <= e && READ32LE(p - 1) == READ32LE("null")) {
          lua_pushnil(L);
          return (struct DecodeJson){1, p + 3};
@ -86,9 +86,6 @@ static struct DecodeJson Parse(struct lua_State *L, const char *p,

      case 'f':  // false
        if (UNLIKELY(context == OBJECT_KEY)) goto BadObjectKey;
-        if (UNLIKELY(context == TOP_LEVEL)) {
-          return (struct DecodeJson){-1, "toplevel json can't be false"};
-        }
        if (p + 4 <= e && READ32LE(p) == READ32LE("alse")) {
          lua_pushboolean(L, false);
          return (struct DecodeJson){1, p + 4};
@ -105,15 +102,26 @@ static struct DecodeJson Parse(struct lua_State *L, const char *p,
          goto IllegalCharacter;
        }

+      BadObjectKey:
+        return (struct DecodeJson){-1, "object key must be string"};
+
      case '-':  // negative
        if (UNLIKELY(context == OBJECT_KEY)) goto BadObjectKey;
-        d = -1;
-        break;
+        if (p < e && isdigit(*p)) {
+          d = -1;
+          break;
+        } else {
+          return (struct DecodeJson){-1, "bad negative"};
+        }

      case '0':  // zero or number
        if (UNLIKELY(context == OBJECT_KEY)) goto BadObjectKey;
-        if (p < e && (*p == '.' || *p == 'e' || *p == 'E')) {
-          goto UseDubble;
+        if (p < e) {
+          if ((*p == '.' || *p == 'e' || *p == 'E')) {
+            goto UseDubble;
+          } else if (isdigit(*p)) {
+            return (struct DecodeJson){-1, "unexpected octal"};
+          }
        }
        lua_pushinteger(L, 0);
        return (struct DecodeJson){1, p};
@ -138,6 +146,7 @@ static struct DecodeJson Parse(struct lua_State *L, const char *p,

      UseDubble:  // number
        lua_pushnumber(L, StringToDouble(a, e - a, &c));
+        DCHECK(c > 0, "paranoid avoiding infinite loop");
        return (struct DecodeJson){1, a + c};

      case '[':  // Array
@ -206,134 +215,146 @@ static struct DecodeJson Parse(struct lua_State *L, const char *p,

      case '"':  // string
        luaL_buffinit(L, &b);
-        while (p < e) {
+        for (;;) {
+          if (UNLIKELY(p >= e)) {
+          UnexpectedEofString:
+            reason = "unexpected eof in string";
+            goto StringFailureWithReason;
+          }
+          c = *p++ & 255;
+          if (c == '"') {
+            luaL_pushresult(&b);
+            return (struct DecodeJson){1, p};
+          } else if (c == '\\') {
+            goto HandleEscape;
+          } else if (UNLIKELY(c <= 0x1F)) {
+            reason = "non-del c0 in string";
+            goto StringFailureWithReason;
+          } else {
+            luaL_addchar(&b, c);
+          }
+          continue;
+        HandleEscape:
+          if (UNLIKELY(p >= e)) {
+            goto UnexpectedEofString;
+          }
          switch ((c = *p++ & 255)) {
-            default:
-            AddByte:
+            case '"':
+            case '/':
+            case '\\':
              luaL_addchar(&b, c);
              break;
-            case '\\':
-              if (p < e) {
-                switch ((c = *p++ & 255)) {
-                  default:
-                    goto InvalidEscapeCharacter;
-                  case '"':
-                  case '/':
-                  case '\\':
-                    goto AddByte;
-                  case 'b':
-                    c = '\b';
-                    goto AddByte;
-                  case 'f':
-                    c = '\f';
-                    goto AddByte;
-                  case 'n':
-                    c = '\n';
-                    goto AddByte;
-                  case 'r':
-                    c = '\r';
-                    goto AddByte;
-                  case 't':
-                    c = '\t';
-                    goto AddByte;
-                  case 'x':
-                    if (p + 2 <= e &&                         //
-                        (A = kHexToInt[p[0] & 255]) != -1 &&  // HEX
-                        (B = kHexToInt[p[1] & 255]) != -1) {  //
-                      c = A << 4 | B;
-                      p += 2;
-                      goto AddByte;
-                    } else {
-                      goto InvalidEscapeCharacter;
-                    }
-                  case 'u':
-                    if (p + 4 <= e &&                         //
-                        (A = kHexToInt[p[0] & 255]) != -1 &&  //
-                        (B = kHexToInt[p[1] & 255]) != -1 &&  // UCS-2
-                        (C = kHexToInt[p[2] & 255]) != -1 &&  //
-                        (D = kHexToInt[p[3] & 255]) != -1) {  //
-                      c = A << 12 | B << 8 | C << 4 | D;
-                      if (!IsSurrogate(c)) {
-                        p += 4;
-                      } else if (IsHighSurrogate(c)) {
-                        if (p + 4 + 6 <= e &&                     //
-                            p[4] == '\\' &&                       //
-                            p[5] == 'u' &&                        //
-                            (A = kHexToInt[p[6] & 255]) != -1 &&  // UTF-16
-                            (B = kHexToInt[p[7] & 255]) != -1 &&  //
-                            (C = kHexToInt[p[8] & 255]) != -1 &&  //
-                            (D = kHexToInt[p[9] & 255]) != -1) {  //
-                          u = A << 12 | B << 8 | C << 4 | D;
-                          if (IsLowSurrogate(u)) {
-                            p += 4 + 6;
-                            c = MergeUtf16(c, u);
-                          } else {
-                            goto BadUnicode;
-                          }
-                        } else {
-                          goto BadUnicode;
-                        }
-                      } else {
-                        goto BadUnicode;
-                      }
-                      // UTF-8
-                      if (c < 0x7f) {
-                        w[0] = c;
-                        i = 1;
-                      } else if (c <= 0x7ff) {
-                        w[0] = 0300 | (c >> 6);
-                        w[1] = 0200 | (c & 077);
-                        i = 2;
-                      } else if (c <= 0xffff) {
-                        if (UNLIKELY(IsSurrogate(c))) {
-                        ReplacementCharacter:
-                          c = 0xfffd;
-                        }
-                        w[0] = 0340 | (c >> 12);
-                        w[1] = 0200 | ((c >> 6) & 077);
-                        w[2] = 0200 | (c & 077);
-                        i = 3;
-                      } else if (~(c >> 18) & 007) {
-                        w[0] = 0360 | (c >> 18);
-                        w[1] = 0200 | ((c >> 12) & 077);
-                        w[2] = 0200 | ((c >> 6) & 077);
-                        w[3] = 0200 | (c & 077);
-                        i = 4;
-                      } else {
-                        goto ReplacementCharacter;
-                      }
-                      luaL_addlstring(&b, w, i);
-                    } else {
-                      goto InvalidEscapeCharacter;
-                    BadUnicode:
-                      // Echo invalid \uXXXX sequences
-                      // Rather than corrupting UTF-8!
-                      luaL_addstring(&b, "\\u");
-                    }
-                    break;
+            case 'b':
+              luaL_addchar(&b, '\b');
+              break;
+            case 'f':
+              luaL_addchar(&b, '\f');
+              break;
+            case 'n':
+              luaL_addchar(&b, '\n');
+              break;
+            case 'r':
+              luaL_addchar(&b, '\r');
+              break;
+            case 't':
+              luaL_addchar(&b, '\t');
+              break;
+            case 'x':
+              if (p + 2 <= e &&                         //
+                  (A = kHexToInt[p[0] & 255]) != -1 &&  // HEX
+                  (B = kHexToInt[p[1] & 255]) != -1) {  //
+                c = A << 4 | B;
+                if (!(0x20 <= c && c <= 0x7E)) {
+                  reason = "hex escape not printable";
+                  goto StringFailureWithReason;
                }
+                p += 2;
+                luaL_addchar(&b, c);
+                break;
              } else {
-                goto InvalidEscapeCharacter;
+                reason = "invalid hex escape";
+                goto StringFailureWithReason;
+              }
+            case 'u':
+              if (p + 4 <= e &&                         //
+                  (A = kHexToInt[p[0] & 255]) != -1 &&  //
+                  (B = kHexToInt[p[1] & 255]) != -1 &&  // UCS-2
+                  (C = kHexToInt[p[2] & 255]) != -1 &&  //
+                  (D = kHexToInt[p[3] & 255]) != -1) {  //
+                c = A << 12 | B << 8 | C << 4 | D;
+                if (!IsSurrogate(c)) {
+                  p += 4;
+                } else if (IsHighSurrogate(c)) {
+                  if (p + 4 + 6 <= e &&                     //
+                      p[4] == '\\' &&                       //
+                      p[5] == 'u' &&                        //
+                      (A = kHexToInt[p[6] & 255]) != -1 &&  // UTF-16
+                      (B = kHexToInt[p[7] & 255]) != -1 &&  //
+                      (C = kHexToInt[p[8] & 255]) != -1 &&  //
+                      (D = kHexToInt[p[9] & 255]) != -1) {  //
+                    u = A << 12 | B << 8 | C << 4 | D;
+                    if (IsLowSurrogate(u)) {
+                      p += 4 + 6;
+                      c = MergeUtf16(c, u);
+                    } else {
+                      goto BadUnicode;
+                    }
+                  } else {
+                    goto BadUnicode;
+                  }
+                } else {
+                  goto BadUnicode;
+                }
+                // UTF-8
+                if (c < 0x7f) {
+                  w[0] = c;
+                  i = 1;
+                } else if (c <= 0x7ff) {
+                  w[0] = 0300 | (c >> 6);
+                  w[1] = 0200 | (c & 077);
+                  i = 2;
+                } else if (c <= 0xffff) {
+                  if (UNLIKELY(IsSurrogate(c))) {
+                  ReplacementCharacter:
+                    c = 0xfffd;
+                  }
+                  w[0] = 0340 | (c >> 12);
+                  w[1] = 0200 | ((c >> 6) & 077);
+                  w[2] = 0200 | (c & 077);
+                  i = 3;
+                } else if (~(c >> 18) & 007) {
+                  w[0] = 0360 | (c >> 18);
+                  w[1] = 0200 | ((c >> 12) & 077);
+                  w[2] = 0200 | ((c >> 6) & 077);
+                  w[3] = 0200 | (c & 077);
+                  i = 4;
+                } else {
+                  goto ReplacementCharacter;
+                }
+                luaL_addlstring(&b, w, i);
+              } else {
+                reason = "invalid unicode escape";
+                goto StringFailureWithReason;
+              BadUnicode:
+                // Echo invalid \uXXXX sequences
+                // Rather than corrupting UTF-8!
+                luaL_addstring(&b, "\\u");
              }
              break;
-            case '"':
-              luaL_pushresult(&b);
-              return (struct DecodeJson){1, p};
+            default:
+              reason = "invalid escape character";
+              goto StringFailureWithReason;
          }
        }
+        break;
+      StringFailureWithReason:
        luaL_pushresultsize(&b, 0);
        lua_pop(L, 1);
-        return (struct DecodeJson){-1, "unexpected eof in string"};
+        return (struct DecodeJson){-1, reason};

      default:
      IllegalCharacter:
        return (struct DecodeJson){-1, "illegal character"};
-      BadObjectKey:
-        return (struct DecodeJson){-1, "object key must be string"};
-      InvalidEscapeCharacter:
-        luaL_pushresultsize(&b, 0);
-        lua_pop(L, 1);
-        return (struct DecodeJson){-1, "invalid escape character"};
    }
  }
  if (UNLIKELY(context == TOP_LEVEL)) {
@ -357,16 +378,14 @@ static struct DecodeJson Parse(struct lua_State *L, const char *p,
 * converted to a floating-point number instead. Invalid surrogate
 * escape sequences in strings won't be decoded.
 *
- * A weird case exists when parsing empty objects. In order to let Lua
- * tell them apart from empty arrays, we insert a special key that's
- * ignored by our JSON serializer, called `[__json_object__]=true`.
- *
 * @param L is Lua interpreter state
 * @param p is input string
 * @param n is byte length of `p` or -1 for automatic strlen()
- * @return res.rc is 1 if value pushed, 0 on eof, otherwise -1
- * @return res.p is is advanced `p` pointer if `rc` isn't -1
- * @return res.p is string describing error if `rc` is -1
+ * @return r.rc is 1 if value is pushed on lua stack
+ * @return r.rc is 0 on eof
+ * @return r.rc is -1 on error
+ * @return r.p is advanced `p` pointer if `rc ≥ 0`
+ * @return r.p is string describing error if `rc < 0`
 */
 struct DecodeJson DecodeJson(struct lua_State *L, const char *p, size_t n) {
  if (n == -1) n = p ? strlen(p) : 0;