From ee82cee43296d9fb9aa388dfb52532f889a469e9 Mon Sep 17 00:00:00 2001 From: Justine Tunney Date: Sat, 9 Jul 2022 16:27:26 -0700 Subject: [PATCH] Further improve json / lua serialization --- libc/stdio/strlist.c | 4 +- test/tool/net/ljson_test.c | 0 test/tool/net/ljson_test.lua | 70 ++++-- third_party/double-conversion/wrapper.cc | 11 + third_party/double-conversion/wrapper.h | 1 + third_party/lua/luaencodejsondata.c | 17 +- third_party/lua/luaencodeluadata.c | 19 +- tool/net/help.txt | 3 - tool/net/ljson.c | 275 ++++++++++++++--------- tool/net/net.mk | 3 +- 10 files changed, 248 insertions(+), 155 deletions(-) delete mode 100644 test/tool/net/ljson_test.c diff --git a/libc/stdio/strlist.c b/libc/stdio/strlist.c index 812b164d2..d3c596087 100644 --- a/libc/stdio/strlist.c +++ b/libc/stdio/strlist.c @@ -58,7 +58,9 @@ int AppendStrList(struct StrList *sl) { } void SortStrList(struct StrList *sl) { - qsort(sl->p, sl->i, sizeof(*sl->p), CompareStrings); + if (sl->i) { + qsort(sl->p, sl->i, sizeof(*sl->p), CompareStrings); + } } int JoinStrList(struct StrList *sl, char **buf, uint64_t sep) { diff --git a/test/tool/net/ljson_test.c b/test/tool/net/ljson_test.c deleted file mode 100644 index e69de29bb..000000000 diff --git a/test/tool/net/ljson_test.lua b/test/tool/net/ljson_test.lua index e7de68024..7cbf8eba0 100644 --- a/test/tool/net/ljson_test.lua +++ b/test/tool/net/ljson_test.lua @@ -20,49 +20,79 @@ assert(EncodeLua(ParseJson[[ 2.3 ]]) == '2.3') assert(EncodeLua(ParseJson[[ [1,3,2] ]]) == '{1, 3, 2}') assert(EncodeLua(ParseJson[[ {1: 2, 3: 4} ]]) == '{[1]=2, [3]=4}') assert(EncodeLua(ParseJson[[ {"foo": 2, "bar": 4} ]]) == '{bar=4, foo=2}') +assert(EncodeLua(ParseJson[[ null ]]) == 'nil') assert(EncodeLua(ParseJson[[ -123 ]]) == '-123') -assert(EncodeLua(ParseJson[[ 1.e6 ]]) == '1000000') -assert(EncodeLua(ParseJson[[ 1.e-6 ]]) == '1e-06') -assert(EncodeLua(ParseJson[[ 1e-06 ]]) == '1e-06') -assert(EncodeLua(ParseJson[[ 9.123e6 ]]) == '9123000') 
+assert(EncodeLua(ParseJson[[ 1e6 ]]) == '1000000.') +assert(EncodeLua(ParseJson[[ 1.e-6 ]]) == '0.000001') +assert(EncodeLua(ParseJson[[ 1e-06 ]]) == '0.000001') +assert(EncodeLua(ParseJson[[ 9.123e6 ]]) == '9123000.') assert(EncodeLua(ParseJson[[ [{"heh": [1,3,2]}] ]]) == '{{heh={1, 3, 2}}}') assert(EncodeLua(ParseJson[[ 3.14159 ]]) == '3.14159') assert(EncodeLua(ParseJson[[ {3=4} ]]) == '{[3]=4}') assert(EncodeLua(ParseJson[[ 1e-12 ]]) == '1e-12') -assert(EncodeJson(ParseJson[[ 1e-12 ]]) == '1e-12') ----------------------------------------------------------------------------------------------------- --- benchmarks +assert(EncodeJson(ParseJson[[ 1e-12 ]]) == '1e-12') +assert(EncodeJson(ParseJson[[ true ]]) == 'true') +assert(EncodeJson(ParseJson[[ false ]]) == 'false') +assert(EncodeJson(ParseJson[[ null ]]) == 'null') +assert(EncodeJson(ParseJson[[ [] ]]) == '[]') +assert(EncodeJson(ParseJson[[ {} ]]) == '{}') + +assert(ParseJson[["\f"]] == '\f') -- c0 +assert(ParseJson[["\t"]] == '\t') -- c0 +assert(ParseJson[["\n"]] == '\n') -- c0 +assert(ParseJson[["\r"]] == '\r') -- c0 +assert(ParseJson[["\\"]] == '\\') -- ascii +assert(ParseJson[["\""]] == '\"') -- ascii +assert(ParseJson[["\u0100"]] == 'Ā') -- latin extended-a +assert(ParseJson[["\ud800\udf30\ud800\udf30"]] == '𐌰𐌰') -- utf-16 astral planes gothic +assert(ParseJson[["\uD800"]] == '\\uD800') -- utf-16 invalid (keep utf-8 well-formed) + +assert(EncodeJson(ParseJson[[ -9223372036854775808 ]]) == '-9223372036854775808') -- minimum 64-bit integer +assert(EncodeJson(ParseJson[[ 9223372036854775807 ]]) == '9223372036854775807') -- maximum 64-bit integer +assert(EncodeJson(ParseJson[[ 9223372036854775808 ]]) == '9223372036854776000') -- switches to double due to integer overflow +assert(EncodeJson(ParseJson[[ -9223372036854775809 ]]) == '-9223372036854776000') -- switches to double due to integer underflow +assert(EncodeJson(ParseJson[[ 9223372036854775807.0 ]]) == '9223372036854776000') -- switches to double due to period mark
+assert(EncodeJson(ParseJson[[ 2.7182818284590452354 ]]) == '2.718281828459045') -- euler's number w/ 16 significant digits +assert( EncodeLua(ParseJson[[ 2.7182818284590452354 ]]) == '2.718281828459045') -- euler's number w/ 16 significant digits + +-------------------------------------------------------------------------------- +-- benchmark nanos ticks +-------------------------------------------------------------------------------- +-- JsonParseEmpty 23 72 +-- JsonParseInteger 45 142 +-- JsonParseDouble 108 335 +-- JsonParseString 106 329 +-- JsonParseArray 243 754 +-- JsonParseObject 523 1622 function JsonParseEmpty() ParseJson[[]] end -function JsonParseInt() - ParseJson[[ 314159 ]] +function JsonParseInteger() + ParseJson[[ -9223372036854775808 ]] end function JsonParseDouble() - ParseJson[[ 3.14159 ]] + ParseJson[[ 2.7182818284590452354 ]] +end + +function JsonParseString() + ParseJson[[ "\ud800\udf30 he𐌰𐌰o \ud800\udf30" ]] end function JsonParseArray() - ParseJson[[ [3,1,4,1,5,9] ]] + ParseJson[[ [123,456,789] ]] end function JsonParseObject() - ParseJson[[ {"3":"1","4":"1","5":"9"} ]] + ParseJson[[ {"3":"1", "4":"1", "5":"9"} ]] end print('JsonParseEmpty', Benchmark(JsonParseEmpty)) -print('JsonParseInt', Benchmark(JsonParseInt)) +print('JsonParseInteg', Benchmark(JsonParseInteger)) print('JsonParseDouble', Benchmark(JsonParseDouble)) +print('JsonParseString', Benchmark(JsonParseString)) print('JsonParseArray', Benchmark(JsonParseArray)) print('JsonParseObject', Benchmark(JsonParseObject)) - --- nanos ticks --- JsonParseEmpty 24 77 85 1 --- JsonParseInt 31 96 82 1 --- JsonParseDouble 64 199 82 1 --- JsonParseArray 367 1139 80 1 --- JsonParseObject 425 1317 79 1 diff --git a/third_party/double-conversion/wrapper.cc b/third_party/double-conversion/wrapper.cc index cad1df20a..d602ada44 100644 --- a/third_party/double-conversion/wrapper.cc +++ b/third_party/double-conversion/wrapper.cc @@ -18,6 +18,7 @@
╚─────────────────────────────────────────────────────────────────────────────*/ #include "libc/str/str.h" #include "third_party/double-conversion/double-conversion.h" +#include "third_party/double-conversion/double-to-string.h" #include "third_party/double-conversion/wrapper.h" namespace double_conversion { @@ -31,6 +32,16 @@ char* DoubleToEcmascript(char buf[128], double x) { return b.Finalize(); } +char* DoubleToLua(char buf[128], double x) { + static const DoubleToStringConverter kDoubleToLua( + DoubleToStringConverter::EMIT_TRAILING_DECIMAL_POINT | + DoubleToStringConverter::NO_TRAILING_ZERO, + "math.huge", "0/0", 'e', -6, 21, 6, 0); + StringBuilder b(buf, 128); + kDoubleToLua.ToShortest(x, &b); + return b.Finalize(); +} + double StringToDouble(const char* s, size_t n, int* out_processed) { if (n == -1) n = strlen(s); int flags = StringToDoubleConverter::ALLOW_CASE_INSENSITIVITY | diff --git a/third_party/double-conversion/wrapper.h b/third_party/double-conversion/wrapper.h index cbf8f14d2..0414345d4 100644 --- a/third_party/double-conversion/wrapper.h +++ b/third_party/double-conversion/wrapper.h @@ -3,6 +3,7 @@ #if !(__ASSEMBLER__ + __LINKER__ + 0) COSMOPOLITAN_C_START_ +char *DoubleToLua(char[128], double); char *DoubleToEcmascript(char[128], double); double StringToDouble(const char *, size_t, int *); diff --git a/third_party/lua/luaencodejsondata.c b/third_party/lua/luaencodejsondata.c index bb08fe303..ff0c5ab97 100644 --- a/third_party/lua/luaencodejsondata.c +++ b/third_party/lua/luaencodejsondata.c @@ -99,12 +99,17 @@ static int LuaEncodeJsonDataImpl(lua_State *L, char **buf, int level, // json tables must be arrays or use string keys goto OnError; } - RETURN_ON_ERROR(sli = AppendStrList(&sl)); - RETURN_ON_ERROR(LuaEncodeJsonDataImpl(L, &sl.p[sli], level - 1, - numformat, -2, visited)); -
RETURN_ON_ERROR(appendw(&sl.p[sli], ':')); - RETURN_ON_ERROR(LuaEncodeJsonDataImpl(L, &sl.p[sli], level - 1, - numformat, -1, visited)); + // the json parser inserts a `__json_object__` into empty + // objects, so we don't serialize `{}` as `[]` by mistake + // and as such, we should ignore it here, for readability + if (strcmp(luaL_checkstring(L, -2), "__json_object__")) { + RETURN_ON_ERROR(sli = AppendStrList(&sl)); + RETURN_ON_ERROR(LuaEncodeJsonDataImpl(L, &sl.p[sli], level - 1, + numformat, -2, visited)); + RETURN_ON_ERROR(appendw(&sl.p[sli], ':')); + RETURN_ON_ERROR(LuaEncodeJsonDataImpl(L, &sl.p[sli], level - 1, + numformat, -1, visited)); + } lua_pop(L, 1); // table/-2, key/-1 } // stack: table/-1, as the key was popped by lua_next diff --git a/third_party/lua/luaencodeluadata.c b/third_party/lua/luaencodeluadata.c index 877975c9f..be8507df6 100644 --- a/third_party/lua/luaencodeluadata.c +++ b/third_party/lua/luaencodeluadata.c @@ -25,6 +25,7 @@ #include "libc/stdio/append.internal.h" #include "libc/stdio/strlist.internal.h" #include "libc/x/x.h" +#include "third_party/double-conversion/wrapper.h" #include "third_party/lua/cosmo.h" #include "third_party/lua/lauxlib.h" #include "third_party/lua/lctype.h" @@ -126,22 +127,8 @@ static int LuaEncodeLuaDataImpl(lua_State *L, char **buf, int level, appendd(buf, ibuf, FormatFlex64(ibuf, luaL_checkinteger(L, idx), 2) - ibuf)); } else { - // TODO(jart): replace this api - while (*numformat == '%' || *numformat == '.' 
|| - isdigit(*numformat)) { - ++numformat; - } - switch (*numformat) { - case 'a': - case 'g': - case 'f': - fmt[4] = *numformat; - break; - default: - // prevent format string hacking - goto OnError; - } - RETURN_ON_ERROR(appendf(buf, fmt, lua_tonumber(L, idx))); + RETURN_ON_ERROR( + appends(buf, DoubleToLua(ibuf, lua_tonumber(L, idx)))); } return 0; diff --git a/tool/net/help.txt b/tool/net/help.txt index 820f44e3a..c74a3e146 100644 --- a/tool/net/help.txt +++ b/tool/net/help.txt @@ -675,7 +675,6 @@ FUNCTIONS ParseJson(input:str) ├─→ value:* - ├─→ true [if useoutput] └─→ nil, error:str Turns JSON string into a Lua data structure. @@ -706,8 +705,6 @@ FUNCTIONS - useoutput: (bool=false) encodes the result directly to the output buffer and returns `nil` value. This option is ignored if used outside of request handling code. - - numformat: sets numeric format to be used, which can be 'g', - 'f', or 'a' [experimental api] This function will fail if: diff --git a/tool/net/ljson.c b/tool/net/ljson.c index 731910468..6fec74c8d 100644 --- a/tool/net/ljson.c +++ b/tool/net/ljson.c @@ -16,11 +16,11 @@ │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ │ PERFORMANCE OF THIS SOFTWARE.
│ ╚─────────────────────────────────────────────────────────────────────────────*/ -#include "libc/errno.h" -#include "libc/limits.h" -#include "libc/math.h" -#include "libc/str/str.h" +#include "libc/bits/bits.h" +#include "libc/bits/likely.h" #include "libc/str/tpenc.h" +#include "libc/str/utf16.h" +#include "third_party/double-conversion/wrapper.h" #include "third_party/lua/lauxlib.h" #include "third_party/lua/lua.h" @@ -29,38 +29,80 @@ struct Rc { const char *p; }; -static int Accumulate(int x, int c, int d) { - if (!__builtin_mul_overflow(x, 10, &x) && - !__builtin_add_overflow(x, (c - '0') * d, &x)) { - return x; - } else { - errno = ERANGE; - if (d > 0) { - return INT_MAX; - } else { - return INT_MIN; - } - } -} - static struct Rc Parse(struct lua_State *L, const char *p, const char *e) { - uint64_t w; + long x; + char w[4]; struct Rc r; + const char *a; luaL_Buffer b; - int A, B, C, D; - int c, d, i, j, x, y, z; - for (d = +1; p < e;) { + int A, B, C, D, c, d, i, u; + for (a = p, d = +1; p < e;) { switch ((c = *p++ & 255)) { - case '-': + case ' ': // spaces + case '\n': + case '\r': + case '\t': + case ',': // separators + case ':': + default: + a = p; + break; + + case 'n': // null + if (p + 3 <= e && READ32LE(p - 1) == READ32LE("null")) { + lua_pushnil(L); + return (struct Rc){1, p + 3}; + } + break; + + case 't': // true + if (p + 3 <= e && READ32LE(p - 1) == READ32LE("true")) { + lua_pushboolean(L, true); + return (struct Rc){1, p + 3}; + } + break; + + case 'f': // false + if (p + 4 <= e && READ32LE(p) == READ32LE("alse")) { + lua_pushboolean(L, false); + return (struct Rc){1, p + 4}; + } + break; + + case '-': // negative d = -1; break; - case ']': - case '}': - return (struct Rc){0, p}; + case '0': // zero or number + if (p < e && *p == '.') { + goto UseDubble; + } + lua_pushinteger(L, 0); +
return (struct Rc){1, p}; - case '[': + case '1' ... '9': // integer + for (x = (c - '0') * d; p < e; ++p) { + c = *p & 255; + if (isdigit(c)) { + if (__builtin_mul_overflow(x, 10, &x) || + __builtin_add_overflow(x, (c - '0') * d, &x)) { + goto UseDubble; + } + } else if (c == '.' || c == 'e' || c == 'E') { + goto UseDubble; + } else { + break; + } + } + lua_pushinteger(L, x); + return (struct Rc){1, p}; + + UseDubble: // number + lua_pushnumber(L, StringToDouble(a, e - a, &c)); + return (struct Rc){1, a + c}; + + case '[': // Array lua_newtable(L); i = 0; do { @@ -72,7 +114,11 @@ static struct Rc Parse(struct lua_State *L, const char *p, const char *e) { } while (r.t); return (struct Rc){1, p}; - case '{': + case ']': + case '}': + return (struct Rc){0, p}; + + case '{': // Object lua_newtable(L); i = 0; do { @@ -85,16 +131,23 @@ static struct Rc Parse(struct lua_State *L, const char *p, const char *e) { lua_pushnil(L); } lua_settable(L, -3); + ++i; } } while (r.t); + if (!i) { + // we need this kludge so `{}` won't round-trip as `[]` + lua_pushstring(L, "__json_object__"); + lua_pushboolean(L, true); + lua_settable(L, -3); + } return (struct Rc){1, p}; - case '"': + case '"': // string luaL_buffinit(L, &b); while (p < e) { switch ((c = *p++ & 255)) { default: - AddChar: + AddByte: luaL_addchar(&b, c); break; case '\\': @@ -104,38 +157,86 @@ static struct Rc Parse(struct lua_State *L, const char *p, const char *e) { case '/': case '\\': default: - goto AddChar; + goto AddByte; case 'b': c = '\b'; - goto AddChar; + goto AddByte; case 'f': c = '\f'; - goto AddChar; + goto AddByte; case 'n': c = '\n'; - goto AddChar; + goto AddByte; case 'r': c = '\r'; - goto AddChar; + goto AddByte; case 't': c = '\t'; - goto AddChar; + goto AddByte; case 'u': if (p + 4 <= e && // (A = kHexToInt[p[0] & 255]) != -1 && // - (B = kHexToInt[p[1] & 255]) != -1 && // + (B = kHexToInt[p[1] & 255]) != -1 && // UCS-2 (C = kHexToInt[p[2] & 255]) != -1 && // - (D = kHexToInt[p[3] & 255]) != 
-1) { - p += 4; + (D = kHexToInt[p[3] & 255]) != -1) { // c = A << 12 | B << 8 | C << 4 | D; - w = tpenc(c); - do { - luaL_addchar(&b, w & 255); - } while ((w >>= 8)); - break; + if (!IsSurrogate(c)) { + p += 4; + } else if (IsHighSurrogate(c)) { + if (p + 4 + 6 <= e && // + p[4] == '\\' && // + p[5] == 'u' && // + (A = kHexToInt[p[6] & 255]) != -1 && // UTF-16 + (B = kHexToInt[p[7] & 255]) != -1 && // + (C = kHexToInt[p[8] & 255]) != -1 && // + (D = kHexToInt[p[9] & 255]) != -1) { // + u = A << 12 | B << 8 | C << 4 | D; + if (IsLowSurrogate(u)) { + p += 4 + 6; + c = MergeUtf16(c, u); + } else { + goto BadUnicode; + } + } else { + goto BadUnicode; + } + } else { + goto BadUnicode; + } + // UTF-8 + if (c < 0x7f) { + w[0] = c; + i = 1; + } else if (c <= 0x7ff) { + w[0] = 0300 | (c >> 6); + w[1] = 0200 | (c & 077); + i = 2; + } else if (c <= 0xffff) { + if (UNLIKELY(IsSurrogate(c))) { + ReplacementCharacter: + c = 0xfffd; + } + w[0] = 0340 | (c >> 12); + w[1] = 0200 | ((c >> 6) & 077); + w[2] = 0200 | (c & 077); + i = 3; + } else if (~(c >> 18) & 007) { + w[0] = 0360 | (c >> 18); + w[1] = 0200 | ((c >> 12) & 077); + w[2] = 0200 | ((c >> 6) & 077); + w[3] = 0200 | (c & 077); + i = 4; + } else { + goto ReplacementCharacter; + } + luaL_addlstring(&b, w, i); } else { - goto AddChar; + BadUnicode: + // Echo invalid \uXXXX sequences + // Rather than corrupting UTF-8! + luaL_addstring(&b, "\\u"); } + break; } } break; @@ -146,75 +247,33 @@ static struct Rc Parse(struct lua_State *L, const char *p, const char *e) { FinishString: luaL_pushresult(&b); return (struct Rc){1, p}; - - case '0' ... 
'9': - for (x = (c - '0') * d; p < e; ++p) { - c = *p & 255; - if (isdigit(c)) { - x = Accumulate(x, c, d); - } else if (c == '.') { - ++p; - goto Fraction; - } else if (c == 'e' || c == 'E') { - ++p; - j = 0; - y = 0; - goto Exponent; - } else { - break; - } - } - lua_pushinteger(L, x); - return (struct Rc){1, p}; - - Fraction: - for (j = y = 0; p < e; ++p) { - c = *p & 255; - if (isdigit(c)) { - --j; - y = Accumulate(y, c, d); - } else if (c == 'e' || c == 'E') { - ++p; - goto Exponent; - } else { - break; - } - } - lua_pushnumber(L, x + y * exp10(j)); - return (struct Rc){1, p}; - - Exponent: - d = +1; - for (z = 0; p < e; ++p) { - c = *p & 255; - if (isdigit(c)) { - z = Accumulate(z, c, d); - } else if (c == '-') { - d = -1; - } else if (c == '+') { - d = +1; - } else { - break; - } - } - lua_pushnumber(L, (x + y * exp10(j)) * exp10(z)); - return (struct Rc){1, p}; - - case ',': - case ':': - case ' ': - case '\n': - case '\r': - case '\t': - default: - break; } } return (struct Rc){0, p}; } /** - * Parses JSON data structure string into a Lua data structure. + * Parses JSON data structure string into Lua data structure. + * + * This function returns the number of items pushed to the Lua stack, + * which should be 1, unless no parseable JSON content was found, in + * which case this will return 0. On error -1 is returned. There's + * currently no error return condition. This function doesn't do JSON + * validity enforcement. + * + * JSON UTF-16 strings are re-encoded as valid UTF-8. 64-bit integers + * are supported. If an integer overflows during parsing, it'll be + * converted to a floating-point number instead. Invalid surrogate + * escape sequences in strings won't be decoded. + * + * A weird case exists when parsing empty objects. In order to let Lua + * tell them apart from empty arrays, we insert a special key that's + * ignored by our JSON serializer, called `[__json_object__]=true`. 
+ * + * @param L is Lua interpreter state + * @param p is input string + * @param n is byte length of `p` or -1 for automatic strlen() + * @return 1 if value was pushed, 0 on end, or -1 on error */ int ParseJson(struct lua_State *L, const char *p, size_t n) { if (n == -1) n = p ? strlen(p) : 0; diff --git a/tool/net/net.mk b/tool/net/net.mk index 20821b511..f6a450d36 100644 --- a/tool/net/net.mk +++ b/tool/net/net.mk @@ -68,7 +68,8 @@ TOOL_NET_DIRECTDEPS = \ THIRD_PARTY_ZLIB \ TOOL_ARGS \ TOOL_BUILD_LIB \ - TOOL_DECODE_LIB + TOOL_DECODE_LIB \ + THIRD_PARTY_DOUBLECONVERSION TOOL_NET_DEPS := \ $(call uniq,$(foreach x,$(TOOL_NET_DIRECTDEPS),$($(x))))