Further improve json / lua serialization

This commit is contained in:
Justine Tunney 2022-07-09 16:27:26 -07:00
parent 9e86980191
commit ee82cee432
10 changed files with 248 additions and 155 deletions

View file

@ -58,7 +58,9 @@ int AppendStrList(struct StrList *sl) {
} }
void SortStrList(struct StrList *sl) { void SortStrList(struct StrList *sl) {
qsort(sl->p, sl->i, sizeof(*sl->p), CompareStrings); if (sl->i) {
qsort(sl->p, sl->i, sizeof(*sl->p), CompareStrings);
}
} }
int JoinStrList(struct StrList *sl, char **buf, uint64_t sep) { int JoinStrList(struct StrList *sl, char **buf, uint64_t sep) {

View file

@ -20,49 +20,79 @@ assert(EncodeLua(ParseJson[[ 2.3 ]]) == '2.3')
assert(EncodeLua(ParseJson[[ [1,3,2] ]]) == '{1, 3, 2}') assert(EncodeLua(ParseJson[[ [1,3,2] ]]) == '{1, 3, 2}')
assert(EncodeLua(ParseJson[[ {1: 2, 3: 4} ]]) == '{[1]=2, [3]=4}') assert(EncodeLua(ParseJson[[ {1: 2, 3: 4} ]]) == '{[1]=2, [3]=4}')
assert(EncodeLua(ParseJson[[ {"foo": 2, "bar": 4} ]]) == '{bar=4, foo=2}') assert(EncodeLua(ParseJson[[ {"foo": 2, "bar": 4} ]]) == '{bar=4, foo=2}')
assert(EncodeLua(ParseJson[[ null ]]) == 'nil')
assert(EncodeLua(ParseJson[[ -123 ]]) == '-123') assert(EncodeLua(ParseJson[[ -123 ]]) == '-123')
assert(EncodeLua(ParseJson[[ 1.e6 ]]) == '1000000') assert(EncodeLua(ParseJson[[ 1e6 ]]) == '1000000.')
assert(EncodeLua(ParseJson[[ 1.e-6 ]]) == '1e-06') assert(EncodeLua(ParseJson[[ 1.e-6 ]]) == '0.000001')
assert(EncodeLua(ParseJson[[ 1e-06 ]]) == '1e-06') assert(EncodeLua(ParseJson[[ 1e-06 ]]) == '0.000001')
assert(EncodeLua(ParseJson[[ 9.123e6 ]]) == '9123000') assert(EncodeLua(ParseJson[[ 9.123e6 ]]) == '9123000.')
assert(EncodeLua(ParseJson[[ [{"heh": [1,3,2]}] ]]) == '{{heh={1, 3, 2}}}') assert(EncodeLua(ParseJson[[ [{"heh": [1,3,2]}] ]]) == '{{heh={1, 3, 2}}}')
assert(EncodeLua(ParseJson[[ 3.14159 ]]) == '3.14159') assert(EncodeLua(ParseJson[[ 3.14159 ]]) == '3.14159')
assert(EncodeLua(ParseJson[[ {3=4} ]]) == '{[3]=4}') assert(EncodeLua(ParseJson[[ {3=4} ]]) == '{[3]=4}')
assert(EncodeLua(ParseJson[[ 1e-12 ]]) == '1e-12') assert(EncodeLua(ParseJson[[ 1e-12 ]]) == '1e-12')
assert(EncodeJson(ParseJson[[ 1e-12 ]]) == '1e-12')
---------------------------------------------------------------------------------------------------- assert(EncodeJson(ParseJson[[ 1e-12 ]]) == '1e-12')
-- benchmarks assert(EncodeJson(ParseJson[[ true ]]) == 'true')
assert(EncodeJson(ParseJson[[ false ]]) == 'false')
assert(EncodeJson(ParseJson[[ null ]]) == 'null')
assert(EncodeJson(ParseJson[[ [] ]]) == '[]')
assert(EncodeJson(ParseJson[[ {} ]]) == '{}')
assert(ParseJson[["\f"]] == '\f') -- c0
assert(ParseJson[["\t"]] == '\t') -- c0
assert(ParseJson[["\n"]] == '\n') -- c0
assert(ParseJson[["\r"]] == '\r') -- c0
assert(ParseJson[["\\"]] == '\\') -- escaped backslash
assert(ParseJson[["\""]] == '\"') -- escaped quote
assert(ParseJson[["\u0100"]] == 'Ā') -- latin extended-a
assert(ParseJson[["\ud800\udf30\ud800\udf30"]] == '𐌰𐌰') -- utf-16 astral planes gothic
assert(ParseJson[["\uD800"]] == '\\uD800') -- utf-16 invalid (keep utf-8 well-formed)
assert(EncodeJson(ParseJson[[ -9223372036854775808 ]]) == '-9223372036854775808') -- minimum 64-bit integer
assert(EncodeJson(ParseJson[[ 9223372036854775807 ]]) == '9223372036854775807') -- maximum 64-bit integer
assert(EncodeJson(ParseJson[[ 9223372036854775808 ]]) == '9223372036854776000') -- switches to double due to integer overflow
assert(EncodeJson(ParseJson[[ -9223372036854775809 ]]) == '-9223372036854776000') -- switches to double due to integer underflow
assert(EncodeJson(ParseJson[[ 9223372036854775807.0 ]]) == '9223372036854776000') -- switches to double due to period mark
assert(EncodeJson(ParseJson[[ 2.7182818284590452354 ]]) == '2.718281828459045') -- euler constant w/ 17 digit precision
assert( EncodeLua(ParseJson[[ 2.7182818284590452354 ]]) == '2.718281828459045') -- euler constant w/ 17 digit precision
--------------------------------------------------------------------------------
-- benchmark nanos ticks
--------------------------------------------------------------------------------
-- JsonParseEmpty 23 72
-- JsonParseInteger 45 142
-- JsonParseDouble 108 335
-- JsonParseString 106 329
-- JsonParseArray 243 754
-- JsonParseObject 523 1622
function JsonParseEmpty() function JsonParseEmpty()
ParseJson[[]] ParseJson[[]]
end end
function JsonParseInt() function JsonParseInteger()
ParseJson[[ 314159 ]] ParseJson[[ -9223372036854775808 ]]
end end
function JsonParseDouble() function JsonParseDouble()
ParseJson[[ 3.14159 ]] ParseJson[[ 2.7182818284590452354 ]]
end
function JsonParseString()
ParseJson[[ "\ud800\udf30 he𐌰𐌰o \ud800\udf30" ]]
end end
function JsonParseArray() function JsonParseArray()
ParseJson[[ [3,1,4,1,5,9] ]] ParseJson[[ [123,456,789] ]]
end end
function JsonParseObject() function JsonParseObject()
ParseJson[[ {"3":"1","4":"1","5":"9"} ]] ParseJson[[ {"3":"1", "4":"1", "5":"9"} ]]
end end
print('JsonParseEmpty', Benchmark(JsonParseEmpty)) print('JsonParseEmpty', Benchmark(JsonParseEmpty))
print('JsonParseInt', Benchmark(JsonParseInt)) print('JsonParseInteg', Benchmark(JsonParseInteger))
print('JsonParseDouble', Benchmark(JsonParseDouble)) print('JsonParseDouble', Benchmark(JsonParseDouble))
print('JsonParseString', Benchmark(JsonParseString))
print('JsonParseArray', Benchmark(JsonParseArray)) print('JsonParseArray', Benchmark(JsonParseArray))
print('JsonParseObject', Benchmark(JsonParseObject)) print('JsonParseObject', Benchmark(JsonParseObject))
-- nanos ticks
-- JsonParseEmpty 24 77 85 1
-- JsonParseInt 31 96 82 1
-- JsonParseDouble 64 199 82 1
-- JsonParseArray 367 1139 80 1
-- JsonParseObject 425 1317 79 1

View file

@ -18,6 +18,7 @@
*/ */
#include "libc/str/str.h" #include "libc/str/str.h"
#include "third_party/double-conversion/double-conversion.h" #include "third_party/double-conversion/double-conversion.h"
#include "third_party/double-conversion/double-to-string.h"
#include "third_party/double-conversion/wrapper.h" #include "third_party/double-conversion/wrapper.h"
namespace double_conversion { namespace double_conversion {
@ -31,6 +32,16 @@ char* DoubleToEcmascript(char buf[128], double x) {
return b.Finalize(); return b.Finalize();
} }
char* DoubleToLua(char buf[128], double x) {
static const DoubleToStringConverter kDoubleToLua(
DoubleToStringConverter::EMIT_TRAILING_DECIMAL_POINT |
DoubleToStringConverter::NO_TRAILING_ZERO,
"math.huge", "0/0", 'e', -6, 21, 6, 0);
StringBuilder b(buf, 128);
kDoubleToLua.ToShortest(x, &b);
return b.Finalize();
}
double StringToDouble(const char* s, size_t n, int* out_processed) { double StringToDouble(const char* s, size_t n, int* out_processed) {
if (n == -1) n = strlen(s); if (n == -1) n = strlen(s);
int flags = StringToDoubleConverter::ALLOW_CASE_INSENSITIVITY | int flags = StringToDoubleConverter::ALLOW_CASE_INSENSITIVITY |

View file

@ -3,6 +3,7 @@
#if !(__ASSEMBLER__ + __LINKER__ + 0) #if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_ COSMOPOLITAN_C_START_
char *DoubleToLua(char[128], double);
char *DoubleToEcmascript(char[128], double); char *DoubleToEcmascript(char[128], double);
double StringToDouble(const char *, size_t, int *); double StringToDouble(const char *, size_t, int *);

View file

@ -99,12 +99,17 @@ static int LuaEncodeJsonDataImpl(lua_State *L, char **buf, int level,
// json tables must be arrays or use string keys // json tables must be arrays or use string keys
goto OnError; goto OnError;
} }
RETURN_ON_ERROR(sli = AppendStrList(&sl)); // the json parser inserts a `__json_object__` into empty
RETURN_ON_ERROR(LuaEncodeJsonDataImpl(L, &sl.p[sli], level - 1, // objects, so we don't serialize `{}` as `[]` by mistake
numformat, -2, visited)); // and as such, we should ignore it here, for readability
RETURN_ON_ERROR(appendw(&sl.p[sli], ':')); if (strcmp(luaL_checkstring(L, -2), "__json_object__")) {
RETURN_ON_ERROR(LuaEncodeJsonDataImpl(L, &sl.p[sli], level - 1, RETURN_ON_ERROR(sli = AppendStrList(&sl));
numformat, -1, visited)); RETURN_ON_ERROR(LuaEncodeJsonDataImpl(L, &sl.p[sli], level - 1,
numformat, -2, visited));
RETURN_ON_ERROR(appendw(&sl.p[sli], ':'));
RETURN_ON_ERROR(LuaEncodeJsonDataImpl(L, &sl.p[sli], level - 1,
numformat, -1, visited));
}
lua_pop(L, 1); // table/-2, key/-1 lua_pop(L, 1); // table/-2, key/-1
} }
// stack: table/-1, as the key was popped by lua_next // stack: table/-1, as the key was popped by lua_next

View file

@ -25,6 +25,7 @@
#include "libc/stdio/append.internal.h" #include "libc/stdio/append.internal.h"
#include "libc/stdio/strlist.internal.h" #include "libc/stdio/strlist.internal.h"
#include "libc/x/x.h" #include "libc/x/x.h"
#include "third_party/double-conversion/wrapper.h"
#include "third_party/lua/cosmo.h" #include "third_party/lua/cosmo.h"
#include "third_party/lua/lauxlib.h" #include "third_party/lua/lauxlib.h"
#include "third_party/lua/lctype.h" #include "third_party/lua/lctype.h"
@ -126,22 +127,8 @@ static int LuaEncodeLuaDataImpl(lua_State *L, char **buf, int level,
appendd(buf, ibuf, appendd(buf, ibuf,
FormatFlex64(ibuf, luaL_checkinteger(L, idx), 2) - ibuf)); FormatFlex64(ibuf, luaL_checkinteger(L, idx), 2) - ibuf));
} else { } else {
// TODO(jart): replace this api RETURN_ON_ERROR(
while (*numformat == '%' || *numformat == '.' || appends(buf, DoubleToLua(ibuf, lua_tonumber(L, idx))));
isdigit(*numformat)) {
++numformat;
}
switch (*numformat) {
case 'a':
case 'g':
case 'f':
fmt[4] = *numformat;
break;
default:
// prevent format string hacking
goto OnError;
}
RETURN_ON_ERROR(appendf(buf, fmt, lua_tonumber(L, idx)));
} }
return 0; return 0;

View file

@ -675,7 +675,6 @@ FUNCTIONS
ParseJson(input:str) ParseJson(input:str)
├─→ value:* ├─→ value:*
├─→ true [if useoutput]
└─→ nil, error:str └─→ nil, error:str
Turns JSON string into a Lua data structure. Turns JSON string into a Lua data structure.
@ -706,8 +705,6 @@ FUNCTIONS
- useoutput: (bool=false) encodes the result directly to the - useoutput: (bool=false) encodes the result directly to the
output buffer and returns `nil` value. This option is output buffer and returns `nil` value. This option is
ignored if used outside of request handling code. ignored if used outside of request handling code.
- numformat: sets numeric format to be used, which can be 'g',
'f', or 'a' [experimental api]
This function will fail if: This function will fail if:

View file

@ -16,11 +16,11 @@
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE. PERFORMANCE OF THIS SOFTWARE.
*/ */
#include "libc/errno.h" #include "libc/bits/bits.h"
#include "libc/limits.h" #include "libc/bits/likely.h"
#include "libc/math.h"
#include "libc/str/str.h"
#include "libc/str/tpenc.h" #include "libc/str/tpenc.h"
#include "libc/str/utf16.h"
#include "third_party/double-conversion/wrapper.h"
#include "third_party/lua/lauxlib.h" #include "third_party/lua/lauxlib.h"
#include "third_party/lua/lua.h" #include "third_party/lua/lua.h"
@ -29,38 +29,80 @@ struct Rc {
const char *p; const char *p;
}; };
static int Accumulate(int x, int c, int d) {
if (!__builtin_mul_overflow(x, 10, &x) &&
!__builtin_add_overflow(x, (c - '0') * d, &x)) {
return x;
} else {
errno = ERANGE;
if (d > 0) {
return INT_MAX;
} else {
return INT_MIN;
}
}
}
static struct Rc Parse(struct lua_State *L, const char *p, const char *e) { static struct Rc Parse(struct lua_State *L, const char *p, const char *e) {
uint64_t w; long x;
char w[4];
struct Rc r; struct Rc r;
const char *a;
luaL_Buffer b; luaL_Buffer b;
int A, B, C, D; int A, B, C, D, c, d, i, u;
int c, d, i, j, x, y, z; for (a = p, d = +1; p < e;) {
for (d = +1; p < e;) {
switch ((c = *p++ & 255)) { switch ((c = *p++ & 255)) {
case '-': case ' ': // spaces
case '\n':
case '\r':
case '\t':
case ',': // separators
case ':':
default:
a = p;
break;
case 'n': // null
if (p + 3 <= e && READ32LE(p - 1) == READ32LE("null")) {
lua_pushnil(L);
return (struct Rc){1, p + 3};
}
break;
case 't': // true
if (p + 3 <= e && READ32LE(p - 1) == READ32LE("true")) {
lua_pushboolean(L, true);
return (struct Rc){1, p + 3};
}
break;
case 'f': // false
if (p + 4 <= e && READ32LE(p) == READ32LE("alse")) {
lua_pushboolean(L, false);
return (struct Rc){1, p + 4};
}
break;
case '-': // negative
d = -1; d = -1;
break; break;
case ']': case '0': // zero or number
case '}': if (p < e && *p == '.') {
return (struct Rc){0, p}; goto UseDubble;
}
lua_pushinteger(L, 0);
return (struct Rc){1, p};
case '[': case '1' ... '9': // integer
for (x = (c - '0') * d; p < e; ++p) {
c = *p & 255;
if (isdigit(c)) {
if (__builtin_mul_overflow(x, 10, &x) ||
__builtin_add_overflow(x, (c - '0') * d, &x)) {
goto UseDubble;
}
} else if (c == '.' || c == 'e' || c == 'E') {
goto UseDubble;
} else {
break;
}
}
lua_pushinteger(L, x);
return (struct Rc){1, p};
UseDubble: // number
lua_pushnumber(L, StringToDouble(a, e - a, &c));
return (struct Rc){1, a + c};
case '[': // Array
lua_newtable(L); lua_newtable(L);
i = 0; i = 0;
do { do {
@ -72,7 +114,11 @@ static struct Rc Parse(struct lua_State *L, const char *p, const char *e) {
} while (r.t); } while (r.t);
return (struct Rc){1, p}; return (struct Rc){1, p};
case '{': case ']':
case '}':
return (struct Rc){0, p};
case '{': // Object
lua_newtable(L); lua_newtable(L);
i = 0; i = 0;
do { do {
@ -85,16 +131,23 @@ static struct Rc Parse(struct lua_State *L, const char *p, const char *e) {
lua_pushnil(L); lua_pushnil(L);
} }
lua_settable(L, -3); lua_settable(L, -3);
++i;
} }
} while (r.t); } while (r.t);
if (!i) {
// we need this kludge so `{}` won't round-trip as `[]`
lua_pushstring(L, "__json_object__");
lua_pushboolean(L, true);
lua_settable(L, -3);
}
return (struct Rc){1, p}; return (struct Rc){1, p};
case '"': case '"': // string
luaL_buffinit(L, &b); luaL_buffinit(L, &b);
while (p < e) { while (p < e) {
switch ((c = *p++ & 255)) { switch ((c = *p++ & 255)) {
default: default:
AddChar: AddByte:
luaL_addchar(&b, c); luaL_addchar(&b, c);
break; break;
case '\\': case '\\':
@ -104,38 +157,86 @@ static struct Rc Parse(struct lua_State *L, const char *p, const char *e) {
case '/': case '/':
case '\\': case '\\':
default: default:
goto AddChar; goto AddByte;
case 'b': case 'b':
c = '\b'; c = '\b';
goto AddChar; goto AddByte;
case 'f': case 'f':
c = '\f'; c = '\f';
goto AddChar; goto AddByte;
case 'n': case 'n':
c = '\n'; c = '\n';
goto AddChar; goto AddByte;
case 'r': case 'r':
c = '\r'; c = '\r';
goto AddChar; goto AddByte;
case 't': case 't':
c = '\t'; c = '\t';
goto AddChar; goto AddByte;
case 'u': case 'u':
if (p + 4 <= e && // if (p + 4 <= e && //
(A = kHexToInt[p[0] & 255]) != -1 && // (A = kHexToInt[p[0] & 255]) != -1 && //
(B = kHexToInt[p[1] & 255]) != -1 && // (B = kHexToInt[p[1] & 255]) != -1 && // UCS-2
(C = kHexToInt[p[2] & 255]) != -1 && // (C = kHexToInt[p[2] & 255]) != -1 && //
(D = kHexToInt[p[3] & 255]) != -1) { (D = kHexToInt[p[3] & 255]) != -1) { //
p += 4;
c = A << 12 | B << 8 | C << 4 | D; c = A << 12 | B << 8 | C << 4 | D;
w = tpenc(c); if (!IsSurrogate(c)) {
do { p += 4;
luaL_addchar(&b, w & 255); } else if (IsHighSurrogate(c)) {
} while ((w >>= 8)); if (p + 4 + 6 <= e && //
break; p[4] == '\\' && //
p[5] == 'u' && //
(A = kHexToInt[p[6] & 255]) != -1 && // UTF-16
(B = kHexToInt[p[7] & 255]) != -1 && //
(C = kHexToInt[p[8] & 255]) != -1 && //
(D = kHexToInt[p[9] & 255]) != -1) { //
u = A << 12 | B << 8 | C << 4 | D;
if (IsLowSurrogate(u)) {
p += 4 + 6;
c = MergeUtf16(c, u);
} else {
goto BadUnicode;
}
} else {
goto BadUnicode;
}
} else {
goto BadUnicode;
}
// UTF-8: encode code point `c` into `w`, leaving the byte count in `i`
//
// U+007F must use the single-byte form like every other ASCII code
// point. Comparing with `<` instead of `<=` would push it into the
// two-byte branch and emit the overlong sequence C1 BF, which is
// malformed UTF-8.
if (c <= 0x7f) {
  w[0] = c;
  i = 1;
} else if (c <= 0x7ff) {
  w[0] = 0300 | (c >> 6);
  w[1] = 0200 | (c & 077);
  i = 2;
} else if (c <= 0xffff) {
  if (UNLIKELY(IsSurrogate(c))) {
  ReplacementCharacter:
    // also the landing point for code points too large to encode
    c = 0xfffd;
  }
  w[0] = 0340 | (c >> 12);
  w[1] = 0200 | ((c >> 6) & 077);
  w[2] = 0200 | (c & 077);
  i = 3;
} else if (~(c >> 18) & 007) {
  w[0] = 0360 | (c >> 18);
  w[1] = 0200 | ((c >> 12) & 077);
  w[2] = 0200 | ((c >> 6) & 077);
  w[3] = 0200 | (c & 077);
  i = 4;
} else {
  goto ReplacementCharacter;
}
luaL_addlstring(&b, w, i);
} else { } else {
goto AddChar; BadUnicode:
// Echo invalid \uXXXX sequences
// Rather than corrupting UTF-8!
luaL_addstring(&b, "\\u");
} }
break;
} }
} }
break; break;
@ -146,75 +247,33 @@ static struct Rc Parse(struct lua_State *L, const char *p, const char *e) {
FinishString: FinishString:
luaL_pushresult(&b); luaL_pushresult(&b);
return (struct Rc){1, p}; return (struct Rc){1, p};
case '0' ... '9':
for (x = (c - '0') * d; p < e; ++p) {
c = *p & 255;
if (isdigit(c)) {
x = Accumulate(x, c, d);
} else if (c == '.') {
++p;
goto Fraction;
} else if (c == 'e' || c == 'E') {
++p;
j = 0;
y = 0;
goto Exponent;
} else {
break;
}
}
lua_pushinteger(L, x);
return (struct Rc){1, p};
Fraction:
for (j = y = 0; p < e; ++p) {
c = *p & 255;
if (isdigit(c)) {
--j;
y = Accumulate(y, c, d);
} else if (c == 'e' || c == 'E') {
++p;
goto Exponent;
} else {
break;
}
}
lua_pushnumber(L, x + y * exp10(j));
return (struct Rc){1, p};
Exponent:
d = +1;
for (z = 0; p < e; ++p) {
c = *p & 255;
if (isdigit(c)) {
z = Accumulate(z, c, d);
} else if (c == '-') {
d = -1;
} else if (c == '+') {
d = +1;
} else {
break;
}
}
lua_pushnumber(L, (x + y * exp10(j)) * exp10(z));
return (struct Rc){1, p};
case ',':
case ':':
case ' ':
case '\n':
case '\r':
case '\t':
default:
break;
} }
} }
return (struct Rc){0, p}; return (struct Rc){0, p};
} }
/** /**
* Parses JSON data structure string into a Lua data structure. * Parses JSON data structure string into Lua data structure.
*
* This function returns the number of items pushed to the Lua stack,
* which is 1 on success, or 0 if no parseable JSON content was found.
* The value -1 is reserved for reporting errors, but there's
* currently no condition that triggers it. This function doesn't
* enforce JSON validity.
*
* JSON UTF-16 strings are re-encoded as valid UTF-8. 64-bit integers
* are supported. If an integer overflows during parsing, it'll be
* converted to a floating-point number instead. Invalid surrogate
* escape sequences in strings won't be decoded.
*
* A weird case exists when parsing empty objects. In order to let Lua
* tell them apart from empty arrays, we insert a special key that's
* ignored by our JSON serializer, called `[__json_object__]=true`.
*
* @param L is Lua interpreter state
* @param p is input string
* @param n is byte length of `p` or -1 for automatic strlen()
* @return 1 if value was pushed, 0 on end, or -1 on error
*/ */
int ParseJson(struct lua_State *L, const char *p, size_t n) { int ParseJson(struct lua_State *L, const char *p, size_t n) {
if (n == -1) n = p ? strlen(p) : 0; if (n == -1) n = p ? strlen(p) : 0;

View file

@ -68,7 +68,8 @@ TOOL_NET_DIRECTDEPS = \
THIRD_PARTY_ZLIB \ THIRD_PARTY_ZLIB \
TOOL_ARGS \ TOOL_ARGS \
TOOL_BUILD_LIB \ TOOL_BUILD_LIB \
TOOL_DECODE_LIB TOOL_DECODE_LIB \
THIRD_PARTY_DOUBLECONVERSION
TOOL_NET_DEPS := \ TOOL_NET_DEPS := \
$(call uniq,$(foreach x,$(TOOL_NET_DIRECTDEPS),$($(x)))) $(call uniq,$(foreach x,$(TOOL_NET_DIRECTDEPS),$($(x))))