add unicode escapes

This commit is contained in:
Evan Jones 2023-07-10 23:26:09 -04:00
parent 38fbd4001e
commit 014fbfd4a9
2 changed files with 43 additions and 33 deletions

View file

@ -50,15 +50,27 @@ namespace grammar_parser {
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9'); return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9');
} }
int hex_to_int(char c) { std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
if ('a' <= c && c <= 'f') { const char * pos = src;
return c - 'a' + 10; const char * end = src + size;
} else if ('A' <= c && c <= 'F') { uint32_t value = 0;
return c - 'A' + 10; for ( ; pos < end && *pos; pos++) {
} else if ('0' <= c && c <= '9') { value <<= 4;
return c - '0'; char c = *pos;
if ('a' <= c && c <= 'f') {
value += c - 'a' + 10;
} else if ('A' <= c && c <= 'F') {
value += c - 'A' + 10;
} else if ('0' <= c && c <= '9') {
value += c - '0';
} else {
break;
}
} }
return -1; if (pos != end) {
throw std::runtime_error("expecting " + std::to_string(size) + " hex chars at " + src);
}
return std::make_pair(value, pos);
} }
const char * parse_space(const char * src, bool newline_ok) { const char * parse_space(const char * src, bool newline_ok) {
@ -89,30 +101,23 @@ namespace grammar_parser {
std::pair<uint32_t, const char *> parse_char(const char * src) { std::pair<uint32_t, const char *> parse_char(const char * src) {
if (*src == '\\') { if (*src == '\\') {
char esc = src[1]; switch (src[1]) {
// TODO: 16- and 32-bit escapes case 'x': return parse_hex(src + 2, 2);
if (esc == 'x') { case 'u': return parse_hex(src + 2, 4);
int first = hex_to_int(src[2]); case 'U': return parse_hex(src + 2, 8);
if (first > -1) { case 't': return std::make_pair('\t', src + 2);
int second = hex_to_int(src[3]); case 'r': return std::make_pair('\r', src + 2);
if (second > -1) { case 'n': return std::make_pair('\n', src + 2);
return std::make_pair((first << 4) + second, src + 4); case '\\':
} case '"':
} case '[':
throw std::runtime_error(std::string("expecting \\xNN at ") + src); case ']':
} else if (esc == '"' || esc == '[' || esc == ']') { return std::make_pair(src[1], src + 2);
return std::make_pair(esc, src + 2); default:
} else if (esc == 'r') { throw std::runtime_error(std::string("unknown escape at ") + src);
return std::make_pair('\r', src + 2);
} else if (esc == 'n') {
return std::make_pair('\n', src + 2);
} else if (esc == 't') {
return std::make_pair('\t', src + 2);
} }
throw std::runtime_error(std::string("unknown escape at ") + src);
} else if (*src) { } else if (*src) {
auto decoded = decode_utf8(src); return decode_utf8(src);
return std::make_pair(decoded.first, decoded.second);
} }
throw std::runtime_error("unexpected end of input"); throw std::runtime_error("unexpected end of input");
} }

View file

@ -15,10 +15,15 @@ array ::=
("," ws value)* ("," ws value)*
)? "]" )? "]"
# Subset of JSON primitives: strings without escapes and only regular integers string ::=
string ::= "\"" [ \t!#-\[\]-~]* "\"" ws "\"" (
[\x20\x21\x23-\x5b\x5d-\U0010FFFF] | # any code point except " (\x22) and \ (\x5c)
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
)* "\""
# Only plain integers currently
number ::= "-"? [0-9]+ ws number ::= "-"? [0-9]+ ws
boolean ::= ("true" | "false") ws boolean ::= ("true" | "false") ws
# Optional space: by convention, applied in this grammar after literal chars when allowed # Optional space: by convention, applied in this grammar after literal chars when allowed
ws ::= [ \t\n] ws | ws ::= ([ \t\n] ws)?