json: fix string patterns (was missing quotes)

2024-03-18 04:06:23 +00:00 · 2024-03-18 04:06:23 +00:00 · 24f0b941cf
commit 24f0b941cf
parent dd922a4da3
7 changed files with 1579 additions and 1500 deletions
--- a/examples/json-schema-to-grammar.py
+++ b/examples/json-schema-to-grammar.py
@ -44,16 +44,17 @@ GRAMMAR_RANGE_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"\]\-\\]')
 GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]'}
 NON_LITERAL_SET = set('|.()[]{}*+?')
-ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('{*+?')
+ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('[]()|{}*+?')
 DATE_PATTERN = '[0-9]{4}-(0[1-9]|1[0-2])-([0-2][0-9]|3[0-1])'
 TIME_PATTERN = '([01][0-9]|2[0-3])(:[0-5][0-9]){2}(\\.[0-9]{1,3})?(Z|[+-](([01][0-9]|2[0-3]):[0-5][0-9]))' # Cap millisecond precision w/ 3 digits
 class SchemaConverter:
-    def __init__(self, *, prop_order, allow_fetch, dotall):
+    def __init__(self, *, prop_order, allow_fetch, dotall, raw_pattern):
        self._prop_order = prop_order
        self._allow_fetch = allow_fetch
        self._dotall = dotall
        self._raw_pattern = raw_pattern
        self._rules = {'space': SPACE_RULE}
        self._refs = {}
        self._refs_being_resolved = set()
@ -152,6 +153,10 @@ class SchemaConverter:
        i = 0
        length = len(pattern)
        def to_rule(s: Tuple[str, bool]) -> str:
            '''
                Render a (text, is_literal) pair as rule text: a literal is wrapped in double quotes, anything else is returned unchanged.
            '''
            (txt, is_literal) = s
            return "\"" + txt + "\"" if is_literal else txt
        def transform() -> Tuple[str, bool]:
            '''
                Parse a unit at index i (advancing it), and return its string representation + whether it's a literal.
@ -180,13 +185,12 @@ class SchemaConverter:
                ret = []
                for is_literal, g in itertools.groupby(seq, lambda x: x[1]):
                    if is_literal:
-                        lit = ''.join(x[0][1:-1] for x in g)
+                        ret.append((''.join(x[0] for x in g), True))
                        ret.append((f'"{lit}"', True))
                    else:
                        ret.extend(g)
                if len(ret) == 1:
                    return ret[0]
-                return (' '.join(x[0] for x in seq), False)
+                return (' '.join(to_rule(x) for x in seq), False)
            while i < length:
                c = pattern[i]
@ -197,7 +201,7 @@ class SchemaConverter:
                    i += 1
                    if i < length:
                        assert pattern[i] != '?', f'Unsupported pattern syntax "{pattern[i]}" at index {i} of /{pattern}/'
-                    seq.append((f'({transform()[0]})', False))
+                    seq.append((f'({to_rule(transform())})', False))
                elif c == ')':
                    i += 1
                    assert start > 0 and pattern[start-1] == '(', f'Unbalanced parentheses; start = {start}, i = {i}, pattern = {pattern}'
@ -220,7 +224,7 @@ class SchemaConverter:
                    seq.append(('|', False))
                    i += 1
                elif c in ('*', '+', '?'):
-                    seq[-1] = (f'{seq[-1][0]}{c}', False)
+                    seq[-1] = (to_rule(seq[-1]) + c, False)
                    i += 1
                elif c == '{':
                    curly_brackets = c
@ -232,13 +236,18 @@ class SchemaConverter:
                    curly_brackets += '}'
                    i += 1
                    nums = [s.strip() for s in curly_brackets[1:-1].split(',')]
-                    if len(nums) == 1:
+                    min_times = 0
-                        min_times = int(nums[0])
+                    max_times = None
-                        max_times = min_times
+                    try:
-                    else:
+                        if len(nums) == 1:
-                        assert len(nums) == 2
+                            min_times = int(nums[0])
-                        min_times = int(nums[0]) if nums[0] else 0
+                            max_times = min_times
-                        max_times = int(nums[1]) if nums[1] else None
+                        else:
                            assert len(nums) == 2
                            min_times = int(nums[0]) if nums[0] else 0
                            max_times = int(nums[1]) if nums[1] else None
                    except ValueError:
                        raise ValueError(f'Invalid quantifier {curly_brackets} in /{pattern}/')
                    (sub, sub_is_literal) = seq[-1]
@ -263,32 +272,35 @@ class SchemaConverter:
                            False
                        )
                else:
-                    lit = ''
+                    literal = ''
-                    while i < length and pattern[i] not in NON_LITERAL_SET \
+                    while i < length:
                            and not (i < length - 1 and pattern[i+1] in ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS):
                        if pattern[i] == '\\' and i < length - 1:
-                            i += 1
+                            next = pattern[i + 1]
-                            if pattern[i] in NON_LITERAL_SET:
+                            if next in ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS:
-                                # Escapes in regular expressions that aren't escaped in GBNF literals
+                                i += 1
-                                lit += pattern[i]
+                                literal += pattern[i]
                                i += 1
                            else:
-                                lit += f'\\{pattern[i]}'
+                                literal += pattern[i:i+2]
                                i += 2
                        elif pattern[i] == '"' and not self._raw_pattern:
                            literal += '\\"'
                            i += 1
                        elif pattern[i] not in NON_LITERAL_SET and \
                                (i == length - 1 or literal == '' or pattern[i+1] == '.' or pattern[i+1] not in NON_LITERAL_SET):
                            literal += pattern[i]
                            i += 1
                        else:
-                            if pattern[i] == '"':
+                            break
-                                lit += '\\'
+                    if literal:
-                            lit += pattern[i]
+                        seq.append((literal, True))
                            i += 1
                    if lit:
                        seq.append((f'"{lit}"', True))
                    if i < length and pattern[i] not in ('.', '(', ')', '|', '[', '{', '*', '+', '?'):
                        seq.append((f'"{pattern[i]}"', True))
                        i += 1
            return join_seq()
-        return self._add_rule(name, transform()[0])
+        return self._add_rule(
            name,
            to_rule(transform()) if self._raw_pattern \
                else "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space")
    def _resolve_ref(self, ref):
@ -510,6 +522,11 @@ def main(args_in = None):
        action='store_true',
        default=False,
        help='Whether to treat dot (".") as matching all chars including line breaks in regular expression patterns')
    parser.add_argument(
        '--raw-pattern',
        action='store_true',
        default=False,
        help='Treats string patterns as raw patterns w/o quotes (or quote escapes)')
    parser.add_argument('schema', help='file containing JSON schema ("-" for stdin)')
    args = parser.parse_args(args_in)
@ -528,7 +545,8 @@ def main(args_in = None):
    converter = SchemaConverter(
        prop_order={name: idx for idx, name in enumerate(args.prop_order)},
        allow_fetch=args.allow_fetch,
-        dotall=args.dotall)
+        dotall=args.dotall,
        raw_pattern=args.raw_pattern)
    schema = converter.resolve_refs(schema, url)
    converter.visit(schema, '')
    print(converter.format_grammar())
--- a/examples/regex-to-grammar.py
+++ b/examples/regex-to-grammar.py
@ -11,6 +11,7 @@ print(subprocess.check_output(
      "json-schema-to-grammar.py"),
    *rest,
    "-",
    "--raw-pattern",
  ],
  text=True,
  input=json.dumps({
--- a/examples/server/json-schema-to-grammar.cpp
+++ b/examples/server/json-schema-to-grammar.cpp
@ -62,7 +62,7 @@ unordered_map<char, string> GRAMMAR_LITERAL_ESCAPES = {
 };
 unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
-unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'{', '*', '+', '?'};
+unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};
 template <typename Iterator>
 string join(Iterator begin, Iterator end, const string& separator) {
@ -186,9 +186,15 @@ private:
        size_t i = 0;
        size_t length = sub_pattern.length();
-        std::function<pair<string, bool>()> transform = [&]() -> pair<string, bool> {
+        using literal_or_rule = pair<string, bool>;
        // Renders a (text, is_literal) pair as rule text: a literal is wrapped in
        // double quotes, anything else is returned unchanged.
        auto to_rule = [&](const literal_or_rule& ls) {
            auto is_literal = ls.second;
            auto s = ls.first;
            return is_literal ? "\"" + s + "\"" : s;
        };
        std::function<literal_or_rule()> transform = [&]() -> literal_or_rule {
            size_t start = i;
-            vector<pair<string, bool>> seq;
+            vector<literal_or_rule> seq;
            auto get_dot = [&]() {
                string rule;
@ -202,28 +208,32 @@ private:
            // Joins the sequence, merging consecutive literals together.
            auto join_seq = [&]() {
-                vector<string> results;
+                vector<literal_or_rule> ret;
                string literal;
                auto flush_literal = [&]() {
                  if (literal.empty()) {
                    return false;
                  }
-                  results.push_back("\"" + literal + "\"");
+                  ret.push_back(make_pair(literal, true));
                  literal.clear();
                  return true;
                };
                for (const auto& item : seq) {
-                    if (item.second) {
+                    auto is_literal = item.second;
-                      literal += item.first.substr(1, item.first.length() - 2);
+                    if (is_literal) {
                      literal += item.first;
                    } else {
                      flush_literal();
-                      results.push_back(item.first);
+                      ret.push_back(item);
                    }
                }
-                if (flush_literal() && results.size() == 1) {
+                flush_literal();
-                    return make_pair(results[0], true);
+
                vector<string> results;
                for (const auto& item : ret) {
                    results.push_back(to_rule(item));
                }
                return make_pair(join(results.begin(), results.end(), " "), false);
            };
@ -240,8 +250,7 @@ private:
                            _warnings.push_back("Unsupported pattern syntax");
                        }
                    }
-                    auto sub_result = transform();
+                    seq.push_back(make_pair("(" + to_rule(transform()) + ")", false));
                    seq.push_back(make_pair("(" + sub_result.first + ")", false));
                } else if (c == ')') {
                    i++;
                    if (start > 0 && sub_pattern[start - 1] != '(') {
@ -270,7 +279,7 @@ private:
                    seq.push_back(make_pair("|", false));
                    i++;
                } else if (c == '*' || c == '+' || c == '?') {
-                    seq.back().first += c;
+                    seq.back() = make_pair(to_rule(seq.back()) + c, false);
                    i++;
                } else if (c == '{') {
                    string curly_brackets = string(1, c);
@ -287,17 +296,22 @@ private:
                    auto nums = split(curly_brackets.substr(1, curly_brackets.length() - 2), ",");
                    int min_times = 0;
                    int max_times = numeric_limits<int>::max();
-                    if (nums.size() == 1) {
+                    try {
-                        min_times = max_times = stoi(nums[0]);
+                        if (nums.size() == 1) {
-                    } else if (nums.size() != 2) {
+                            min_times = max_times = std::stoi(nums[0]);
-                        _errors.push_back("Wrong number of values in curly brackets");
+                        } else if (nums.size() != 2) {
-                    } else {
+                            _errors.push_back("Wrong number of values in curly brackets");
-                        if (!nums[0].empty()) {
+                        } else {
-                            min_times = stoi(nums[0]);
+                            if (!nums[0].empty()) {
-                        }
+                                min_times = std::stoi(nums[0]);
-                        if (!nums[1].empty()) {
+                            }
-                            max_times = stoi(nums[1]);
+                            if (!nums[1].empty()) {
                                max_times = std::stoi(nums[1]);
                            }
                        }
                    } catch (const std::invalid_argument& e) {
                        _errors.push_back("Invalid number in curly brackets");
                        return make_pair("", false);
                    }
                    auto &last = seq.back();
                    auto &sub = last.first;
@ -346,36 +360,39 @@ private:
                    }
                } else {
                    string literal;
-                    while (i < length && NON_LITERAL_SET.find(sub_pattern[i]) == NON_LITERAL_SET.end() &&
+                    auto is_non_literal = [&](char c) {
-                          (i == length - 1 || ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS.find(sub_pattern[i + 1]) == ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS.end())) {
+                        return NON_LITERAL_SET.find(c) != NON_LITERAL_SET.end();
                    };
                    while (i < length) {
                        if (sub_pattern[i] == '\\' && i < length - 1) {
-                            i++;
+                            char next = sub_pattern[i + 1];
-                            if (NON_LITERAL_SET.find(sub_pattern[i]) != NON_LITERAL_SET.end()) {
+                            if (ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS.find(next) != ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS.end()) {
                                i++;
                                literal += sub_pattern[i];
                                i++;
                            } else {
-                                literal += "\\" + string(1, sub_pattern[i]);
+                                literal += sub_pattern.substr(i, 2);
                                i += 2;
                            }
                        } else if (sub_pattern[i] == '"') {
                            literal += "\\\"";
                            i++;
-                        } else {
+                        } else if (!is_non_literal(sub_pattern[i]) &&
-                            if (sub_pattern[i] == '"') {
+                                (i == length - 1 || literal.empty() || sub_pattern[i + 1] == '.' || !is_non_literal(sub_pattern[i + 1]))) {
                                literal += "\\";
                            }
                            literal += sub_pattern[i];
                            i++;
                        } else {
                            break;
                        }
                    }
                    if (!literal.empty()) {
-                        seq.push_back(make_pair("\"" + literal + "\"", true));
+                        seq.push_back(make_pair(literal, true));
                    }
                    if (i < length && NON_LITERAL_SET.find(sub_pattern[i]) == NON_LITERAL_SET.end()) {
                        seq.push_back(make_pair("\"" + string(1, sub_pattern[i]) + "\"", true));
                        i++;
                    }
                }
            }
            return join_seq();
        };
-        return _add_rule(name, transform().first);
+        return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space");
    }
    string _resolve_ref(const string& ref) {
--- a/examples/server/json-schema-to-grammar.mjs.hpp
+++ b/examples/server/json-schema-to-grammar.mjs.hpp
--- a/examples/server/public/index.js
+++ b/examples/server/public/index.js
--- a/examples/server/public/json-schema-to-grammar.mjs
+++ b/examples/server/public/json-schema-to-grammar.mjs
@ -35,7 +35,7 @@ const GRAMMAR_RANGE_LITERAL_ESCAPE_RE = /[\n\r"\]\-\\]/g;
 const GRAMMAR_LITERAL_ESCAPES = { '\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]' };
 const NON_LITERAL_SET = new Set('|.()[]{}*+?');
-const ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = new Set('{*+?');
+const ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = new Set('[]()|{}*+?');
 export class SchemaConverter {
  constructor(options) {
@ -163,6 +163,9 @@ export class SchemaConverter {
      return this._addRule('dot', rule);
    };
    // Renders a [text, isLiteral] pair as rule text: a literal is wrapped in double quotes, anything else is returned unchanged.
    const toRule = ([s, isLiteral]) => isLiteral ? "\"" + s + "\"" : s;
    const transform = () => {
      const start = i;
      // For each component of this sequence, store its string representation and whether it's a literal.
@ -175,8 +178,7 @@ export class SchemaConverter {
        const ret = [];
        for (const [isLiteral, g] of groupBy(seq, x => x[1])) {
          if (isLiteral) {
-            const lit = [...g].map(x => x[0].slice(1, -1)).join('');
+            ret.push([[...g].map(x => x[0]).join(''), true]);
            ret.push([`"${lit}"`, true]);
          } else {
            ret.push(...g);
          }
@ -184,7 +186,7 @@ export class SchemaConverter {
        if (ret.length === 1) {
          return ret[0];
        }
-        return [ret.map(x => x[0]).join(' '), false];
+        return [ret.map(x => toRule(x)).join(' '), false];
      };
      while (i < length) {
@ -199,7 +201,7 @@ export class SchemaConverter {
              throw new Error(`Unsupported pattern syntax "${pattern[i]}" at index ${i} of /${pattern}/`);
            }
          }
-          seq.push([`(${transform()[0]})`, false]);
+          seq.push([`(${toRule(transform())})`, false]);
        } else if (c === ')') {
          i += 1;
          if (start <= 0 || pattern[start - 1] !== '(') {
@ -228,7 +230,7 @@ export class SchemaConverter {
          seq.push(['|', false]);
          i += 1;
        } else if (c === '*' || c === '+' || c === '?') {
-          seq[seq.length - 1] = [`${seq[seq.length - 1][0]}${c}`, false];
+          seq[seq.length - 1] = [toRule(seq[seq.length - 1]) + c, false];
          i += 1;
        } else if (c === '{') {
          let curlyBrackets = c;
@ -278,33 +280,31 @@ export class SchemaConverter {
            seq[seq.length - 1] = [repeatedSub.concat(optionalSub).join(' '), false];
          }
        } else {
-          let lit = '';
+          let literal = '';
-          while (i < length && !NON_LITERAL_SET.has(pattern[i]) &&
+          while (i < length) {
                 !(i < length - 1 && ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS.has(pattern[i + 1]))) {
            if (pattern[i] === '\\' && i < length - 1) {
-              i += 1;
+              const next = pattern[i + 1];
-              if (NON_LITERAL_SET.has(pattern[i])) {
+              if (ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS.has(next)) {
-                // Escapes in regular expressions that aren't escaped in GBNF literals
+                i += 1;
-                lit += pattern[i];
+                literal += pattern[i];
                i += 1;
              } else {
-                lit += `\\${pattern[i]}`;
+                literal += pattern.slice(i, i + 2);
                i += 2;
              }
            } else if (pattern[i] === '"') {
              literal += '\\"';
              i += 1;
            } else if (!NON_LITERAL_SET.has(pattern[i]) &&
                (i === length - 1 || literal === '' || pattern[i + 1] === '.' || !NON_LITERAL_SET.has(pattern[i+1]))) {
              literal += pattern[i];
              i += 1;
            } else {
-              if (pattern[i] === '"') {
+              break;
                  lit += '\\';
              }
              lit += pattern[i];
              i += 1;
            }
          }
-          if (lit) {
+          if (literal !== '') {
-            seq.push([`"${lit}"`, true]);
+            seq.push([literal, true]);
          }
          if (i < length && !NON_LITERAL_SET.has(pattern[i])) {
            seq.push([`"${pattern[i]}"`, true]);
            i += 1;
          }
        }
      }
@ -312,7 +312,7 @@ export class SchemaConverter {
      return joinSeq();
    };
-    return this._addRule(name, transform()[0]);
+    return this._addRule(name, "\"\\\"\" " + toRule(transform()) + " \"\\\"\" space")
  }
  _resolveRef(ref) {
--- a/tests/test-json-schema-to-grammar.cpp
+++ b/tests/test-json-schema-to-grammar.cpp
@ -331,6 +331,45 @@ static void test_all(const string& lang, std::function<void(const TestCase&)> ru
    )"""
  });
  test({
    SUCCESS,
    "simple regexp",
    R"""({
      "type": "string",
      "pattern": "^abc?d*efg+(hij)?kl$"
    })""",
    R"""(
      root ::= "\"" "ab" "c"? "d"* "ef" "g"+ ("hij")? "kl" "\"" space
      space ::= " "?
    )"""
  });
  test({
    SUCCESS,
    "regexp escapes",
    R"""({
      "type": "string",
      "pattern": "^\\[\\]\\{\\}\\(\\)\\|\\+\\*\\?$"
    })""",
    R"""(
      root ::= "\"" "[]{}()|+*?" "\"" space
      space ::= " "?
    )"""
  });
  test({
    SUCCESS,
    "regexp quote",
    R"""({
      "type": "string",
      "pattern": "^\"$"
    })""",
    R"""(
      root ::= "\"" "\"" "\"" space
      space ::= " "?
    )"""
  });
  test({
    SUCCESS,
    "regexp",
@ -340,7 +379,7 @@ static void test_all(const string& lang, std::function<void(const TestCase&)> ru
    })""",
    R"""(
      dot ::= [\U00000000-\x09\x0B\x0C\x0E-\U0010FFFF]
-      root ::= ("(" root-1 root-1? root-1? ")")? root-1 root-1 root-1 "-" root-1 root-1 root-1 root-1 " and" dot dot dot
+      root ::= "\"" ("(" root-1 root-1? root-1? ")")? root-1 root-1 root-1 "-" root-1 root-1 root-1 root-1 " and" dot dot dot "\"" space
      root-1 ::= [0-9]
      space ::= " "?
    )"""