json: fix string patterns (was missing quotes)

2024-03-18 04:06:23 +00:00 · 2024-03-18 04:06:23 +00:00 · 24f0b941cf
commit 24f0b941cf
parent dd922a4da3
7 changed files with 1579 additions and 1500 deletions
--- a/examples/json-schema-to-grammar.py
+++ b/examples/json-schema-to-grammar.py
@@ -44,16 +44,17 @@ GRAMMAR_RANGE_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"\]\-\\]')
 GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]'}

 NON_LITERAL_SET = set('|.()[]{}*+?')
-ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('{*+?')
+ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('[]()|{}*+?')

 DATE_PATTERN = '[0-9]{4}-(0[1-9]|1[0-2])-([0-2][0-9]|3[0-1])'
 TIME_PATTERN = '([01][0-9]|2[0-3])(:[0-5][0-9]){2}(\\.[0-9]{1,3})?(Z|[+-](([01][0-9]|2[0-3]):[0-5][0-9]))' # Cap millisecond precision w/ 3 digits

 class SchemaConverter:
-    def __init__(self, *, prop_order, allow_fetch, dotall):
+    def __init__(self, *, prop_order, allow_fetch, dotall, raw_pattern):
        self._prop_order = prop_order
        self._allow_fetch = allow_fetch
        self._dotall = dotall
+        self._raw_pattern = raw_pattern
        self._rules = {'space': SPACE_RULE}
        self._refs = {}
        self._refs_being_resolved = set()
@@ -152,6 +153,10 @@ class SchemaConverter:
        i = 0
        length = len(pattern)

+        def to_rule(s: Tuple[str, bool]) -> str:
+            (txt, is_literal) = s
+            return "\"" + txt + "\"" if is_literal else txt
+        
        def transform() -> Tuple[str, bool]:
            '''
                Parse a unit at index i (advancing it), and return its string representation + whether it's a literal.
@@ -180,13 +185,12 @@ class SchemaConverter:
                ret = []
                for is_literal, g in itertools.groupby(seq, lambda x: x[1]):
                    if is_literal:
-                        lit = ''.join(x[0][1:-1] for x in g)
-                        ret.append((f'"{lit}"', True))
+                        ret.append((''.join(x[0] for x in g), True))
                    else:
                        ret.extend(g)
                if len(ret) == 1:
                    return ret[0]
-                return (' '.join(x[0] for x in seq), False)
+                return (' '.join(to_rule(x) for x in seq), False)

            while i < length:
                c = pattern[i]
@@ -197,7 +201,7 @@ class SchemaConverter:
                    i += 1
                    if i < length:
                        assert pattern[i] != '?', f'Unsupported pattern syntax "{pattern[i]}" at index {i} of /{pattern}/'
-                    seq.append((f'({transform()[0]})', False))
+                    seq.append((f'({to_rule(transform())})', False))
                elif c == ')':
                    i += 1
                    assert start > 0 and pattern[start-1] == '(', f'Unbalanced parentheses; start = {start}, i = {i}, pattern = {pattern}'
@@ -220,7 +224,7 @@ class SchemaConverter:
                    seq.append(('|', False))
                    i += 1
                elif c in ('*', '+', '?'):
-                    seq[-1] = (f'{seq[-1][0]}{c}', False)
+                    seq[-1] = (to_rule(seq[-1]) + c, False)
                    i += 1
                elif c == '{':
                    curly_brackets = c
@@ -232,13 +236,18 @@ class SchemaConverter:
                    curly_brackets += '}'
                    i += 1
                    nums = [s.strip() for s in curly_brackets[1:-1].split(',')]
-                    if len(nums) == 1:
-                        min_times = int(nums[0])
-                        max_times = min_times
-                    else:
-                        assert len(nums) == 2
-                        min_times = int(nums[0]) if nums[0] else 0
-                        max_times = int(nums[1]) if nums[1] else None
+                    min_times = 0
+                    max_times = None
+                    try:
+                        if len(nums) == 1:
+                            min_times = int(nums[0])
+                            max_times = min_times
+                        else:
+                            assert len(nums) == 2
+                            min_times = int(nums[0]) if nums[0] else 0
+                            max_times = int(nums[1]) if nums[1] else None
+                    except ValueError:
+                        raise ValueError(f'Invalid quantifier {curly_brackets} in /{pattern}/')

                    (sub, sub_is_literal) = seq[-1]

@@ -263,32 +272,35 @@ class SchemaConverter:
                            False
                        )
                else:
-                    lit = ''
-                    while i < length and pattern[i] not in NON_LITERAL_SET \
-                            and not (i < length - 1 and pattern[i+1] in ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS):
+                    literal = ''
+                    while i < length:
                        if pattern[i] == '\\' and i < length - 1:
-                            i += 1
-                            if pattern[i] in NON_LITERAL_SET:
-                                # Escapes in regular expressions that aren't escaped in GBNF literals
-                                lit += pattern[i]
+                            next = pattern[i + 1]
+                            if next in ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS:
+                                i += 1
+                                literal += pattern[i]
+                                i += 1
                            else:
-                                lit += f'\\{pattern[i]}'
+                                literal += pattern[i:i+2]
+                                i += 2
+                        elif pattern[i] == '"' and not self._raw_pattern:
+                            literal += '\\"'
+                            i += 1
+                        elif pattern[i] not in NON_LITERAL_SET and \
+                                (i == length - 1 or literal == '' or pattern[i+1] == '.' or pattern[i+1] not in NON_LITERAL_SET):
+                            literal += pattern[i]
                            i += 1
                        else:
-                            if pattern[i] == '"':
-                                lit += '\\'
-                            lit += pattern[i]
-                            i += 1
-                    if lit:
-                        seq.append((f'"{lit}"', True))
-
-                    if i < length and pattern[i] not in ('.', '(', ')', '|', '[', '{', '*', '+', '?'):
-                        seq.append((f'"{pattern[i]}"', True))
-                        i += 1
+                            break
+                    if literal:
+                        seq.append((literal, True))

            return join_seq()

-        return self._add_rule(name, transform()[0])
+        return self._add_rule(
+            name,
+            to_rule(transform()) if self._raw_pattern \
+                else "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space")


    def _resolve_ref(self, ref):
@@ -510,6 +522,11 @@ def main(args_in = None):
        action='store_true',
        default=False,
        help='Whether to treat dot (".") as matching all chars including line breaks in regular expression patterns')
+    parser.add_argument(
+        '--raw-pattern',
+        action='store_true',
+        default=False,
+        help='Treats string patterns as raw patterns w/o quotes (or quote escapes)')

    parser.add_argument('schema', help='file containing JSON schema ("-" for stdin)')
    args = parser.parse_args(args_in)
@@ -528,7 +545,8 @@ def main(args_in = None):
    converter = SchemaConverter(
        prop_order={name: idx for idx, name in enumerate(args.prop_order)},
        allow_fetch=args.allow_fetch,
-        dotall=args.dotall)
+        dotall=args.dotall,
+        raw_pattern=args.raw_pattern)
    schema = converter.resolve_refs(schema, url)
    converter.visit(schema, '')
    print(converter.format_grammar())
--- a/examples/regex-to-grammar.py
+++ b/examples/regex-to-grammar.py
@@ -11,6 +11,7 @@ print(subprocess.check_output(
      "json-schema-to-grammar.py"),
    *rest,
    "-",
+    "--raw-pattern",
  ],
  text=True,
  input=json.dumps({
--- a/examples/server/json-schema-to-grammar.cpp
+++ b/examples/server/json-schema-to-grammar.cpp
@@ -62,7 +62,7 @@ unordered_map<char, string> GRAMMAR_LITERAL_ESCAPES = {
 };

 unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
-unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'{', '*', '+', '?'};
+unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};

 template <typename Iterator>
 string join(Iterator begin, Iterator end, const string& separator) {
@@ -186,9 +186,15 @@ private:
        size_t i = 0;
        size_t length = sub_pattern.length();

-        std::function<pair<string, bool>()> transform = [&]() -> pair<string, bool> {
+        using literal_or_rule = pair<string, bool>;
+        auto to_rule = [&](const literal_or_rule& ls) {
+            auto is_literal = ls.second;
+            auto s = ls.first;
+            return is_literal ? "\"" + s + "\"" : s;
+        };
+        std::function<literal_or_rule()> transform = [&]() -> literal_or_rule {
            size_t start = i;
-            vector<pair<string, bool>> seq;
+            vector<literal_or_rule> seq;

            auto get_dot = [&]() {
                string rule;
@@ -202,28 +208,32 @@ private:

            // Joins the sequence, merging consecutive literals together.
            auto join_seq = [&]() {
-                vector<string> results;
+                vector<literal_or_rule> ret;

                string literal;
                auto flush_literal = [&]() {
                  if (literal.empty()) {
                    return false;
                  }
-                  results.push_back("\"" + literal + "\"");
+                  ret.push_back(make_pair(literal, true));
                  literal.clear();
                  return true;
                };

                for (const auto& item : seq) {
-                    if (item.second) {
-                      literal += item.first.substr(1, item.first.length() - 2);
+                    auto is_literal = item.second;
+                    if (is_literal) {
+                      literal += item.first;
                    } else {
                      flush_literal();
-                      results.push_back(item.first);
+                      ret.push_back(item);
                    }
                }
-                if (flush_literal() && results.size() == 1) {
-                    return make_pair(results[0], true);
+                flush_literal();
+
+                vector<string> results;
+                for (const auto& item : ret) {
+                    results.push_back(to_rule(item));
                }
                return make_pair(join(results.begin(), results.end(), " "), false);
            };
@@ -240,8 +250,7 @@ private:
                            _warnings.push_back("Unsupported pattern syntax");
                        }
                    }
-                    auto sub_result = transform();
-                    seq.push_back(make_pair("(" + sub_result.first + ")", false));
+                    seq.push_back(make_pair("(" + to_rule(transform()) + ")", false));
                } else if (c == ')') {
                    i++;
                    if (start > 0 && sub_pattern[start - 1] != '(') {
@@ -270,7 +279,7 @@ private:
                    seq.push_back(make_pair("|", false));
                    i++;
                } else if (c == '*' || c == '+' || c == '?') {
-                    seq.back().first += c;
+                    seq.back() = make_pair(to_rule(seq.back()) + c, false);
                    i++;
                } else if (c == '{') {
                    string curly_brackets = string(1, c);
@@ -287,17 +296,22 @@ private:
                    auto nums = split(curly_brackets.substr(1, curly_brackets.length() - 2), ",");
                    int min_times = 0;
                    int max_times = numeric_limits<int>::max();
-                    if (nums.size() == 1) {
-                        min_times = max_times = stoi(nums[0]);
-                    } else if (nums.size() != 2) {
-                        _errors.push_back("Wrong number of values in curly brackets");
-                    } else {
-                        if (!nums[0].empty()) {
-                            min_times = stoi(nums[0]);
-                        }
-                        if (!nums[1].empty()) {
-                            max_times = stoi(nums[1]);
+                    try {
+                        if (nums.size() == 1) {
+                            min_times = max_times = std::stoi(nums[0]);
+                        } else if (nums.size() != 2) {
+                            _errors.push_back("Wrong number of values in curly brackets");
+                        } else {
+                            if (!nums[0].empty()) {
+                                min_times = std::stoi(nums[0]);
+                            }
+                            if (!nums[1].empty()) {
+                                max_times = std::stoi(nums[1]);
+                            }
                        }
+                    } catch (const std::invalid_argument& e) {
+                        _errors.push_back("Invalid number in curly brackets");
+                        return make_pair("", false);
                    }
                    auto &last = seq.back();
                    auto &sub = last.first;
@@ -346,36 +360,39 @@ private:
                    }
                } else {
                    string literal;
-                    while (i < length && NON_LITERAL_SET.find(sub_pattern[i]) == NON_LITERAL_SET.end() &&
-                          (i == length - 1 || ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS.find(sub_pattern[i + 1]) == ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS.end())) {
+                    auto is_non_literal = [&](char c) {
+                        return NON_LITERAL_SET.find(c) != NON_LITERAL_SET.end();
+                    };
+                    while (i < length) {
                        if (sub_pattern[i] == '\\' && i < length - 1) {
-                            i++;
-                            if (NON_LITERAL_SET.find(sub_pattern[i]) != NON_LITERAL_SET.end()) {
+                            char next = sub_pattern[i + 1];
+                            if (ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS.find(next) != ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS.end()) {
+                                i++;
                                literal += sub_pattern[i];
+                                i++;
                            } else {
-                                literal += "\\" + string(1, sub_pattern[i]);
+                                literal += sub_pattern.substr(i, 2);
+                                i += 2;
                            }
+                        } else if (sub_pattern[i] == '"') {
+                            literal += "\\\"";
                            i++;
-                        } else {
-                            if (sub_pattern[i] == '"') {
-                                literal += "\\";
-                            }
+                        } else if (!is_non_literal(sub_pattern[i]) &&
+                                (i == length - 1 || literal.empty() || sub_pattern[i + 1] == '.' || !is_non_literal(sub_pattern[i + 1]))) {
                            literal += sub_pattern[i];
                            i++;
+                        } else {
+                            break;
                        }
                    }
                    if (!literal.empty()) {
-                        seq.push_back(make_pair("\"" + literal + "\"", true));
-                    }
-                    if (i < length && NON_LITERAL_SET.find(sub_pattern[i]) == NON_LITERAL_SET.end()) {
-                        seq.push_back(make_pair("\"" + string(1, sub_pattern[i]) + "\"", true));
-                        i++;
+                        seq.push_back(make_pair(literal, true));
                    }
                }
            }
            return join_seq();
        };
-        return _add_rule(name, transform().first);
+        return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space");
    }

    string _resolve_ref(const string& ref) {
--- a/examples/server/json-schema-to-grammar.mjs.hpp
+++ b/examples/server/json-schema-to-grammar.mjs.hpp
--- a/examples/server/public/index.js
+++ b/examples/server/public/index.js
--- a/examples/server/public/json-schema-to-grammar.mjs
+++ b/examples/server/public/json-schema-to-grammar.mjs
@@ -35,7 +35,7 @@ const GRAMMAR_RANGE_LITERAL_ESCAPE_RE = /[\n\r"\]\-\\]/g;
 const GRAMMAR_LITERAL_ESCAPES = { '\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]' };

 const NON_LITERAL_SET = new Set('|.()[]{}*+?');
-const ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = new Set('{*+?');
+const ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = new Set('[]()|{}*+?');

 export class SchemaConverter {
  constructor(options) {
@@ -163,6 +163,9 @@ export class SchemaConverter {
      return this._addRule('dot', rule);
    };

+
+    const toRule = ([s, isLiteral]) => isLiteral ? "\"" + s + "\"" : s;
+
    const transform = () => {
      const start = i;
      // For each component of this sequence, store its string representation and whether it's a literal.
@@ -175,8 +178,7 @@ export class SchemaConverter {
        const ret = [];
        for (const [isLiteral, g] of groupBy(seq, x => x[1])) {
          if (isLiteral) {
-            const lit = [...g].map(x => x[0].slice(1, -1)).join('');
-            ret.push([`"${lit}"`, true]);
+            ret.push([[...g].map(x => x[0]).join(''), true]);
          } else {
            ret.push(...g);
          }
@@ -184,7 +186,7 @@ export class SchemaConverter {
        if (ret.length === 1) {
          return ret[0];
        }
-        return [ret.map(x => x[0]).join(' '), false];
+        return [ret.map(x => toRule(x)).join(' '), false];
      };

      while (i < length) {
@@ -199,7 +201,7 @@ export class SchemaConverter {
              throw new Error(`Unsupported pattern syntax "${pattern[i]}" at index ${i} of /${pattern}/`);
            }
          }
-          seq.push([`(${transform()[0]})`, false]);
+          seq.push([`(${toRule(transform())})`, false]);
        } else if (c === ')') {
          i += 1;
          if (start <= 0 || pattern[start - 1] !== '(') {
@@ -228,7 +230,7 @@ export class SchemaConverter {
          seq.push(['|', false]);
          i += 1;
        } else if (c === '*' || c === '+' || c === '?') {
-          seq[seq.length - 1] = [`${seq[seq.length - 1][0]}${c}`, false];
+          seq[seq.length - 1] = [toRule(seq[seq.length - 1]) + c, false];
          i += 1;
        } else if (c === '{') {
          let curlyBrackets = c;
@@ -278,33 +280,31 @@ export class SchemaConverter {
            seq[seq.length - 1] = [repeatedSub.concat(optionalSub).join(' '), false];
          }
        } else {
-          let lit = '';
-          while (i < length && !NON_LITERAL_SET.has(pattern[i]) &&
-                 !(i < length - 1 && ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS.has(pattern[i + 1]))) {
+          let literal = '';
+          while (i < length) {
            if (pattern[i] === '\\' && i < length - 1) {
-              i += 1;
-              if (NON_LITERAL_SET.has(pattern[i])) {
-                // Escapes in regular expressions that aren't escaped in GBNF literals
-                lit += pattern[i];
+              const next = pattern[i + 1];
+              if (ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS.has(next)) {
+                i += 1;
+                literal += pattern[i];
+                i += 1;
              } else {
-                lit += `\\${pattern[i]}`;
+                literal += pattern.slice(i, i + 2);
+                i += 2;
              }
+            } else if (pattern[i] === '"') {
+              literal += '\\"';
+              i += 1;
+            } else if (!NON_LITERAL_SET.has(pattern[i]) &&
+                (i === length - 1 || literal === '' || pattern[i + 1] === '.' || !NON_LITERAL_SET.has(pattern[i+1]))) {
+              literal += pattern[i];
              i += 1;
            } else {
-              if (pattern[i] === '"') {
-                  lit += '\\';
-              }
-              lit += pattern[i];
-              i += 1;
+              break;
            }
          }
-          if (lit) {
-            seq.push([`"${lit}"`, true]);
-          }
-
-          if (i < length && !NON_LITERAL_SET.has(pattern[i])) {
-            seq.push([`"${pattern[i]}"`, true]);
-            i += 1;
+          if (literal !== '') {
+            seq.push([literal, true]);
          }
        }
      }
@@ -312,7 +312,7 @@ export class SchemaConverter {
      return joinSeq();
    };

-    return this._addRule(name, transform()[0]);
+    return this._addRule(name, "\"\\\"\" " + toRule(transform()) + " \"\\\"\" space")
  }

  _resolveRef(ref) {
--- a/tests/test-json-schema-to-grammar.cpp
+++ b/tests/test-json-schema-to-grammar.cpp
@@ -331,6 +331,45 @@ static void test_all(const string& lang, std::function<void(const TestCase&)> ru
    )"""
  });

+  test({
+    SUCCESS,
+    "simple regexp",
+    R"""({
+      "type": "string",
+      "pattern": "^abc?d*efg+(hij)?kl$"
+    })""",
+    R"""(
+      root ::= "\"" "ab" "c"? "d"* "ef" "g"+ ("hij")? "kl" "\"" space
+      space ::= " "?
+    )"""
+  });
+
+  test({
+    SUCCESS,
+    "regexp escapes",
+    R"""({
+      "type": "string",
+      "pattern": "^\\[\\]\\{\\}\\(\\)\\|\\+\\*\\?$"
+    })""",
+    R"""(
+      root ::= "\"" "[]{}()|+*?" "\"" space
+      space ::= " "?
+    )"""
+  });
+
+  test({
+    SUCCESS,
+    "regexp quote",
+    R"""({
+      "type": "string",
+      "pattern": "^\"$"
+    })""",
+    R"""(
+      root ::= "\"" "\"" "\"" space
+      space ::= " "?
+    )"""
+  });
+
  test({
    SUCCESS,
    "regexp",
@@ -340,7 +379,7 @@ static void test_all(const string& lang, std::function<void(const TestCase&)> ru
    })""",
    R"""(
      dot ::= [\U00000000-\x09\x0B\x0C\x0E-\U0010FFFF]
-      root ::= ("(" root-1 root-1? root-1? ")")? root-1 root-1 root-1 "-" root-1 root-1 root-1 root-1 " and" dot dot dot
+      root ::= "\"" ("(" root-1 root-1? root-1? ")")? root-1 root-1 root-1 "-" root-1 root-1 root-1 root-1 " and" dot dot dot "\"" space
      root-1 ::= [0-9]
      space ::= " "?
    )"""