From 4e6375606ded5493fee9797ac1e39d352aee4da7 Mon Sep 17 00:00:00 2001 From: ochafik Date: Tue, 11 Jun 2024 04:21:06 +0100 Subject: [PATCH] fix not_strings & port to js+py --- common/json-schema-to-grammar.cpp | 125 +++++++++--------- examples/json_schema_to_grammar.py | 90 +++++++------ .../server/public/json-schema-to-grammar.mjs | 64 ++++++++- tests/test-json-schema-to-grammar.cpp | 12 +- 4 files changed, 182 insertions(+), 109 deletions(-) diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index fe41c7295..dfcbd58ed 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -160,64 +160,6 @@ static std::string format_literal(const std::string & literal) { return "\"" + escaped + "\""; } -/* - Returns a rule that matches a JSON string that is none of the provided strings - - not_strings({"and", "also"}) - -> ["] ( [a] ([l] ([s] ([^"o]) | [^"s]) | [n] ([^"d]) | [^"ln]) | [^"a] ) char* ["] -*/ -std::string not_strings(const std::vector & strings) { - - struct TrieNode { - std::map children; - bool is_end_of_string; - - void insert(const std::string & string) { - auto node = this; - for (char c : string) { - node = &node->children[c]; - } - node->is_end_of_string = true; - } - }; - - TrieNode trie; - for (const auto & s : strings) { - trie.insert(s); - } - - std::ostringstream out; - out << "[\"] ( "; - std::function visit = [&](const TrieNode & node) { - std::ostringstream rejects; - auto first = true; - for (const auto & kv : node.children) { - rejects << kv.first; - if (kv.second.is_end_of_string) { - continue; - } - if (first) { - first = false; - } else { - out << " | "; - } - out << "[" << kv.first << "] ("; - visit(kv.second); - out << ")"; - } - if (!node.children.empty()) { - if (!first) { - out << " | "; - } - out << "[^\"" << rejects.str() << "]"; - } - }; - visit(trie); - - out << " ) char* [\"] space"; - return out.str(); -} - class SchemaConverter { private: std::function _fetch_json; @@ -445,6 +387,67 @@ private: return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space"); } + /* + Returns a rule that matches a JSON string that is none of the provided strings + + not_strings({"and", "also"}) + -> ["] ( [a] ([l] ([s] ([^"o]) | [^"s]) | [n] ([^"d]) | [^"ln]) | [^"a] ) char* ["] + */ + std::string _not_strings(const std::vector & strings) { + + struct TrieNode { + std::map children; + bool is_end_of_string; + + void insert(const std::string & string) { + auto node = this; + for (char c : string) { + node = &node->children[c]; + } + node->is_end_of_string = true; + } + }; + + TrieNode trie; + for (const auto & s : strings) { + trie.insert(s); + } + + std::string char_rule = _add_primitive("char", PRIMITIVE_RULES.at("char")); + std::ostringstream out; + out << "[\"] ( "; + std::function visit = [&](const TrieNode & node) { + std::ostringstream rejects; + auto first = true; + for (const auto & kv : node.children) { + rejects << kv.first; + if (first) { + first = false; + } else { + out << " | "; + } + out << "[" << kv.first << "]"; + if (kv.second.is_end_of_string) { + out << " " << char_rule << "+"; + } else { + out << " ("; + visit(kv.second); + out << ")"; + } + } + if (!node.children.empty()) { + if (!first) { + out << " | "; + } + out << "[^\"" << rejects.str() << "] " << char_rule << "*"; + } + }; + visit(trie); + + out << " )? [\"] space"; + return out.str(); + } + std::string _resolve_ref(const std::string & ref) { std::string ref_name = ref.substr(ref.find_last_of('/') + 1); if (_rules.find(ref_name) == _rules.end() && _refs_being_resolved.find(ref) == _refs_being_resolved.end()) { @@ -484,11 +487,13 @@ private: } if (additional_properties.is_object() || (additional_properties.is_boolean() && additional_properties.get())) { std::string sub_name = name + (name.empty() ? "" : "-") + "additional"; - std::string value_rule = visit(additional_properties.is_object() ? additional_properties : json::object(), sub_name + "-value"); + std::string value_rule = + additional_properties.is_object() ? visit(additional_properties, sub_name + "-value") + : _add_primitive("value", PRIMITIVE_RULES.at("value")); auto key_rule = prop_names.empty() ? _add_primitive("string", PRIMITIVE_RULES.at("string")) - : _add_rule(sub_name + "-k", not_strings(prop_names)); + : _add_rule(sub_name + "-k", _not_strings(prop_names)); std::string kv_rule = _add_rule(sub_name + "-kv", key_rule + " \":\" space " + value_rule); prop_kv_rule_names["*"] = kv_rule; optional_props.push_back("*"); diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py index 252d63fd2..cb255a6d0 100755 --- a/examples/json_schema_to_grammar.py +++ b/examples/json_schema_to_grammar.py @@ -71,47 +71,6 @@ NON_LITERAL_SET = set('|.()[]{}*+?') ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('[]()|{}*+?') -def not_strings(strings): - class TrieNode: - def __init__(self): - self.children = {} - self.is_end_of_string = False - - def insert(self, string): - node = self - for c in string: - node = node.children.setdefault(c, TrieNode()) - node.is_end_of_string = True - - trie = TrieNode() - for s in strings: - trie.insert(s) - - out = ['["] ( '] - - def visit(node): - rejects = [] - first = True - for c, child in node.children.items(): - rejects.append(c) - if child.is_end_of_string: - continue - if first: - first = False - else: - out.append(' | ') - out.append(f'[{c}] (') - visit(child) - out.append(')') - if node.children: - if not first: - out.append(' | ') - out.append(f'[^"{"".join(rejects)}]') - visit(trie) - - out.append(' ) char* ["] space') - return ''.join(out) - class SchemaConverter: def __init__(self, *, prop_order, allow_fetch, dotall, raw_pattern): self._prop_order = prop_order @@ -153,6 +112,51 @@ class SchemaConverter: return ''.join(('(', *recurse(0), ')')) + def _not_strings(self, strings): + class TrieNode: + def __init__(self): + self.children = {} + self.is_end_of_string = False + + def insert(self, string): + node = self + for c in string: + node = node.children.setdefault(c, TrieNode()) + node.is_end_of_string = True + + trie = TrieNode() + for s in strings: + trie.insert(s) + + char_rule = self._add_primitive('char', PRIMITIVE_RULES['char']) + out = ['["] ( '] + + def visit(node): + rejects = [] + first = True + for c in sorted(node.children.keys()): + child = node.children[c] + rejects.append(c) + if first: + first = False + else: + out.append(' | ') + out.append(f'[{c}]') + if (child.is_end_of_string): + out.append(f' {char_rule}+') + else: + out.append(f' (') + visit(child) + out.append(')') + if node.children: + if not first: + out.append(' | ') + out.append(f'[^"{"".join(rejects)}] {char_rule}*') + visit(trie) + + out.append(f' ){"" if trie.is_end_of_string else "?"} ["] space') + return ''.join(out) + def _add_rule(self, name, rule): esc_name = INVALID_RULE_CHARS_RE.sub('-', name) if esc_name not in self._rules or self._rules[esc_name] == rule: @@ -513,8 +517,8 @@ class SchemaConverter: sub_name = f'{name}{"-" if name else ""}additional' value_rule = self.visit({} if additional_properties == True else additional_properties, f'{sub_name}-value') key_rule = self._add_primitive('string', PRIMITIVE_RULES['string']) if not sorted_props \ - else self._add_rule(f'{sub_name}-k', not_strings(sorted_props)) - + else self._add_rule(f'{sub_name}-k', self._not_strings(sorted_props)) + prop_kv_rule_names["*"] = self._add_rule( f'{sub_name}-kv', f'{key_rule} ":" space {value_rule}' diff --git a/examples/server/public/json-schema-to-grammar.mjs b/examples/server/public/json-schema-to-grammar.mjs index ff763a1e0..96e4daae2 100644 --- a/examples/server/public/json-schema-to-grammar.mjs +++ b/examples/server/public/json-schema-to-grammar.mjs @@ -337,6 +337,63 @@ export class SchemaConverter { return this._addRule(name, "\"\\\"\" " + toRule(transform()) + " \"\\\"\" space") } + _notStrings(strings) { + class TrieNode { + constructor() { + this.children = {}; + this.isEndOfString = false; + } + + insert(str) { + let node = this; + for (const c of str) { + node = node.children[c] = node.children[c] || new TrieNode(); + } + node.isEndOfString = true; + } + } + + const trie = new TrieNode(); + for (const s of strings) { + trie.insert(s); + } + + const charRuleName = this._addPrimitive('char', PRIMITIVE_RULES['char']); + const out = ['["] ( ']; + + const visit = (node) => { + const rejects = []; + let first = true; + for (const c of Object.keys(node.children).sort()) { + const child = node.children[c]; + rejects.push(c); + if (!first) { + out.push(' | '); + } + out.push(`[${c}]`); + if (child.isEndOfString) { + out.push(` ${charRuleName}+`); + } else { + out.push(' ('); + visit(child); + out.push(')'); + } + first = false; + } + if (Object.keys(node.children).length > 0) { + if (!first) { + out.push(' | '); + } + out.push(`[^"${rejects.join('')}] ${charRuleName}*`); + } + }; + + visit(trie); + + out.push(` )${trie.isEndOfString ? '' : '?'} ["] space`); + return out.join(''); + } + _resolveRef(ref) { let refName = ref.split('/').pop(); if (!(refName in this._rules) && !this._refsBeingResolved.has(ref)) { @@ -487,9 +544,14 @@ export class SchemaConverter { if (typeof additionalProperties === 'object' || additionalProperties === true) { const subName = `${name ?? ''}${name ? '-' : ''}additional`; const valueRule = this.visit(additionalProperties === true ? {} : additionalProperties, `${subName}-value`); + + const key_rule = + sortedProps.length === 0 ? this._addPrimitive('string', PRIMITIVE_RULES['string']) + : this._addRule(`${subName}-k`, this._notStrings(sortedProps)); + propKvRuleNames['*'] = this._addRule( `${subName}-kv`, - `${this._addPrimitive('string', PRIMITIVE_RULES['string'])} ":" space ${valueRule}`); + `${key_rule} ":" space ${valueRule}`); optionalProps.push('*'); } diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp index 2da6b88e9..20a722768 100755 --- a/tests/test-json-schema-to-grammar.cpp +++ b/tests/test-json-schema-to-grammar.cpp @@ -634,7 +634,7 @@ static void test_all(const std::string & lang, std::function