json: fix string patterns (was missing quotes)

ochafik 2024-03-18 04:06:23 +00:00
parent dd922a4da3
commit 24f0b941cf
7 changed files with 1579 additions and 1500 deletions


@@ -44,16 +44,17 @@ GRAMMAR_RANGE_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"\]\-\\]')
 GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]'}
 NON_LITERAL_SET = set('|.()[]{}*+?')
-ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('{*+?')
+ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('[]()|{}*+?')
 DATE_PATTERN = '[0-9]{4}-(0[1-9]|1[0-2])-([0-2][0-9]|3[0-1])'
 TIME_PATTERN = '([01][0-9]|2[0-3])(:[0-5][0-9]){2}(\\.[0-9]{1,3})?(Z|[+-](([01][0-9]|2[0-3]):[0-5][0-9]))' # Cap millisecond precision w/ 3 digits

 class SchemaConverter:
-    def __init__(self, *, prop_order, allow_fetch, dotall):
+    def __init__(self, *, prop_order, allow_fetch, dotall, raw_pattern):
         self._prop_order = prop_order
         self._allow_fetch = allow_fetch
         self._dotall = dotall
+        self._raw_pattern = raw_pattern
         self._rules = {'space': SPACE_RULE}
         self._refs = {}
         self._refs_being_resolved = set()
@@ -152,6 +153,10 @@ class SchemaConverter:
         i = 0
         length = len(pattern)

+        def to_rule(s: Tuple[str, bool]) -> str:
+            (txt, is_literal) = s
+            return "\"" + txt + "\"" if is_literal else txt
+
         def transform() -> Tuple[str, bool]:
             '''
                 Parse a unit at index i (advancing it), and return its string representation + whether it's a literal.
@@ -180,13 +185,12 @@ class SchemaConverter:
                 ret = []
                 for is_literal, g in itertools.groupby(seq, lambda x: x[1]):
                     if is_literal:
-                        lit = ''.join(x[0][1:-1] for x in g)
-                        ret.append((f'"{lit}"', True))
+                        ret.append((''.join(x[0] for x in g), True))
                     else:
                         ret.extend(g)
                 if len(ret) == 1:
                     return ret[0]
-                return (' '.join(x[0] for x in seq), False)
+                return (' '.join(to_rule(x) for x in seq), False)

             while i < length:
                 c = pattern[i]
@@ -197,7 +201,7 @@ class SchemaConverter:
                     i += 1
                     if i < length:
                         assert pattern[i] != '?', f'Unsupported pattern syntax "{pattern[i]}" at index {i} of /{pattern}/'
-                    seq.append((f'({transform()[0]})', False))
+                    seq.append((f'({to_rule(transform())})', False))
                 elif c == ')':
                     i += 1
                     assert start > 0 and pattern[start-1] == '(', f'Unbalanced parentheses; start = {start}, i = {i}, pattern = {pattern}'
@@ -220,7 +224,7 @@ class SchemaConverter:
                     seq.append(('|', False))
                     i += 1
                 elif c in ('*', '+', '?'):
-                    seq[-1] = (f'{seq[-1][0]}{c}', False)
+                    seq[-1] = (to_rule(seq[-1]) + c, False)
                     i += 1
                 elif c == '{':
                     curly_brackets = c
@@ -232,6 +236,9 @@ class SchemaConverter:
                     curly_brackets += '}'
                     i += 1
                     nums = [s.strip() for s in curly_brackets[1:-1].split(',')]
+                    min_times = 0
+                    max_times = None
+                    try:
                     if len(nums) == 1:
                         min_times = int(nums[0])
                         max_times = min_times
@@ -239,6 +246,8 @@ class SchemaConverter:
                         assert len(nums) == 2
                         min_times = int(nums[0]) if nums[0] else 0
                         max_times = int(nums[1]) if nums[1] else None
+                    except ValueError:
+                        raise ValueError(f'Invalid quantifier {curly_brackets} in /{pattern}/')

                     (sub, sub_is_literal) = seq[-1]
@@ -263,32 +272,35 @@ class SchemaConverter:
                         False
                     )
                 else:
-                    lit = ''
-                    while i < length and pattern[i] not in NON_LITERAL_SET \
-                        and not (i < length - 1 and pattern[i+1] in ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS):
-                        if pattern[i] == '\\' and i < length - 1:
-                            i += 1
-                            if pattern[i] in NON_LITERAL_SET:
-                                # Escapes in regular expressions that aren't escaped in GBNF literals
-                                lit += pattern[i]
-                            else:
-                                lit += f'\\{pattern[i]}'
-                            i += 1
-                        else:
-                            if pattern[i] == '"':
-                                lit += '\\'
-                            lit += pattern[i]
-                            i += 1
-                    if lit:
-                        seq.append((f'"{lit}"', True))
-                    if i < length and pattern[i] not in ('.', '(', ')', '|', '[', '{', '*', '+', '?'):
-                        seq.append((f'"{pattern[i]}"', True))
-                        i += 1
+                    literal = ''
+                    while i < length:
+                        if pattern[i] == '\\' and i < length - 1:
+                            next = pattern[i + 1]
+                            if next in ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS:
+                                i += 1
+                                literal += pattern[i]
+                                i += 1
+                            else:
+                                literal += pattern[i:i+2]
+                                i += 2
+                        elif pattern[i] == '"' and not self._raw_pattern:
+                            literal += '\\"'
+                            i += 1
+                        elif pattern[i] not in NON_LITERAL_SET and \
+                            (i == length - 1 or literal == '' or pattern[i+1] == '.' or pattern[i+1] not in NON_LITERAL_SET):
+                            literal += pattern[i]
+                            i += 1
+                        else:
+                            break
+                    if literal:
+                        seq.append((literal, True))

             return join_seq()

-        return self._add_rule(name, transform()[0])
+        return self._add_rule(
+            name,
+            to_rule(transform()) if self._raw_pattern \
+                else "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space")

     def _resolve_ref(self, ref):
@@ -510,6 +522,11 @@ def main(args_in = None):
         action='store_true',
         default=False,
         help='Whether to treat dot (".") as matching all chars including line breaks in regular expression patterns')
+    parser.add_argument(
+        '--raw-pattern',
+        action='store_true',
+        default=False,
+        help='Treats string patterns as raw patterns w/o quotes (or quote escapes)')
     parser.add_argument('schema', help='file containing JSON schema ("-" for stdin)')
     args = parser.parse_args(args_in)
@@ -528,7 +545,8 @@ def main(args_in = None):
     converter = SchemaConverter(
         prop_order={name: idx for idx, name in enumerate(args.prop_order)},
         allow_fetch=args.allow_fetch,
-        dotall=args.dotall)
+        dotall=args.dotall,
+        raw_pattern=args.raw_pattern)
     schema = converter.resolve_refs(schema, url)
     converter.visit(schema, '')
     print(converter.format_grammar())
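
Aside (not part of the commit): a minimal sketch of how the converter above might be driven after this change. The import path is an assumption (the diff does not show file names, and the script may need to be loaded by path rather than imported); the constructor arguments and calls mirror main() in the hunk above, and the expected output shape matches the new test cases added further down.

# Sketch only: the module name is assumed, not taken from the diff.
from json_schema_to_grammar import SchemaConverter

schema = {"type": "string", "pattern": "^abc?d*efg+(hij)?kl$"}

converter = SchemaConverter(prop_order={}, allow_fetch=False, dotall=False, raw_pattern=False)
converter.visit(schema, '')
print(converter.format_grammar())
# With raw_pattern=False the pattern rule is wrapped in escaped quotes, as in the new tests:
#   root ::= "\"" "ab" "c"? "d"* "ef" "g"+ ("hij")? "kl" "\"" space
# With raw_pattern=True (the new --raw-pattern flag) the surrounding quotes and
# quote escapes are omitted.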


@@ -11,6 +11,7 @@ print(subprocess.check_output(
             "json-schema-to-grammar.py"),
         *rest,
         "-",
+        "--raw-pattern",
     ],
     text=True,
     input=json.dumps({
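
Another aside, not from the commit: the helper script above now passes --raw-pattern to the converter CLI. A rough sketch of an equivalent direct invocation, assuming the converter script sits in the working directory (its real location is not shown in this diff):

import json, subprocess

schema = {"type": "string", "pattern": "^abc?d*efg+(hij)?kl$"}

# "-" makes the script read the schema from stdin (see the argparse help above);
# --raw-pattern emits the bare regex grammar instead of a quoted JSON string rule.
grammar = subprocess.check_output(
    ["python", "json-schema-to-grammar.py", "--raw-pattern", "-"],
    text=True,
    input=json.dumps(schema),
)
print(grammar)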


@@ -62,7 +62,7 @@ unordered_map<char, string> GRAMMAR_LITERAL_ESCAPES = {
 };
 unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
-unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'{', '*', '+', '?'};
+unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};

 template <typename Iterator>
 string join(Iterator begin, Iterator end, const string& separator) {
@@ -186,9 +186,15 @@ private:
        size_t i = 0;
        size_t length = sub_pattern.length();

-        std::function<pair<string, bool>()> transform = [&]() -> pair<string, bool> {
+        using literal_or_rule = pair<string, bool>;
+        auto to_rule = [&](const literal_or_rule& ls) {
+            auto is_literal = ls.second;
+            auto s = ls.first;
+            return is_literal ? "\"" + s + "\"" : s;
+        };
+        std::function<literal_or_rule()> transform = [&]() -> literal_or_rule {
            size_t start = i;
-            vector<pair<string, bool>> seq;
+            vector<literal_or_rule> seq;

            auto get_dot = [&]() {
                string rule;
@@ -202,28 +208,32 @@ private:
            // Joins the sequence, merging consecutive literals together.
            auto join_seq = [&]() {
-                vector<string> results;
+                vector<literal_or_rule> ret;

                string literal;
                auto flush_literal = [&]() {
                    if (literal.empty()) {
                        return false;
                    }
-                    results.push_back("\"" + literal + "\"");
+                    ret.push_back(make_pair(literal, true));
                    literal.clear();
                    return true;
                };

                for (const auto& item : seq) {
-                    if (item.second) {
-                        literal += item.first.substr(1, item.first.length() - 2);
+                    auto is_literal = item.second;
+                    if (is_literal) {
+                        literal += item.first;
                    } else {
                        flush_literal();
-                        results.push_back(item.first);
+                        ret.push_back(item);
                    }
                }
-                if (flush_literal() && results.size() == 1) {
-                    return make_pair(results[0], true);
+                flush_literal();
+
+                vector<string> results;
+                for (const auto& item : ret) {
+                    results.push_back(to_rule(item));
                }
                return make_pair(join(results.begin(), results.end(), " "), false);
            };
@@ -240,8 +250,7 @@ private:
                            _warnings.push_back("Unsupported pattern syntax");
                        }
                    }
-                    auto sub_result = transform();
-                    seq.push_back(make_pair("(" + sub_result.first + ")", false));
+                    seq.push_back(make_pair("(" + to_rule(transform()) + ")", false));
                } else if (c == ')') {
                    i++;
                    if (start > 0 && sub_pattern[start - 1] != '(') {
@@ -270,7 +279,7 @@ private:
                    seq.push_back(make_pair("|", false));
                    i++;
                } else if (c == '*' || c == '+' || c == '?') {
-                    seq.back().first += c;
+                    seq.back() = make_pair(to_rule(seq.back()) + c, false);
                    i++;
                } else if (c == '{') {
                    string curly_brackets = string(1, c);
@@ -287,18 +296,23 @@ private:
                    auto nums = split(curly_brackets.substr(1, curly_brackets.length() - 2), ",");
                    int min_times = 0;
                    int max_times = numeric_limits<int>::max();
+                    try {
                    if (nums.size() == 1) {
-                        min_times = max_times = stoi(nums[0]);
+                        min_times = max_times = std::stoi(nums[0]);
                    } else if (nums.size() != 2) {
                        _errors.push_back("Wrong number of values in curly brackets");
                    } else {
                        if (!nums[0].empty()) {
-                            min_times = stoi(nums[0]);
+                            min_times = std::stoi(nums[0]);
                        }
                        if (!nums[1].empty()) {
-                            max_times = stoi(nums[1]);
+                            max_times = std::stoi(nums[1]);
                        }
                    }
+                    } catch (const std::invalid_argument& e) {
+                        _errors.push_back("Invalid number in curly brackets");
+                        return make_pair("", false);
+                    }

                    auto &last = seq.back();
                    auto &sub = last.first;
                    auto sub_is_literal = last.second;
@@ -346,36 +360,39 @@ private:
                    }
                } else {
                    string literal;
-                    while (i < length && NON_LITERAL_SET.find(sub_pattern[i]) == NON_LITERAL_SET.end() &&
-                            (i == length - 1 || ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS.find(sub_pattern[i + 1]) == ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS.end())) {
-                        if (sub_pattern[i] == '\\' && i < length - 1) {
-                            i++;
-                            if (NON_LITERAL_SET.find(sub_pattern[i]) != NON_LITERAL_SET.end()) {
-                                literal += sub_pattern[i];
-                            } else {
-                                literal += "\\" + string(1, sub_pattern[i]);
-                            }
-                            i++;
-                        } else {
-                            if (sub_pattern[i] == '"') {
-                                literal += "\\";
-                            }
-                            literal += sub_pattern[i];
-                            i++;
-                        }
-                    }
+                    auto is_non_literal = [&](char c) {
+                        return NON_LITERAL_SET.find(c) != NON_LITERAL_SET.end();
+                    };
+                    while (i < length) {
+                        if (sub_pattern[i] == '\\' && i < length - 1) {
+                            char next = sub_pattern[i + 1];
+                            if (ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS.find(next) != ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS.end()) {
+                                i++;
+                                literal += sub_pattern[i];
+                                i++;
+                            } else {
+                                literal += sub_pattern.substr(i, 2);
+                                i += 2;
+                            }
+                        } else if (sub_pattern[i] == '"') {
+                            literal += "\\\"";
+                            i++;
+                        } else if (!is_non_literal(sub_pattern[i]) &&
+                                (i == length - 1 || literal.empty() || sub_pattern[i + 1] == '.' || !is_non_literal(sub_pattern[i + 1]))) {
+                            literal += sub_pattern[i];
+                            i++;
+                        } else {
+                            break;
+                        }
+                    }
                    if (!literal.empty()) {
-                        seq.push_back(make_pair("\"" + literal + "\"", true));
+                        seq.push_back(make_pair(literal, true));
                    }
-                    if (i < length && NON_LITERAL_SET.find(sub_pattern[i]) == NON_LITERAL_SET.end()) {
-                        seq.push_back(make_pair("\"" + string(1, sub_pattern[i]) + "\"", true));
-                        i++;
-                    }
                }
            }
            return join_seq();
        };
-        return _add_rule(name, transform().first);
+        return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space");
    }

    string _resolve_ref(const string& ref) {
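
Aside (not part of the diff): the common thread in the Python, C++ and JavaScript hunks is that sequence items now carry raw literal text plus an is_literal flag, and GBNF quoting happens once, in to_rule / toRule, instead of being baked into every literal. A distilled Python sketch of that convention (the example sequence is hypothetical; the real implementations also handle escapes, quantifiers and sub-rules as shown above):

from itertools import groupby

# Items are (text, is_literal); literal text is stored unquoted.
def to_rule(item):
    txt, is_literal = item
    return '"' + txt + '"' if is_literal else txt

def join_seq(seq):
    ret = []
    # Merge runs of adjacent literals into a single literal before quoting.
    for is_literal, group in groupby(seq, key=lambda x: x[1]):
        if is_literal:
            ret.append((''.join(x[0] for x in group), True))
        else:
            ret.extend(group)
    if len(ret) == 1:
        return ret[0]
    return (' '.join(to_rule(x) for x in ret), False)

# Roughly what the parser builds for ^abc?d*$ (quantified items already
# rendered as non-literal fragments):
seq = [('ab', True), ('"c"?', False), ('"d"*', False)]
print(to_rule(join_seq(seq)))   # -> "ab" "c"? "d"*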

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long


@@ -35,7 +35,7 @@ const GRAMMAR_RANGE_LITERAL_ESCAPE_RE = /[\n\r"\]\-\\]/g;
 const GRAMMAR_LITERAL_ESCAPES = { '\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]' };
 const NON_LITERAL_SET = new Set('|.()[]{}*+?');
-const ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = new Set('{*+?');
+const ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = new Set('[]()|{}*+?');

 export class SchemaConverter {
constructor(options) { constructor(options) {
@@ -163,6 +163,9 @@ export class SchemaConverter {
      return this._addRule('dot', rule);
    };

+    const toRule = ([s, isLiteral]) => isLiteral ? "\"" + s + "\"" : s;
+
    const transform = () => {
      const start = i;
      // For each component of this sequence, store its string representation and whether it's a literal.
@@ -175,8 +178,7 @@ export class SchemaConverter {
        const ret = [];
        for (const [isLiteral, g] of groupBy(seq, x => x[1])) {
          if (isLiteral) {
-            const lit = [...g].map(x => x[0].slice(1, -1)).join('');
-            ret.push([`"${lit}"`, true]);
+            ret.push([[...g].map(x => x[0]).join(''), true]);
          } else {
            ret.push(...g);
          }
@@ -184,7 +186,7 @@ export class SchemaConverter {
        if (ret.length === 1) {
          return ret[0];
        }
-        return [ret.map(x => x[0]).join(' '), false];
+        return [ret.map(x => toRule(x)).join(' '), false];
      };

      while (i < length) {
@@ -199,7 +201,7 @@ export class SchemaConverter {
              throw new Error(`Unsupported pattern syntax "${pattern[i]}" at index ${i} of /${pattern}/`);
            }
          }
-          seq.push([`(${transform()[0]})`, false]);
+          seq.push([`(${toRule(transform())})`, false]);
        } else if (c === ')') {
          i += 1;
          if (start <= 0 || pattern[start - 1] !== '(') {
@@ -228,7 +230,7 @@ export class SchemaConverter {
          seq.push(['|', false]);
          i += 1;
        } else if (c === '*' || c === '+' || c === '?') {
-          seq[seq.length - 1] = [`${seq[seq.length - 1][0]}${c}`, false];
+          seq[seq.length - 1] = [toRule(seq[seq.length - 1]) + c, false];
          i += 1;
        } else if (c === '{') {
          let curlyBrackets = c;
@@ -278,33 +280,31 @@ export class SchemaConverter {
            seq[seq.length - 1] = [repeatedSub.concat(optionalSub).join(' '), false];
          }
        } else {
-          let lit = '';
-          while (i < length && !NON_LITERAL_SET.has(pattern[i]) &&
-              !(i < length - 1 && ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS.has(pattern[i + 1]))) {
-            if (pattern[i] === '\\' && i < length - 1) {
-              i += 1;
-              if (NON_LITERAL_SET.has(pattern[i])) {
-                // Escapes in regular expressions that aren't escaped in GBNF literals
-                lit += pattern[i];
-              } else {
-                lit += `\\${pattern[i]}`;
-              }
-              i += 1;
-            } else {
-              if (pattern[i] === '"') {
-                lit += '\\';
-              }
-              lit += pattern[i];
-              i += 1;
-            }
-          }
-          if (lit) {
-            seq.push([`"${lit}"`, true]);
-          }
-          if (i < length && !NON_LITERAL_SET.has(pattern[i])) {
-            seq.push([`"${pattern[i]}"`, true]);
-            i += 1;
-          }
+          let literal = '';
+          while (i < length) {
+            if (pattern[i] === '\\' && i < length - 1) {
+              const next = pattern[i + 1];
+              if (ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS.has(next)) {
+                i += 1;
+                literal += pattern[i];
+                i += 1;
+              } else {
+                literal += pattern.slice(i, i + 2);
+                i += 2;
+              }
+            } else if (pattern[i] === '"') {
+              literal += '\\"';
+              i += 1;
+            } else if (!NON_LITERAL_SET.has(pattern[i]) &&
+                (i === length - 1 || literal === '' || pattern[i + 1] === '.' || !NON_LITERAL_SET.has(pattern[i+1]))) {
+              literal += pattern[i];
+              i += 1;
+            } else {
+              break;
+            }
+          }
+          if (literal !== '') {
+            seq.push([literal, true]);
+          }
        }
      }
@@ -312,7 +312,7 @@ export class SchemaConverter {
      return joinSeq();
    };

-    return this._addRule(name, transform()[0]);
+    return this._addRule(name, "\"\\\"\" " + toRule(transform()) + " \"\\\"\" space")
  }

  _resolveRef(ref) {


@@ -331,6 +331,45 @@ static void test_all(const string& lang, std::function<void(const TestCase&)> ru
        )"""
    });

+    test({
+        SUCCESS,
+        "simple regexp",
+        R"""({
+            "type": "string",
+            "pattern": "^abc?d*efg+(hij)?kl$"
+        })""",
+        R"""(
+            root ::= "\"" "ab" "c"? "d"* "ef" "g"+ ("hij")? "kl" "\"" space
+            space ::= " "?
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "regexp escapes",
+        R"""({
+            "type": "string",
+            "pattern": "^\\[\\]\\{\\}\\(\\)\\|\\+\\*\\?$"
+        })""",
+        R"""(
+            root ::= "\"" "[]{}()|+*?" "\"" space
+            space ::= " "?
+        )"""
+    });
+
+    test({
+        SUCCESS,
+        "regexp quote",
+        R"""({
+            "type": "string",
+            "pattern": "^\"$"
+        })""",
+        R"""(
+            root ::= "\"" "\"" "\"" space
+            space ::= " "?
+        )"""
+    });
+
    test({
        SUCCESS,
        "regexp",
@@ -340,7 +379,7 @@ static void test_all(const string& lang, std::function<void(const TestCase&)> ru
        })""",
        R"""(
            dot ::= [\U00000000-\x09\x0B\x0C\x0E-\U0010FFFF]
-            root ::= ("(" root-1 root-1? root-1? ")")? root-1 root-1 root-1 "-" root-1 root-1 root-1 root-1 " and" dot dot dot
+            root ::= "\"" ("(" root-1 root-1? root-1? ")")? root-1 root-1 root-1 "-" root-1 root-1 root-1 root-1 " and" dot dot dot "\"" space
            root-1 ::= [0-9]
            space ::= " "?
        )"""