json: fix string patterns (generated grammar was missing the surrounding quotes)

This commit is contained in:
ochafik 2024-03-18 04:06:23 +00:00
parent dd922a4da3
commit 24f0b941cf
7 changed files with 1579 additions and 1500 deletions

View file

@ -44,16 +44,17 @@ GRAMMAR_RANGE_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"\]\-\\]')
GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]'}
NON_LITERAL_SET = set('|.()[]{}*+?')
ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('{*+?')
ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('[]()|{}*+?')
DATE_PATTERN = '[0-9]{4}-(0[1-9]|1[0-2])-([0-2][0-9]|3[0-1])'
TIME_PATTERN = '([01][0-9]|2[0-3])(:[0-5][0-9]){2}(\\.[0-9]{1,3})?(Z|[+-](([01][0-9]|2[0-3]):[0-5][0-9]))' # Cap millisecond precision w/ 3 digits
class SchemaConverter:
def __init__(self, *, prop_order, allow_fetch, dotall):
def __init__(self, *, prop_order, allow_fetch, dotall, raw_pattern):
self._prop_order = prop_order
self._allow_fetch = allow_fetch
self._dotall = dotall
self._raw_pattern = raw_pattern
self._rules = {'space': SPACE_RULE}
self._refs = {}
self._refs_being_resolved = set()
@ -152,6 +153,10 @@ class SchemaConverter:
i = 0
length = len(pattern)
# Render one parsed (text, is_literal) unit as grammar rule text: a literal is
# wrapped in double quotes, anything else is returned unchanged.
def to_rule(s: Tuple[str, bool]) -> str:
(txt, is_literal) = s
return "\"" + txt + "\"" if is_literal else txt
def transform() -> Tuple[str, bool]:
'''
Parse a unit at index i (advancing it), and return its string representation + whether it's a literal.
@ -180,13 +185,12 @@ class SchemaConverter:
ret = []
for is_literal, g in itertools.groupby(seq, lambda x: x[1]):
if is_literal:
lit = ''.join(x[0][1:-1] for x in g)
ret.append((f'"{lit}"', True))
ret.append((''.join(x[0] for x in g), True))
else:
ret.extend(g)
if len(ret) == 1:
return ret[0]
return (' '.join(x[0] for x in seq), False)
return (' '.join(to_rule(x) for x in seq), False)
while i < length:
c = pattern[i]
@ -197,7 +201,7 @@ class SchemaConverter:
i += 1
if i < length:
assert pattern[i] != '?', f'Unsupported pattern syntax "{pattern[i]}" at index {i} of /{pattern}/'
seq.append((f'({transform()[0]})', False))
seq.append((f'({to_rule(transform())})', False))
elif c == ')':
i += 1
assert start > 0 and pattern[start-1] == '(', f'Unbalanced parentheses; start = {start}, i = {i}, pattern = {pattern}'
@ -220,7 +224,7 @@ class SchemaConverter:
seq.append(('|', False))
i += 1
elif c in ('*', '+', '?'):
seq[-1] = (f'{seq[-1][0]}{c}', False)
seq[-1] = (to_rule(seq[-1]) + c, False)
i += 1
elif c == '{':
curly_brackets = c
@ -232,13 +236,18 @@ class SchemaConverter:
curly_brackets += '}'
i += 1
nums = [s.strip() for s in curly_brackets[1:-1].split(',')]
if len(nums) == 1:
min_times = int(nums[0])
max_times = min_times
else:
assert len(nums) == 2
min_times = int(nums[0]) if nums[0] else 0
max_times = int(nums[1]) if nums[1] else None
min_times = 0
max_times = None
try:
if len(nums) == 1:
min_times = int(nums[0])
max_times = min_times
else:
assert len(nums) == 2
min_times = int(nums[0]) if nums[0] else 0
max_times = int(nums[1]) if nums[1] else None
except ValueError:
raise ValueError(f'Invalid quantifier {curly_brackets} in /{pattern}/')
(sub, sub_is_literal) = seq[-1]
@ -263,32 +272,35 @@ class SchemaConverter:
False
)
else:
lit = ''
while i < length and pattern[i] not in NON_LITERAL_SET \
and not (i < length - 1 and pattern[i+1] in ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS):
literal = ''
while i < length:
if pattern[i] == '\\' and i < length - 1:
i += 1
if pattern[i] in NON_LITERAL_SET:
# Escapes in regular expressions that aren't escaped in GBNF literals
lit += pattern[i]
next = pattern[i + 1]
if next in ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS:
i += 1
literal += pattern[i]
i += 1
else:
lit += f'\\{pattern[i]}'
literal += pattern[i:i+2]
i += 2
elif pattern[i] == '"' and not self._raw_pattern:
literal += '\\"'
i += 1
elif pattern[i] not in NON_LITERAL_SET and \
(i == length - 1 or literal == '' or pattern[i+1] == '.' or pattern[i+1] not in NON_LITERAL_SET):
literal += pattern[i]
i += 1
else:
if pattern[i] == '"':
lit += '\\'
lit += pattern[i]
i += 1
if lit:
seq.append((f'"{lit}"', True))
if i < length and pattern[i] not in ('.', '(', ')', '|', '[', '{', '*', '+', '?'):
seq.append((f'"{pattern[i]}"', True))
i += 1
break
if literal:
seq.append((literal, True))
return join_seq()
return self._add_rule(name, transform()[0])
return self._add_rule(
name,
to_rule(transform()) if self._raw_pattern \
else "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space")
def _resolve_ref(self, ref):
@ -510,6 +522,11 @@ def main(args_in = None):
action='store_true',
default=False,
help='Whether to treat dot (".") as matching all chars including line breaks in regular expression patterns')
parser.add_argument(
'--raw-pattern',
action='store_true',
default=False,
help='Treats string patterns as raw patterns w/o quotes (or quote escapes)')
parser.add_argument('schema', help='file containing JSON schema ("-" for stdin)')
args = parser.parse_args(args_in)
@ -528,7 +545,8 @@ def main(args_in = None):
converter = SchemaConverter(
prop_order={name: idx for idx, name in enumerate(args.prop_order)},
allow_fetch=args.allow_fetch,
dotall=args.dotall)
dotall=args.dotall,
raw_pattern=args.raw_pattern)
schema = converter.resolve_refs(schema, url)
converter.visit(schema, '')
print(converter.format_grammar())

View file

@ -11,6 +11,7 @@ print(subprocess.check_output(
"json-schema-to-grammar.py"),
*rest,
"-",
"--raw-pattern",
],
text=True,
input=json.dumps({

View file

@ -62,7 +62,7 @@ unordered_map<char, string> GRAMMAR_LITERAL_ESCAPES = {
};
unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'{', '*', '+', '?'};
unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};
template <typename Iterator>
string join(Iterator begin, Iterator end, const string& separator) {
@ -186,9 +186,15 @@ private:
size_t i = 0;
size_t length = sub_pattern.length();
std::function<pair<string, bool>()> transform = [&]() -> pair<string, bool> {
using literal_or_rule = pair<string, bool>;
// Renders a (text, is_literal) pair as rule text: literals are wrapped in
// double quotes, non-literals are returned as-is.
auto to_rule = [&](const literal_or_rule& ls) {
auto is_literal = ls.second;
auto s = ls.first;
return is_literal ? "\"" + s + "\"" : s;
};
std::function<literal_or_rule()> transform = [&]() -> literal_or_rule {
size_t start = i;
vector<pair<string, bool>> seq;
vector<literal_or_rule> seq;
auto get_dot = [&]() {
string rule;
@ -202,28 +208,32 @@ private:
// Joins the sequence, merging consecutive literals together.
auto join_seq = [&]() {
vector<string> results;
vector<literal_or_rule> ret;
string literal;
auto flush_literal = [&]() {
if (literal.empty()) {
return false;
}
results.push_back("\"" + literal + "\"");
ret.push_back(make_pair(literal, true));
literal.clear();
return true;
};
for (const auto& item : seq) {
if (item.second) {
literal += item.first.substr(1, item.first.length() - 2);
auto is_literal = item.second;
if (is_literal) {
literal += item.first;
} else {
flush_literal();
results.push_back(item.first);
ret.push_back(item);
}
}
if (flush_literal() && results.size() == 1) {
return make_pair(results[0], true);
flush_literal();
vector<string> results;
for (const auto& item : ret) {
results.push_back(to_rule(item));
}
return make_pair(join(results.begin(), results.end(), " "), false);
};
@ -240,8 +250,7 @@ private:
_warnings.push_back("Unsupported pattern syntax");
}
}
auto sub_result = transform();
seq.push_back(make_pair("(" + sub_result.first + ")", false));
seq.push_back(make_pair("(" + to_rule(transform()) + ")", false));
} else if (c == ')') {
i++;
if (start > 0 && sub_pattern[start - 1] != '(') {
@ -270,7 +279,7 @@ private:
seq.push_back(make_pair("|", false));
i++;
} else if (c == '*' || c == '+' || c == '?') {
seq.back().first += c;
seq.back() = make_pair(to_rule(seq.back()) + c, false);
i++;
} else if (c == '{') {
string curly_brackets = string(1, c);
@ -287,17 +296,22 @@ private:
auto nums = split(curly_brackets.substr(1, curly_brackets.length() - 2), ",");
int min_times = 0;
int max_times = numeric_limits<int>::max();
if (nums.size() == 1) {
min_times = max_times = stoi(nums[0]);
} else if (nums.size() != 2) {
_errors.push_back("Wrong number of values in curly brackets");
} else {
if (!nums[0].empty()) {
min_times = stoi(nums[0]);
}
if (!nums[1].empty()) {
max_times = stoi(nums[1]);
try {
if (nums.size() == 1) {
min_times = max_times = std::stoi(nums[0]);
} else if (nums.size() != 2) {
_errors.push_back("Wrong number of values in curly brackets");
} else {
if (!nums[0].empty()) {
min_times = std::stoi(nums[0]);
}
if (!nums[1].empty()) {
max_times = std::stoi(nums[1]);
}
}
} catch (const std::invalid_argument& e) {
_errors.push_back("Invalid number in curly brackets");
return make_pair("", false);
}
auto &last = seq.back();
auto &sub = last.first;
@ -346,36 +360,39 @@ private:
}
} else {
string literal;
while (i < length && NON_LITERAL_SET.find(sub_pattern[i]) == NON_LITERAL_SET.end() &&
(i == length - 1 || ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS.find(sub_pattern[i + 1]) == ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS.end())) {
auto is_non_literal = [&](char c) {
return NON_LITERAL_SET.find(c) != NON_LITERAL_SET.end();
};
while (i < length) {
if (sub_pattern[i] == '\\' && i < length - 1) {
i++;
if (NON_LITERAL_SET.find(sub_pattern[i]) != NON_LITERAL_SET.end()) {
char next = sub_pattern[i + 1];
if (ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS.find(next) != ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS.end()) {
i++;
literal += sub_pattern[i];
i++;
} else {
literal += "\\" + string(1, sub_pattern[i]);
literal += sub_pattern.substr(i, 2);
i += 2;
}
} else if (sub_pattern[i] == '"') {
literal += "\\\"";
i++;
} else {
if (sub_pattern[i] == '"') {
literal += "\\";
}
} else if (!is_non_literal(sub_pattern[i]) &&
(i == length - 1 || literal.empty() || sub_pattern[i + 1] == '.' || !is_non_literal(sub_pattern[i + 1]))) {
literal += sub_pattern[i];
i++;
} else {
break;
}
}
if (!literal.empty()) {
seq.push_back(make_pair("\"" + literal + "\"", true));
}
if (i < length && NON_LITERAL_SET.find(sub_pattern[i]) == NON_LITERAL_SET.end()) {
seq.push_back(make_pair("\"" + string(1, sub_pattern[i]) + "\"", true));
i++;
seq.push_back(make_pair(literal, true));
}
}
}
return join_seq();
};
return _add_rule(name, transform().first);
return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space");
}
string _resolve_ref(const string& ref) {

File diff suppressed because it is too large Load diff

File diff suppressed because one or more lines are too long

View file

@ -35,7 +35,7 @@ const GRAMMAR_RANGE_LITERAL_ESCAPE_RE = /[\n\r"\]\-\\]/g;
const GRAMMAR_LITERAL_ESCAPES = { '\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]' };
const NON_LITERAL_SET = new Set('|.()[]{}*+?');
const ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = new Set('{*+?');
const ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = new Set('[]()|{}*+?');
export class SchemaConverter {
constructor(options) {
@ -163,6 +163,9 @@ export class SchemaConverter {
return this._addRule('dot', rule);
};
// Renders a [text, isLiteral] pair as rule text: literals are wrapped in double quotes, everything else passes through unchanged.
const toRule = ([s, isLiteral]) => isLiteral ? "\"" + s + "\"" : s;
const transform = () => {
const start = i;
// For each component of this sequence, store its string representation and whether it's a literal.
@ -175,8 +178,7 @@ export class SchemaConverter {
const ret = [];
for (const [isLiteral, g] of groupBy(seq, x => x[1])) {
if (isLiteral) {
const lit = [...g].map(x => x[0].slice(1, -1)).join('');
ret.push([`"${lit}"`, true]);
ret.push([[...g].map(x => x[0]).join(''), true]);
} else {
ret.push(...g);
}
@ -184,7 +186,7 @@ export class SchemaConverter {
if (ret.length === 1) {
return ret[0];
}
return [ret.map(x => x[0]).join(' '), false];
return [ret.map(x => toRule(x)).join(' '), false];
};
while (i < length) {
@ -199,7 +201,7 @@ export class SchemaConverter {
throw new Error(`Unsupported pattern syntax "${pattern[i]}" at index ${i} of /${pattern}/`);
}
}
seq.push([`(${transform()[0]})`, false]);
seq.push([`(${toRule(transform())})`, false]);
} else if (c === ')') {
i += 1;
if (start <= 0 || pattern[start - 1] !== '(') {
@ -228,7 +230,7 @@ export class SchemaConverter {
seq.push(['|', false]);
i += 1;
} else if (c === '*' || c === '+' || c === '?') {
seq[seq.length - 1] = [`${seq[seq.length - 1][0]}${c}`, false];
seq[seq.length - 1] = [toRule(seq[seq.length - 1]) + c, false];
i += 1;
} else if (c === '{') {
let curlyBrackets = c;
@ -278,33 +280,31 @@ export class SchemaConverter {
seq[seq.length - 1] = [repeatedSub.concat(optionalSub).join(' '), false];
}
} else {
let lit = '';
while (i < length && !NON_LITERAL_SET.has(pattern[i]) &&
!(i < length - 1 && ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS.has(pattern[i + 1]))) {
let literal = '';
while (i < length) {
if (pattern[i] === '\\' && i < length - 1) {
i += 1;
if (NON_LITERAL_SET.has(pattern[i])) {
// Escapes in regular expressions that aren't escaped in GBNF literals
lit += pattern[i];
const next = pattern[i + 1];
if (ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS.has(next)) {
i += 1;
literal += pattern[i];
i += 1;
} else {
lit += `\\${pattern[i]}`;
literal += pattern.slice(i, i + 2);
i += 2;
}
} else if (pattern[i] === '"') {
literal += '\\"';
i += 1;
} else if (!NON_LITERAL_SET.has(pattern[i]) &&
(i === length - 1 || literal === '' || pattern[i + 1] === '.' || !NON_LITERAL_SET.has(pattern[i+1]))) {
literal += pattern[i];
i += 1;
} else {
if (pattern[i] === '"') {
lit += '\\';
}
lit += pattern[i];
i += 1;
break;
}
}
if (lit) {
seq.push([`"${lit}"`, true]);
}
if (i < length && !NON_LITERAL_SET.has(pattern[i])) {
seq.push([`"${pattern[i]}"`, true]);
i += 1;
if (literal !== '') {
seq.push([literal, true]);
}
}
}
@ -312,7 +312,7 @@ export class SchemaConverter {
return joinSeq();
};
return this._addRule(name, transform()[0]);
return this._addRule(name, "\"\\\"\" " + toRule(transform()) + " \"\\\"\" space")
}
_resolveRef(ref) {

View file

@ -331,6 +331,45 @@ static void test_all(const string& lang, std::function<void(const TestCase&)> ru
)"""
});
test({
SUCCESS,
"simple regexp",
R"""({
"type": "string",
"pattern": "^abc?d*efg+(hij)?kl$"
})""",
R"""(
root ::= "\"" "ab" "c"? "d"* "ef" "g"+ ("hij")? "kl" "\"" space
space ::= " "?
)"""
});
test({
SUCCESS,
"regexp escapes",
R"""({
"type": "string",
"pattern": "^\\[\\]\\{\\}\\(\\)\\|\\+\\*\\?$"
})""",
R"""(
root ::= "\"" "[]{}()|+*?" "\"" space
space ::= " "?
)"""
});
test({
SUCCESS,
"regexp quote",
R"""({
"type": "string",
"pattern": "^\"$"
})""",
R"""(
root ::= "\"" "\"" "\"" space
space ::= " "?
)"""
});
test({
SUCCESS,
"regexp",
@ -340,7 +379,7 @@ static void test_all(const string& lang, std::function<void(const TestCase&)> ru
})""",
R"""(
dot ::= [\U00000000-\x09\x0B\x0C\x0E-\U0010FFFF]
root ::= ("(" root-1 root-1? root-1? ")")? root-1 root-1 root-1 "-" root-1 root-1 root-1 root-1 " and" dot dot dot
root ::= "\"" ("(" root-1 root-1? root-1? ")")? root-1 root-1 root-1 "-" root-1 root-1 root-1 root-1 " and" dot dot dot "\"" space
root-1 ::= [0-9]
space ::= " "?
)"""