json: optimize repetitions for minItems/maxItems and regexps: a{,3} goes from "a"? "a"? "a"? (explosive combos) to (a (a (a)?)?)?

This commit is contained in:
ochafik 2024-04-08 20:02:18 +01:00
parent 159b883bd4
commit a59e9431fc
5 changed files with 1730 additions and 1659 deletions

View file

@ -11,6 +11,23 @@
using json = nlohmann::ordered_json;
/**
 * Builds a grammar fragment that matches between 0 and `upToN` consecutive
 * occurrences of `content`, expressed as nested optional groups:
 *
 *   build_repetition("a", 3) == "(a (a (a)?)?)?"
 *
 * The nested form replaces the flat `a? a? a?` sequence, which the change
 * description calls out as producing explosive combinations.
 *
 * @param content grammar text for a single occurrence (inserted verbatim).
 * @param upToN   maximum number of occurrences; values <= 0 yield "".
 * @return the nested optional expression, or an empty string when there is
 *         nothing to repeat.
 */
static std::string build_repetition(const std::string & content, int upToN) {
    // Guard against non-positive counts (e.g. max < min at a call site): the
    // caller asked for no optional occurrences, so emit nothing.
    if (upToN <= 0) {
        return "";
    }

    std::string out;
    // Each level contributes "(" + content + ")?" plus one separating space.
    out.reserve(static_cast<std::string::size_type>(upToN) * (content.size() + 4));

    // Open one group per occurrence: "(a (a (a".
    for (int i = 0; i < upToN; i++) {
        if (i > 0) {
            out += ' ';
        }
        out += '(';
        out += content;
    }
    // Close every group as optional, innermost first: ")?)?)?".
    for (int i = 0; i < upToN; i++) {
        out += ")?";
    }
    return out;
}
// Grammar text `" "?`: a quoted single-space literal followed by `?` (optional).
const std::string SPACE_RULE = "\" \"?";
struct BuiltinRule {
@ -343,12 +360,7 @@ private:
if (max_times == std::numeric_limits<int>::max()) {
result += sub + "*";
} else {
for (int j = min_times; j < max_times; j++) {
if (j > min_times) {
result += " ";
}
result += sub + "?";
}
result += build_repetition(sub, max_times - min_times);
}
seq.back().first = result;
seq.back().second = false;
@ -680,7 +692,7 @@ public:
min_items--;
}
if (max_items >= 0 && max_items > min_items) {
successive_items += repeat(list_item_operator + "?", max_items - min_items - 1);
successive_items += build_repetition(list_item_operator, max_items - min_items - 1);
} else {
successive_items += list_item_operator + "*";
}

View file

@ -6,6 +6,12 @@ import re
import sys
from typing import Any, Dict, List, Set, Tuple, Union
def _build_repetition(content, up_to_n):
# return ' '.join([content] * n)
if up_to_n == 0:
return ''
return f'({content}{" " + _build_repetition(content, up_to_n-1) if up_to_n > 1 else ""})?'
class BuiltinRule:
def __init__(self, content: str, deps: list[str] = None):
self.content = content
@ -277,10 +283,13 @@ class SchemaConverter:
(sub, sub_is_literal) = seq[-1]
if min_times == 0 and max_times is None:
sub = f'"{sub}"' if sub_is_literal else sub
seq[-1] = (f'{sub}*', False)
elif min_times == 0 and max_times == 1:
sub = f'"{sub}"' if sub_is_literal else sub
seq[-1] = (f'{sub}?', False)
elif min_times == 1 and max_times is None:
sub = f'"{sub}"' if sub_is_literal else sub
seq[-1] = (f'{sub}+', False)
else:
if not sub_is_literal:
@ -290,12 +299,17 @@ class SchemaConverter:
sub_rule_ids[sub] = id
sub = id
seq[-1] = (
' '.join(
([f'"{sub[1:-1] * min_times}"'] if sub_is_literal else [sub] * min_times) +
([f'{sub}?'] * (max_times - min_times) if max_times is not None else [f'{sub}*'])),
False
)
if sub_is_literal and min_times > 0:
result = '"' + (sub[1:-1] * min_times) + '"'
else:
result = ' '.join([sub] * min_times)
if min_times < max_times:
if min_times > 0:
result += ' '
result += _build_repetition(sub, max_times - min_times)
seq[-1] = (result, False)
else:
literal = ''
while i < length:
@ -411,7 +425,7 @@ class SchemaConverter:
successive_items = list_item_operator * (min_items - 1)
min_items -= 1
if max_items is not None and max_items > min_items:
successive_items += (list_item_operator + "?") * (max_items - min_items - 1)
successive_items += _build_repetition(list_item_operator, max_items - min_items - 1)
else:
successive_items += list_item_operator + "*"
if min_items == 0:

File diff suppressed because it is too large Load diff

View file

@ -1,6 +1,13 @@
// WARNING: This file was ported from json_schema_to_grammar.py, please fix bugs / add features there first.
// Grammar text `" "?`: a quoted single-space literal followed by `?` (optional).
const SPACE_RULE = '" "?';
/**
 * Builds a grammar fragment matching between 0 and `upToN` occurrences of
 * `content`, as nested optional groups, e.g.
 * `_buildRepetition('a', 3) === '(a (a (a)?)?)?'`.
 *
 * @param {string} content grammar text for a single occurrence (inserted verbatim).
 * @param {number} upToN maximum number of occurrences; anything that is not a
 *   positive number (0, negatives, undefined, NaN) yields ''.
 * @returns {string} the nested optional expression, or '' when there is nothing to repeat.
 */
function _buildRepetition(content, upToN) {
  // `!(x > 0)` also rejects undefined/NaN, which would otherwise slip past a `=== 0` check.
  if (!(upToN > 0)) {
    return '';
  }
  // Built iteratively to avoid one stack frame per repetition:
  // open one group per occurrence, then close them all as optional.
  const opening = Array.from({ length: upToN }, () => `(${content}`).join(' ');
  const closing = ')?'.repeat(upToN);
  return opening + closing;
}
class BuiltinRule {
constructor(content, deps) {
this.content = content;
@ -281,9 +288,20 @@ export class SchemaConverter {
sub = id;
}
const repeatedSub = Array.from({ length: minTimes }, () => subIsLiteral ? `"${sub.slice(1, -1).repeat(minTimes)}"` : sub);
const optionalSub = maxTimes !== undefined ? Array.from({ length: maxTimes - minTimes }, () => `${sub}?`) : [`${sub}*`];
seq[seq.length - 1] = [repeatedSub.concat(optionalSub).join(' '), false];
let result;
if (subIsLiteral && minTimes > 0) {
result = `"${sub.slice(1, -1).repeat(minTimes)}"`;
} else {
result = Array.from({ length: minTimes }, () => sub).join(' ');
}
if (minTimes < maxTimes) {
if (minTimes > 0) {
result += ' ';
}
result += _buildRepetition(sub, maxTimes - minTimes);
}
seq[seq.length - 1] = [result, false];
}
} else {
let literal = '';
@ -409,7 +427,7 @@ export class SchemaConverter {
minItems--;
}
if (maxItems !== undefined && maxItems > minItems) {
successiveItems += `${listItemOperator}?`.repeat(maxItems - minItems - 1);
successiveItems += _buildRepetition(listItemOperator, maxItems - minItems - 1);
} else {
successiveItems += `${listItemOperator}*`;
}

View file

@ -307,7 +307,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
})""",
R"""(
boolean ::= ("true" | "false") space
root ::= "[" space ( boolean ( "," space boolean )? )? "]" space
root ::= "[" space ( boolean (( "," space boolean ))? )? "]" space
space ::= " "?
)"""
});
@ -326,7 +326,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
integer ::= ("-"? ([0-9] | [1-9] [0-9]*)) space
item ::= number | integer
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space
root ::= "[" space item ( "," space item )( "," space item )( "," space item )?( "," space item )? "]" space
root ::= "[" space item ( "," space item )( "," space item )(( "," space item ) (( "," space item ))?)? "]" space
space ::= " "?
)"""
});
@ -379,7 +379,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
})""",
R"""(
dot ::= [\U00000000-\x09\x0B\x0C\x0E-\U0010FFFF]
root ::= "\"" ("(" root-1 root-1? root-1? ")")? root-1 root-1 root-1 "-" root-1 root-1 root-1 root-1 " and" dot dot dot "\"" space
root ::= "\"" ("(" root-1 (root-1 (root-1)?)? ")")? root-1 root-1 root-1 "-" root-1 root-1 root-1 root-1 " and" dot dot dot "\"" space
root-1 ::= [0-9]
space ::= " "?
)"""