grammars: x{min,max} repetition operator (#6640)

* grammars: x{min,max} repetition operator + tweak +/*/? to avoid duplication of original over alternates

* grammars: handle `x{n}` and fix `x{n,n}`

* grammars: document new repetition operators

* grammars: uniform use of int for min & max

* grammars: refactor parser test

* grammar: parsing tests w/ natural pretty print of updated expectations

* grammars: much prettier print of expectations (+ TEST_GRAMMAR_PARSER_PRINT_ALL=1 to force all)

* grammars: improve test pretty print again

* grammars: pretty print rules and chars

* grammars: fix copy rule skipping

* grammars: disallow `a{,}` (not allowed in regexps)

* Update common/grammar-parser.cpp

Co-authored-by: Clint Herron <hanclinto@gmail.com>

* grammars: fix copy rule skipping (again) & display of expectations

* grammars: more test cases

* grammars: update reps parsing to bring ? / * / + closer to before

* json: use new GBNF repetitions{m,n} syntax

* grammars: update performance gotchas w/ repetition advice

* Update examples/json_schema_to_grammar.py

Co-authored-by: Clint Herron <hanclinto@gmail.com>

* Update examples/server/public/json-schema-to-grammar.mjs

Co-authored-by: Clint Herron <hanclinto@gmail.com>

* grammars: comment on rule repetitions

* grammars: ensure unambiguous number alternatives

* grammar: nit typo switched error msgs

* grammar: nit numbering in comment

* json: update numeric rule to be unambiguous

* Apply suggestions from code review

Co-authored-by: Clint Herron <hanclinto@gmail.com>

* Update examples/server/public/json-schema-to-grammar.mjs

Co-authored-by: Clint Herron <hanclinto@gmail.com>

* json: fix integral-part

* grammar: add repetition tests

---------

Co-authored-by: Clint Herron <hanclinto@gmail.com>
This commit is contained in:
Olivier Chafik 2024-06-06 10:07:06 +01:00 committed by GitHub
parent f5d7b268ec
commit 55b2d0849d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 736 additions and 418 deletions

View file

@ -2,57 +2,26 @@
const SPACE_RULE = '" "?';
function _buildRepetition(itemRule, minItems, maxItems, opts={}) {
if (minItems === 0 && maxItems === 1) {
return `${itemRule}?`;
}
const separatorRule = opts.separatorRule ?? '';
const itemRuleIsLiteral = opts.itemRuleIsLiteral ?? false
if (separatorRule === '') {
if (minItems === 0 && maxItems === 1) {
return `${itemRule}?`;
} else if (minItems === 1 && maxItems === undefined) {
if (minItems === 1 && maxItems === undefined) {
return `${itemRule}+`;
}
}
let result = '';
if (minItems > 0) {
if (itemRuleIsLiteral && separatorRule === '') {
result = `"${itemRule.slice(1, -1).repeat(minItems)}"`;
} else if (minItems === 0 && maxItems === undefined) {
return `${itemRule}*`;
} else {
result = Array.from({ length: minItems }, () => itemRule)
.join(separatorRule !== '' ? ` ${separatorRule} ` : ' ');
return `${itemRule}{${minItems},${maxItems !== undefined ? maxItems : ''}}`;
}
}
const optRepetitions = (upToN, prefixWithSep=false) => {
const content = separatorRule !== '' && prefixWithSep ? `${separatorRule} ${itemRule}` : itemRule;
if (upToN === 0) {
return '';
} else if (upToN === 1) {
return `(${content})?`;
} else if (separatorRule !== '' && !prefixWithSep) {
return `(${content} ${optRepetitions(upToN - 1, true)})?`;
} else {
return Array.from({ length: upToN }, () => `(${content}`).join(' ').trim() + Array.from({ length: upToN }, () => ')?').join('');
}
};
if (minItems > 0 && maxItems !== minItems) {
result += ' ';
}
if (maxItems !== undefined) {
result += optRepetitions(maxItems - minItems, minItems > 0);
} else {
const itemOperator = `(${separatorRule !== '' ? separatorRule + ' ' : ''}${itemRule})`;
if (minItems === 0 && separatorRule !== '') {
result = `(${itemRule} ${itemOperator}*)?`;
} else {
result += `${itemOperator}*`;
}
}
return result;
const result = itemRule + ' ' + _buildRepetition(`(${separatorRule} ${itemRule})`, minItems > 0 ? minItems - 1 : 0, maxItems !== undefined ? maxItems - 1 : undefined);
return minItems === 0 ? `(${result})?` : result;
}
class BuiltinRule {
@ -62,27 +31,25 @@ class BuiltinRule {
}
}
const UP_TO_15_DIGITS = _buildRepetition('[0-9]', 0, 15);
const PRIMITIVE_RULES = {
boolean : new BuiltinRule('("true" | "false") space', []),
'decimal-part' : new BuiltinRule('[0-9] ' + UP_TO_15_DIGITS, []),
'integral-part': new BuiltinRule('[0-9] | [1-9] ' + UP_TO_15_DIGITS, []),
'decimal-part' : new BuiltinRule('[0-9]{1,16}', []),
'integral-part': new BuiltinRule('[0] | [1-9] [0-9]{0,15}', []),
number : new BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']),
integer : new BuiltinRule('("-"? integral-part) space', ['integral-part']),
value : new BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']),
object : new BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
array : new BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
uuid : new BuiltinRule('"\\"" ' + [8, 4, 4, 4, 12].map(n => [...new Array(n)].map(_ => '[0-9a-fA-F]').join('')).join(' "-" ') + ' "\\"" space', []),
char : new BuiltinRule(`[^"\\\\] | "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])`, []),
uuid : new BuiltinRule('"\\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\\"" space', []),
char : new BuiltinRule(`[^"\\\\] | "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F]{4})`, []),
string : new BuiltinRule(`"\\"" char* "\\"" space`, ['char']),
null : new BuiltinRule('"null" space', []),
};
// TODO: support "uri", "email" string formats
const STRING_FORMAT_RULES = {
'date' : new BuiltinRule('[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
'time' : new BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
'date' : new BuiltinRule('[0-9]{4} "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
'time' : new BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
'date-time' : new BuiltinRule('date "T" time', ['date', 'time']),
'date-string' : new BuiltinRule('"\\"" date "\\"" space', ['date']),
'time-string' : new BuiltinRule('"\\"" time "\\"" space', ['time']),