llama : add grammar-based sampling (#1773)

* llama, main : constrain sampling to grammar

* allow loading grammar from file

* fix whitespace errors

* handle & print parser errors

* add comments to grammar syntax and allow newlines where unambiguous

* add missing include

* support alternates in root rule

* fix bugs with empty token and EOS

* adjust JSON grammar

* remove swp file

* rewrite ternary expressions

Co-authored-by: Henri Vasserman <henv@hot.ee>

* use struct for grammar elements and add Unicode support

* add unicode escapes

* add inverse char ranges

* only sample full tokens (no peeking or truncation)

* llama : minor style changes

blindly applied in online editor - hopefully I didn't break something

* update help text

* add warning message if EOS is disabled

---------

Co-authored-by: Henri Vasserman <henv@hot.ee>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
Evan Jones 2023-07-23 23:58:10 -04:00 committed by GitHub
parent 2f9cf974a0
commit 84e09a7d8b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
14 changed files with 977 additions and 1 deletions

6
grammars/arithmetic.gbnf Normal file
View file

@ -0,0 +1,6 @@
root ::= (expr "=" ws term "\n")+
expr ::= term ([-+*/] term)*
term ::= ident | num | "(" ws expr ")" ws
ident ::= [a-z] [a-z0-9_]* ws
num ::= [0-9]+ ws
ws ::= [ \t\n]*

13
grammars/chess.gbnf Normal file
View file

@ -0,0 +1,13 @@
# Specifies chess moves as a list in algebraic notation, using PGN conventions
# Force first move to "1. ", then any 1-2 digit number after, relying on model to follow the pattern
root ::= "1. " move " " move "\n" ([1-9] [0-9]? ". " move " " move "\n")+
move ::= (pawn | nonpawn | castle) [+#]?
# piece type, optional file/rank, optional capture, dest file & rank
nonpawn ::= [NBKQR] [a-h]? [1-8]? "x"? [a-h] [1-8]
# optional file & capture, dest file & rank, optional promotion
pawn ::= ([a-h] "x")? [a-h] [1-8] ("=" [NBKQR])?
castle ::= "O-O" "-O"?

7
grammars/japanese.gbnf Normal file
View file

@ -0,0 +1,7 @@
# A probably incorrect grammar for Japanese
root ::= jp-char+ ([ \t\n] jp-char+)*
jp-char ::= hiragana | katakana | punctuation | cjk
hiragana ::= [ぁ-ゟ]
katakana ::= [ァ-ヿ]
punctuation ::= [、-〾]
cjk ::= [一-鿿]

29
grammars/json.gbnf Normal file
View file

@ -0,0 +1,29 @@
# Grammar for subset of JSON - doesn't support full string or number syntax
root ::= object
value ::= object | array | string | number | boolean | "null"
object ::=
"{" ws (
string ":" ws value
("," ws string ":" ws value)*
)? "}"
array ::=
"[" ws (
value
("," ws value)*
)? "]"
string ::=
"\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
)* "\"" ws
# Only plain integers currently
number ::= "-"? [0-9]+ ws
boolean ::= ("true" | "false") ws
# Optional space: by convention, applied in this grammar after literal chars when allowed
ws ::= ([ \t\n] ws)?

4
grammars/list.gbnf Normal file
View file

@ -0,0 +1,4 @@
root ::= item+
# Excludes various line break characters
item ::= "- " [^\r\n\x0b\x0c\x85\u2028\u2029]+ "\n"