Add alternative regex for custom aplit llama3
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
1d8fcc06ba
commit
2cd1eb0daa
1 changed files with 4 additions and 1 deletions
|
@ -532,7 +532,10 @@ static std::vector<size_t> unicode_regex_split_custom(const std::string & text,
|
||||||
|
|
||||||
if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") {
|
if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") {
|
||||||
bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets);
|
bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets);
|
||||||
} else if (regex_expr == "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") {
|
} else if (
|
||||||
|
regex_expr == "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" ||
|
||||||
|
regex_expr == "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") {
|
||||||
|
|
||||||
bpe_offsets = unicode_regex_split_custom_llama3(text, offsets);
|
bpe_offsets = unicode_regex_split_custom_llama3(text, offsets);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue