llama3 custom regex split: fix \s
This commit is contained in:
parent
8fd849eb90
commit
67832e5554
2 changed files with 17 additions and 13 deletions
|
@ -144,12 +144,13 @@ def test_custom_texts(model:LibLlamaModel, tokenizer:PreTrainedTokenizerBase):
|
|||
]
|
||||
|
||||
more_tests = [
|
||||
'\x1f-a', # unicode_ranges_control, {0x00001C, 0x00001F}
|
||||
'¼-a', # unicode_ranges_digit, 0x00BC
|
||||
'½-a', # unicode_ranges_digit, 0x00BD
|
||||
'¾-a', # unicode_ranges_digit, 0x00BE
|
||||
'a 〇b', # unicode_ranges_digit, 0x3007
|
||||
'Ⅵ-a', # unicode_ranges_digit, {0x00002150, 0x0000218F} // Number Forms
|
||||
'\x1f-a', # unicode_ranges_control, {0x00001C, 0x00001F}
|
||||
'¼-a', # unicode_ranges_digit, 0x00BC
|
||||
'½-a', # unicode_ranges_digit, 0x00BD
|
||||
'¾-a', # unicode_ranges_digit, 0x00BE
|
||||
'a 〇b', # unicode_ranges_digit, 0x3007
|
||||
'Ⅵ-a', # unicode_ranges_digit, {0x00002150, 0x0000218F} // Number Forms
|
||||
'\uFEFF//', # unicode_ranges_control, 0xFEFF (BOM)
|
||||
]
|
||||
|
||||
for text in tests+more_tests:
|
||||
|
|
17
unicode.cpp
17
unicode.cpp
|
@ -281,6 +281,7 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
|
|||
}
|
||||
}
|
||||
|
||||
char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt);
|
||||
int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type);
|
||||
// regex: <space>?\p{L}+
|
||||
if (cpt2_type == CODEPOINT_TYPE_LETTER) {
|
||||
|
@ -301,17 +302,18 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
|
|||
continue;
|
||||
}
|
||||
// regex: <space>?[^\s\p{L}\p{N}]+
|
||||
if (cpt2_type != CODEPOINT_TYPE_SEPARATOR && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
|
||||
if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
|
||||
pos += (cpt == ' ');
|
||||
while (cpt2_type != CODEPOINT_TYPE_SEPARATOR && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
|
||||
while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
|
||||
cpt2_type = _get_cpt_type(++pos);
|
||||
cpt2 = _get_cpt(pos);
|
||||
}
|
||||
_add_token(pos);
|
||||
continue;
|
||||
}
|
||||
|
||||
size_t num_whitespaces = 0;
|
||||
while (_get_cpt_type(pos+num_whitespaces) == CODEPOINT_TYPE_SEPARATOR) {
|
||||
while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) {
|
||||
num_whitespaces++;
|
||||
}
|
||||
|
||||
|
@ -424,13 +426,14 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
|
|||
}
|
||||
|
||||
// regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
|
||||
char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt);
|
||||
int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type);
|
||||
if (cpt2_type != CODEPOINT_TYPE_SEPARATOR && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
|
||||
if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
|
||||
pos += (cpt == ' ');
|
||||
while (cpt2_type != CODEPOINT_TYPE_SEPARATOR && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
|
||||
while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
|
||||
cpt2_type = _get_cpt_type(++pos);
|
||||
cpt2 = _get_cpt(pos);
|
||||
}
|
||||
char32_t cpt2 = _get_cpt(pos);
|
||||
while (cpt2 == '\r' || cpt2 == '\n') {
|
||||
cpt2 = _get_cpt(++pos);
|
||||
}
|
||||
|
@ -440,7 +443,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
|
|||
|
||||
size_t num_whitespaces = 0;
|
||||
size_t last_end_r_or_n = 0;
|
||||
while (_get_cpt_type(pos+num_whitespaces) == CODEPOINT_TYPE_SEPARATOR) {
|
||||
while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) {
|
||||
char32_t cpt2 = _get_cpt(pos+num_whitespaces);
|
||||
if (cpt2 == '\r' || cpt2 == '\n') {
|
||||
last_end_r_or_n = pos + num_whitespaces + 1;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue