starcoder : fix pre-tokenizer

This commit is contained in:
Georgi Gerganov 2024-05-02 11:00:15 +03:00
parent 7053b261ab
commit cf00fe1ea3
No known key found for this signature in database
GPG key ID: BF970631944C16B7
22 changed files with 26 additions and 20 deletions

View file

@ -189,6 +189,7 @@ print("\n")
# generate tests for each tokenizer model
tests = [
"ied 4 ½ months",
"",
" ",
" ",

View file

@ -12235,6 +12235,11 @@ struct llm_tokenizer_bpe {
});
break;
case LLAMA_VOCAB_PRE_TYPE_STARCODER:
word_collection = unicode_regex_split(text, {
"\\p{N}",
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
});
break;
case LLAMA_VOCAB_PRE_TYPE_GPT2:
word_collection = unicode_regex_split(text, {
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",

View file

@ -1,4 +1,4 @@
ied 4 ½ months
__ggml_vocab_test__
__ggml_vocab_test__

View file

@ -1,4 +1,4 @@
29464 2094 1018 1092 2706

View file

@ -1,4 +1,4 @@
ied 4 ½ months
__ggml_vocab_test__
__ggml_vocab_test__

View file

@ -1,4 +1,4 @@
1050 207 19 207 19192 4217
207
243
315

View file

@ -1,4 +1,4 @@
ied 4 ½ months
__ggml_vocab_test__
__ggml_vocab_test__

View file

@ -1,4 +1,4 @@
1052 207 19 207 19109 4223
207
243
300

View file

@ -1,4 +1,4 @@
ied 4 ½ months
__ggml_vocab_test__
__ggml_vocab_test__

View file

@ -1,4 +1,4 @@
878 204 31 3068 133 2137
204
258
466

View file

@ -1,4 +1,4 @@
ied 4 ½ months
__ggml_vocab_test__
__ggml_vocab_test__

View file

@ -1,4 +1,4 @@
798 604 25208 1933
220
220 220
220 220 220

View file

@ -1,4 +1,4 @@
ied 4 ½ months
__ggml_vocab_test__
__ggml_vocab_test__

View file

@ -1,4 +1,4 @@
1142 220 19 220 27154 4038
220
256
262

View file

@ -1,4 +1,4 @@
ied 4 ½ months
__ggml_vocab_test__
__ggml_vocab_test__

View file

@ -1,4 +1,4 @@
474 287 29871 29946 29871 30226 7378
259
1678
268

View file

@ -1,4 +1,4 @@
ied 4 ½ months
__ggml_vocab_test__
__ggml_vocab_test__

View file

@ -1,4 +1,4 @@
728 577 24142 2607
209
50276
50275

View file

@ -1,4 +1,4 @@
ied 4 ½ months
__ggml_vocab_test__
__ggml_vocab_test__

View file

@ -1,4 +1,4 @@
474 287 29871 29946 29871 30226 7378
259
1678
268

View file

@ -1,4 +1,4 @@
ied 4 ½ months
__ggml_vocab_test__
__ggml_vocab_test__

View file

@ -1,4 +1,4 @@
4850 244 57 244 162 159 17722
244
280
283