tests : add test that fails with DeepSeek tokenizers

This commit is contained in:
Georgi Gerganov 2024-05-02 11:46:20 +03:00
parent cf00fe1ea3
commit 3a461dbff3
No known key found for this signature in database
GPG key ID: BF970631944C16B7
22 changed files with 67 additions and 5 deletions

View file

@ -189,7 +189,8 @@ print("\n")
# generate tests for each tokenizer model # generate tests for each tokenizer model
tests = [ tests = [
"ied 4 ½ months" "ied 4 ½ months",
"Führer",
"", "",
" ", " ",
" ", " ",

View file

@ -17471,9 +17471,10 @@ int32_t llama_tokenize(
static std::string llama_decode_text(const std::string & text) { static std::string llama_decode_text(const std::string & text) {
std::string decoded_text; std::string decoded_text;
auto unicode_sequences = unicode_cpts_from_utf8(text);
for (auto & unicode_sequence : unicode_sequences) { const auto cpts = unicode_cpts_from_utf8(text);
decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(unicode_sequence)); for (const auto cpt : cpts) {
decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(cpt));
} }
return decoded_text; return decoded_text;

View file

@ -1,4 +1,8 @@
ied 4 ½ months ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__ __ggml_vocab_test__
__ggml_vocab_test__ __ggml_vocab_test__

View file

@ -1,4 +1,6 @@
29464 2094 1018 1092 2706 29464 2094 1018 1092 2706
11865 17875

View file

@ -1,4 +1,8 @@
ied 4 ½ months ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__ __ggml_vocab_test__
__ggml_vocab_test__ __ggml_vocab_test__

View file

@ -1,4 +1,6 @@
1050 207 19 207 19192 4217 1050 207 19 207 19192 4217
37 32009 71 6247
207 207
243 243
315 315

View file

@ -1,4 +1,8 @@
ied 4 ½ months ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__ __ggml_vocab_test__
__ggml_vocab_test__ __ggml_vocab_test__

View file

@ -1,4 +1,6 @@
1052 207 19 207 19109 4223 1052 207 19 207 19109 4223
37 100014 71 6245
207 207
243 243
300 300

View file

@ -1,4 +1,8 @@
ied 4 ½ months ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__ __ggml_vocab_test__
__ggml_vocab_test__ __ggml_vocab_test__

View file

@ -1,4 +1,6 @@
878 204 31 3068 133 2137 878 204 31 3068 133 2137
28611 132 30042
204 204
258 258
466 466

View file

@ -1,4 +1,8 @@
ied 4 ½ months ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__ __ggml_vocab_test__
__ggml_vocab_test__ __ggml_vocab_test__

View file

@ -1,4 +1,6 @@
798 604 25208 1933 798 604 25208 1933
37 9116 71 11751
220 220
220 220 220 220
220 220 220 220 220 220

View file

@ -1,4 +1,8 @@
ied 4 ½ months ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__ __ggml_vocab_test__
__ggml_vocab_test__ __ggml_vocab_test__

View file

@ -1,4 +1,6 @@
1142 220 19 220 27154 4038 1142 220 19 220 27154 4038
37 51853 261
220 220
256 256
262 262

View file

@ -1,4 +1,8 @@
ied 4 ½ months ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__ __ggml_vocab_test__
__ggml_vocab_test__ __ggml_vocab_test__

View file

@ -1,4 +1,6 @@
474 287 29871 29946 29871 30226 7378 474 287 29871 29946 29871 30226 7378
383 4000 261
259 259
1678 1678
268 268

View file

@ -1,4 +1,8 @@
ied 4 ½ months ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__ __ggml_vocab_test__
__ggml_vocab_test__ __ggml_vocab_test__

View file

@ -1,4 +1,6 @@
728 577 24142 2607 728 577 24142 2607
39 26288 6554
209 209
50276 50276
50275 50275

View file

@ -1,4 +1,8 @@
ied 4 ½ months ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__ __ggml_vocab_test__
__ggml_vocab_test__ __ggml_vocab_test__

View file

@ -1,4 +1,6 @@
474 287 29871 29946 29871 30226 7378 474 287 29871 29946 29871 30226 7378
383 4000 261
259 259
1678 1678
268 268

View file

@ -1,4 +1,8 @@
ied 4 ½ months ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__ __ggml_vocab_test__
__ggml_vocab_test__ __ggml_vocab_test__

View file

@ -1,4 +1,6 @@
4850 244 57 244 162 159 17722 4850 244 57 244 162 159 17722
75 2022 3943 284
244 244
280 280
283 283