tests : add test that fails with DeepSeek tokenizers

This commit is contained in:
Georgi Gerganov 2024-05-02 11:46:20 +03:00
parent cf00fe1ea3
commit 3a461dbff3
No known key found for this signature in database
GPG key ID: BF970631944C16B7
22 changed files with 67 additions and 5 deletions

View file

@ -189,7 +189,8 @@ print("\n")
# generate tests for each tokenizer model
tests = [
"ied 4 ½ months"
"ied 4 ½ months",
"Führer",
"",
" ",
" ",

View file

@ -17471,9 +17471,10 @@ int32_t llama_tokenize(
static std::string llama_decode_text(const std::string & text) {
std::string decoded_text;
auto unicode_sequences = unicode_cpts_from_utf8(text);
for (auto & unicode_sequence : unicode_sequences) {
decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(unicode_sequence));
const auto cpts = unicode_cpts_from_utf8(text);
for (const auto cpt : cpts) {
decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(cpt));
}
return decoded_text;

View file

@ -1,4 +1,8 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__

View file

@ -1,4 +1,6 @@
29464 2094 1018 1092 2706
11865 17875

View file

@ -1,4 +1,8 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__

View file

@ -1,4 +1,6 @@
1050 207 19 207 19192 4217
37 32009 71 6247
207
243
315

View file

@ -1,4 +1,8 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__

View file

@ -1,4 +1,6 @@
1052 207 19 207 19109 4223
37 100014 71 6245
207
243
300

View file

@ -1,4 +1,8 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__

View file

@ -1,4 +1,6 @@
878 204 31 3068 133 2137
28611 132 30042
204
258
466

View file

@ -1,4 +1,8 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__

View file

@ -1,4 +1,6 @@
798 604 25208 1933
37 9116 71 11751
220
220 220
220 220 220

View file

@ -1,4 +1,8 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__

View file

@ -1,4 +1,6 @@
1142 220 19 220 27154 4038
37 51853 261
220
256
262

View file

@ -1,4 +1,8 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__

View file

@ -1,4 +1,6 @@
474 287 29871 29946 29871 30226 7378
383 4000 261
259
1678
268

View file

@ -1,4 +1,8 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__

View file

@ -1,4 +1,6 @@
728 577 24142 2607
39 26288 6554
209
50276
50275

View file

@ -1,4 +1,8 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__

View file

@ -1,4 +1,6 @@
474 287 29871 29946 29871 30226 7378
383 4000 261
259
1678
268

View file

@ -1,4 +1,8 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__

View file

@ -1,4 +1,6 @@
4850 244 57 244 162 159 17722
75 2022 3943 284
244
280
283