From 39c9a3b55339b4167de1b27ab2c9e0b6ba686e6d Mon Sep 17 00:00:00 2001 From: goerch Date: Mon, 24 Jul 2023 10:20:25 +0200 Subject: [PATCH] Added test cases --- tests/test-tokenizer-0.cpp | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp index dca3e72c7..9d7b5b348 100644 --- a/tests/test-tokenizer-0.cpp +++ b/tests/test-tokenizer-0.cpp @@ -17,19 +17,28 @@ static std::string unescape_whitespace(llama_context* ctx, const llama_token* to static const std::map<std::string, std::vector<llama_token>> & k_tests() { static std::map<std::string, std::vector<llama_token>> _k_tests = { - { " ", {1, 259, }, }, - { "\t", { 1, 29871, 12, }, }, - { "\n", { 1, 29871, 13, }, }, - { "\t\n", { 1, 29871, 12, 13, }, }, - { "Hello world", { 1, 15043, 3186, }, }, - { " Hello world", { 1, 29871, 15043, 3186, }, }, - { "Hello World", { 1, 15043, 2787, }, }, - { " Hello World", { 1, 29871, 15043, 2787, }, }, - {" Hello World!", { 1, 29871, 15043, 2787, 29991, }, }, - {" this is 🦙.cpp", { 1, 29871, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, }, - {"w048 7tuijk dsdfhu", { 1, 281, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, }, - {"нещо на Български", { 1, 1538, 4851, 665, 1386, 29713, 1305, }, }, - {"How are you?", { 1, 1128, 526, 366, 29973, }, }, + { " ", {1, 259, }, }, + { "\t", { 1, 29871, 12, }, }, + { "\n", { 1, 29871, 13, }, }, + { "\t\n", { 1, 29871, 12, 13, }, }, + { "Hello world", { 1, 15043, 3186, }, }, + { " Hello world", { 1, 29871, 15043, 3186, }, }, + { "Hello World", { 1, 15043, 2787, }, }, + { " Hello World", { 1, 29871, 15043, 2787, }, }, + { " Hello World!", { 1, 29871, 15043, 2787, 29991, }, }, + { " this is 🦙.cpp", { 1, 29871, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, }, + { "w048 7tuijk dsdfhu", { 1, 281, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, }, + { "нещо на Български", { 1, 1538, 4851, 665, 1386, 29713, 1305, }, }, + { "កាន់តែពិសេសអាចខលចេញ", { 1, 29871, 31849, 
31324, 31934, 228, 162, 142, 228, 161, + 146, 228, 162, 133, 228, 161, 153, 228, 161, 186, + 31708, 228, 162, 132, 31708, 228, 161, 165, 31324, 228, + 161, 136, 228, 161, 132, 228, 161, 158, 228, 161, + 136, 228, 162, 132, 228, 161, 140, }, }, + { "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", + { 1, 29871, 243, 162, 157, 131, 313, 8945, 29897, 29871, + 243, 162, 155, 185, 30722, 243, 162, 143, 174, 30598, + 313, 20787, 953, 3848, 275, 16125, 630, 29897, 29871, 31681, + 313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, }, }, }; return _k_tests; };