diff --git a/tests/test-tokenizer-0-falcon.cpp b/tests/test-tokenizer-0-falcon.cpp index aabac5bb0..3e8877563 100644 --- a/tests/test-tokenizer-0-falcon.cpp +++ b/tests/test-tokenizer-0-falcon.cpp @@ -36,9 +36,19 @@ static const std::map> & k_tests() { { " Hello" , { 258, 23090, }, }, { " Hello" , { 466, 23090, }, }, { " Hello\n Hello" , { 466, 23090, 742, 23090, }, }, + { " (" , { 204, 19, }, }, { "\n =" , { 1212, 40, }, }, { "' era" , { 18, 4932, }, }, { "Hello, y'all! How are you 😁 ?我想在apple工作1314151天~", { 9856, 23, 291, 18, 436, 12, 1265, 362, 299, 8196, 207, 204, 42, 50087, 123, 2727, 20300, 32022, 133, 234, 17419, 30137, 28, 7858, 181, 133, 236, }, }, + { "3" , { 30, }, }, + { "33" , { 3138, }, }, + { "333" , { 22287, }, }, + { "3333" , { 22287, 30, }, }, + { "33333" , { 22287, 3138, }, }, + { "333333" , { 22287, 22287, }, }, + { "3333333" , { 22287, 22287, 30, }, }, + { "33333333" , { 22287, 22287, 3138, }, }, + { "333333333" , { 22287, 22287, 22287, }, }, }; return _k_tests; diff --git a/tests/test-tokenizer-0-falcon.py b/tests/test-tokenizer-0-falcon.py index b99840e1b..b1cf43b50 100644 --- a/tests/test-tokenizer-0-falcon.py +++ b/tests/test-tokenizer-0-falcon.py @@ -39,9 +39,19 @@ tests = [ " Hello", " Hello", " Hello\n Hello", + " (", "\n =", "' era", "Hello, y'all! How are you 😁 ?我想在apple工作1314151天~", + "3", + "33", + "333", + "3333", + "33333", + "333333", + "3333333", + "33333333", + "333333333", ] for text in tests: diff --git a/tests/test-tokenizer-0-llama.cpp b/tests/test-tokenizer-0-llama.cpp index 0a16cd7eb..e6b13ab91 100644 --- a/tests/test-tokenizer-0-llama.cpp +++ b/tests/test-tokenizer-0-llama.cpp @@ -36,7 +36,19 @@ static const std::map> & k_tests() { { " Hello" , { 1678, 15043, }, }, { " Hello" , { 268, 15043, }, }, { " Hello\n Hello" , { 268, 15043, 13, 1678, 15043, }, }, - { " (" , { 29871, 313, }, }, + { " (" , { 29871, 313, }, }, + { "\n =" , { 29871, 13, 353, }, }, + { "' era" , { 525, 3152, }, }, + { "Hello, y'all! How are you 😁 ?我想在apple工作1314151天~", { 15043, 29892, 343, 29915, 497, 29991, 1128, 526, 366, 29871, 243, 162, 155, 132, 1577, 30672, 31522, 30505, 11548, 31041, 30732, 29896, 29941, 29896, 29946, 29896, 29945, 29896, 30408, 30739, }, }, + { "3" , { 29871, 29941, }, }, + { "33" , { 29871, 29941, 29941, }, }, + { "333" , { 29871, 29941, 29941, 29941, }, }, + { "3333" , { 29871, 29941, 29941, 29941, 29941, }, }, + { "33333" , { 29871, 29941, 29941, 29941, 29941, 29941, }, }, + { "333333" , { 29871, 29941, 29941, 29941, 29941, 29941, 29941, }, }, + { "3333333" , { 29871, 29941, 29941, 29941, 29941, 29941, 29941, 29941, }, }, + { "33333333" , { 29871, 29941, 29941, 29941, 29941, 29941, 29941, 29941, 29941, }, }, + { "333333333" , { 29871, 29941, 29941, 29941, 29941, 29941, 29941, 29941, 29941, 29941, }, }, }; return _k_tests; diff --git a/tests/test-tokenizer-0-llama.py b/tests/test-tokenizer-0-llama.py index f3d4d7e3d..6dbfd9412 100644 --- a/tests/test-tokenizer-0-llama.py +++ b/tests/test-tokenizer-0-llama.py @@ -39,6 +39,19 @@ tests = [ " Hello", " Hello", " Hello\n Hello", + " (", + "\n =", + "' era", + "Hello, y'all! How are you 😁 ?我想在apple工作1314151天~", + "3", + "33", + "333", + "3333", + "33333", + "333333", + "3333333", + "33333333", + "333333333", ]