llama : use new pre-tokenizer type

2024-04-26 20:08:28 +03:00 · 2024-04-26 20:08:28 +03:00 · 43e12ce8e5
commit 43e12ce8e5
parent 9b4d63ae53
12 changed files with 87 additions and 44 deletions
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -41,13 +41,12 @@ llama_test(test-quantize-perf.cpp)
 llama_test(test-sampling.cpp)
 llama_test(test-chat-template.cpp)

-# TODO: tmp disabled LLaMA v3 and Deepseek tests
 llama_test(test-tokenizer-0-llama.cpp    NAME test-tokenizer-0-llama                          ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
-#llama_test(test-tokenizer-0-llama-v3.cpp NAME test-tokenizer-0-llama-v3                       ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-v3.gguf)
+llama_test(test-tokenizer-0-llama-v3.cpp NAME test-tokenizer-0-llama-v3                       ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-v3.gguf)
 llama_test(test-tokenizer-0-falcon.cpp   NAME test-tokenizer-0-falcon                         ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)

-#llama_test(test-tokenizer-0-deepseek-coder.cpp NAME test-tokenizer-0-deepseek-coder           ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
-#llama_test(test-tokenizer-0-deepseek-llm.cpp   NAME test-tokenizer-0-deepseek-llm             ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
+llama_test(test-tokenizer-0-deepseek-coder.cpp NAME test-tokenizer-0-deepseek-coder           ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
+llama_test(test-tokenizer-0-deepseek-llm.cpp   NAME test-tokenizer-0-deepseek-llm             ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)

 llama_test(test-tokenizer-1-llama.cpp  NAME test-tokenizer-1-llama                            ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
 llama_test(test-tokenizer-1-llama.cpp  NAME test-tokenizer-1-baichuan                         ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
--- a/tests/test-tokenizer-0-bpe.py
+++ b/tests/test-tokenizer-0-bpe.py
@@ -27,6 +27,8 @@ tests = [
    "   ",
    "\t",
    "\n",
+    "\n\n",
+    "\n\n\n",
    "\t\n",
    "Hello world",
    " Hello world",
--- a/tests/test-tokenizer-0-falcon.cpp
+++ b/tests/test-tokenizer-0-falcon.cpp
@@ -17,6 +17,8 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
        { "   "                   , {     466, }, },
        { "\t"                    , {     192, }, },
        { "\n"                    , {     193, }, },
+        { "\n\n"                  , {    1001, }, },
+        { "\n\n\n"                , {   11331, }, },
        { "\t\n"                  , {   19125, }, },
        { "Hello world"           , {    9856,   1079, }, },
        { " Hello world"          , {   23090,   1079, }, },
--- a/tests/test-tokenizer-0-llama-v3.cpp
+++ b/tests/test-tokenizer-0-llama-v3.cpp
@@ -17,6 +17,8 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
        { "   "                   , {     262, }, },
        { "\t"                    , {     197, }, },
        { "\n"                    , {     198, }, },
+        { "\n\n"                  , {     271, }, },
+        { "\n\n\n"                , {    1432, }, },
        { "\t\n"                  , {    1602, }, },
        { "Hello world"           , {    9906,   1917, }, },
        { " Hello world"          , {   22691,   1917, }, },
--- a/tests/test-tokenizer-0-llama.cpp
+++ b/tests/test-tokenizer-0-llama.cpp
@@ -17,6 +17,8 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
        { "   "                   , {     268, }, },
        { "\t"                    , {   29871,     12, }, },
        { "\n"                    , {   29871,     13, }, },
+        { "\n\n"                  , {   29871,     13,     13, }, },
+        { "\n\n\n"                , {   29871,     13,     13,     13, }, },
        { "\t\n"                  , {   29871,     12,     13, }, },
        { "Hello world"           , {   15043,   3186, }, },
        { " Hello world"          , {   29871,  15043,   3186, }, },
--- a/tests/test-tokenizer-0-spm.py
+++ b/tests/test-tokenizer-0-spm.py
@@ -27,6 +27,8 @@ tests = [
    "   ",
    "\t",
    "\n",
+    "\n\n",
+    "\n\n\n",
    "\t\n",
    "Hello world",
    " Hello world",