llama : use new pre-tokenizer type

This commit is contained in:
Georgi Gerganov 2024-04-26 20:08:28 +03:00
parent 9b4d63ae53
commit 43e12ce8e5
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
12 changed files with 87 additions and 44 deletions

View file

@ -41,13 +41,12 @@ llama_test(test-quantize-perf.cpp)
llama_test(test-sampling.cpp)
llama_test(test-chat-template.cpp)

# Tokenizer tests.
# Each call names a test source file, gives the test an explicit NAME, and
# passes the path of a vocabulary file under ../models as the test argument
# (ARGS). All listed tokenizer tests, including LLaMA v3 and both Deepseek
# variants, are registered.
llama_test(test-tokenizer-0-llama.cpp NAME test-tokenizer-0-llama ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
llama_test(test-tokenizer-0-llama-v3.cpp NAME test-tokenizer-0-llama-v3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-v3.gguf)
llama_test(test-tokenizer-0-falcon.cpp NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
llama_test(test-tokenizer-0-deepseek-coder.cpp NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
llama_test(test-tokenizer-0-deepseek-llm.cpp NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
llama_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-llama ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
# NOTE(review): the baichuan test reuses test-tokenizer-1-llama.cpp with a
# different vocab file - presumably intentional; confirm before changing.
llama_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)

View file

@ -27,6 +27,8 @@ tests = [
" ",
"\t",
"\n",
"\n\n",
"\n\n\n",
"\t\n",
"Hello world",
" Hello world",

View file

@ -17,6 +17,8 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
{ " " , { 466, }, },
{ "\t" , { 192, }, },
{ "\n" , { 193, }, },
{ "\n\n" , { 1001, }, },
{ "\n\n\n" , { 11331, }, },
{ "\t\n" , { 19125, }, },
{ "Hello world" , { 9856, 1079, }, },
{ " Hello world" , { 23090, 1079, }, },

View file

@ -17,6 +17,8 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
{ " " , { 262, }, },
{ "\t" , { 197, }, },
{ "\n" , { 198, }, },
{ "\n\n" , { 271, }, },
{ "\n\n\n" , { 1432, }, },
{ "\t\n" , { 1602, }, },
{ "Hello world" , { 9906, 1917, }, },
{ " Hello world" , { 22691, 1917, }, },

View file

@ -17,6 +17,8 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
{ " " , { 268, }, },
{ "\t" , { 29871, 12, }, },
{ "\n" , { 29871, 13, }, },
{ "\n\n" , { 29871, 13, 13, }, },
{ "\n\n\n" , { 29871, 13, 13, 13, }, },
{ "\t\n" , { 29871, 12, 13, }, },
{ "Hello world" , { 15043, 3186, }, },
{ " Hello world" , { 29871, 15043, 3186, }, },

View file

@ -27,6 +27,8 @@ tests = [
" ",
"\t",
"\n",
"\n\n",
"\n\n\n",
"\t\n",
"Hello world",
" Hello world",