From 2ec25dbf27aa21053ee8db417595de734a7dcfe7 Mon Sep 17 00:00:00 2001
From: zhenweijin
Date: Wed, 11 Sep 2024 09:42:55 +0800
Subject: [PATCH] refactor tokenizer

---
 tests/CMakeLists.txt              |  19 ++++
 tests/test-tokenizer-parallel.cpp | 180 ++++++++++++++++++++++++++++++
 2 files changed, 199 insertions(+)
 create mode 100644 tests/test-tokenizer-parallel.cpp

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 08ad66b49..8f991477c 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -84,6 +84,25 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2 ARGS ${CMAKE
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact    ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
 
+# build test-tokenizer-parallel target once and add many tests
+add_executable(test-tokenizer-parallel test-tokenizer-parallel.cpp)
+target_link_libraries(test-tokenizer-parallel PRIVATE common)
+install(TARGETS test-tokenizer-parallel RUNTIME)
+
+llama_test(test-tokenizer-parallel NAME test-tokenizer-parallel-bert-bge       ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bert-bge.gguf)
+llama_test(test-tokenizer-parallel NAME test-tokenizer-parallel-command-r      ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf)
+llama_test(test-tokenizer-parallel NAME test-tokenizer-parallel-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
+llama_test(test-tokenizer-parallel NAME test-tokenizer-parallel-deepseek-llm   ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
+llama_test(test-tokenizer-parallel NAME test-tokenizer-parallel-falcon         ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
+llama_test(test-tokenizer-parallel NAME test-tokenizer-parallel-gpt-2          ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
+llama_test(test-tokenizer-parallel NAME test-tokenizer-parallel-llama-bpe      ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
+llama_test(test-tokenizer-parallel NAME test-tokenizer-parallel-llama-spm      ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
+llama_test(test-tokenizer-parallel NAME test-tokenizer-parallel-mpt            ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
+llama_test(test-tokenizer-parallel NAME test-tokenizer-parallel-phi-3          ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-phi-3.gguf)
+llama_test(test-tokenizer-parallel NAME test-tokenizer-parallel-qwen2          ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-qwen2.gguf)
+llama_test(test-tokenizer-parallel NAME test-tokenizer-parallel-refact         ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
+llama_test(test-tokenizer-parallel NAME test-tokenizer-parallel-starcoder      ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
+
 # build test-tokenizer-1-bpe target once and add many tests
 add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp)
 target_link_libraries(test-tokenizer-1-bpe PRIVATE common)
diff --git a/tests/test-tokenizer-parallel.cpp b/tests/test-tokenizer-parallel.cpp
new file mode 100644
index 000000000..cead81079
--- /dev/null
+++ b/tests/test-tokenizer-parallel.cpp
@@ -0,0 +1,180 @@
+#include "llama.h"
+#include "common.h"
+#include "console.h"
+
+#include <cstdio>
+#include <string>
+#include <map>
+#include <vector>
+#include <fstream>
+#include <thread>
+
+using llama_tests = std::map<std::string, std::vector<llama_token>>;
+
+static llama_tests read_tests(const std::string & fname_inp, const std::string & fname_out) {
+    llama_tests tests;
+
+    std::ifstream ifs_inp(fname_inp);
+    if (!ifs_inp) {
+        fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_inp.c_str());
+        return tests;
+    }
+
+    std::string sraw((std::istreambuf_iterator<char>(ifs_inp)), std::istreambuf_iterator<char>());
+
+    std::ifstream ifs_out(fname_out);
+    if (!ifs_out) {
+        fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
+        return tests;
+    }
+
+    std::vector<std::string> sout;
+    for (std::string line; std::getline(ifs_out, line);) {
+        sout.push_back(line);
+    }
+
+    const std::string sep = "\n__ggml_vocab_test__\n";
+
+    std::vector<std::string> sinp;
+
+    size_t pos = 0;
+    while (pos < sraw.size()) {
+        const size_t next = sraw.find(sep, pos);
+        if (next == std::string::npos) {
+            sinp.push_back(sraw.substr(pos));
+            break;
+        }
+        sinp.push_back(sraw.substr(pos, next - pos));
+        pos = next + sep.size();
+    }
+
+    if (sinp.size() != sout.size()) {
+        fprintf(stderr, "%s : error: input and output files have different number of tests\n", __func__);
+        return tests;
+    }
+
+    for (size_t i = 0; i < sinp.size(); ++i) {
+        const std::string & s = sinp[i];
+        const std::string & o = string_strip(sout[i]);
+
+        std::vector<llama_token> toks;
+
+        size_t pos = 0;
+        while (pos < o.size()) {
+            size_t next = o.find(' ', pos);
+            if (next == std::string::npos) {
+                next = o.size();
+            }
+            const std::string stok = o.substr(pos, next - pos);
+            toks.push_back(std::stoi(stok));
+            pos = next + 1;
+        }
+
+        tests[s] = toks;
+    }
+
+    return tests;
+}
+
+int main(int argc, char const *argv[]) {
+
+    if (argc < 2) {
+        fprintf(stderr, "Usage: %s vocab-file\n", argv[0]);
+        return 1;
+    }
+
+    const std::string fname = argv[1];
+
+    const std::string fname_inp = fname + ".inp";
+    const std::string fname_out = fname + ".out";
+
+    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
+
+    llama_model * model;
+    llama_context * ctx;
+
+    llama_backend_init();
+
+    // load the vocab
+    {
+        auto mparams = llama_model_default_params();
+
+        mparams.vocab_only = true;
+
+        model = llama_load_model_from_file(fname.c_str(), mparams);
+
+        if (model == NULL) {
+            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
+            return 1;
+        }
+
+        auto cparams = llama_context_default_params();
+
+        ctx = llama_new_context_with_model(model, cparams);
+
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
+            llama_free_model(model);
+            return 1;
+        }
+    }
+
+#ifdef _WIN32
+    // We need this for unicode console support
+    console::init(false, false);
+    atexit([]() { console::cleanup(); });
+#endif
+
+    const int nthread = std::thread::hardware_concurrency();
+    std::vector<std::thread> threads(nthread);
+
+    bool success = true;
+
+    // load the test prompts and their expected token ids
+    const auto k_tests = [&]() -> llama_tests {
+        const auto res = read_tests(fname_inp, fname_out);
+
+        if (res.empty()) {
+            fprintf(stderr, "%s : error: no tests found\n", __func__);
+            exit(1);
+        }
+
+        return res;
+    }();
+
+    const bool add_special = false;
+
+    // every thread tokenizes the full test set; results must not depend on which thread runs
+    for (int i = 0; i < nthread; i++) {
+        threads[i] = std::thread([&]() {
+            for (const auto & test_kv : k_tests) {
+                const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, add_special, false);
+
+                bool correct = res.size() == test_kv.second.size();
+                for (int i = 0; i < (int) res.size() && correct; ++i) {
+                    if (test_kv.second[i] != res[i]) {
+                        correct = false;
+                    }
+                }
+
+                if (!correct) {
+                    success = false;
+                }
+            }
+        });
+    }
+
+    for (int i = 0; i < nthread; i++) {
+        threads[i].join();
+    }
+
+    llama_free_model(model);
+    llama_free(ctx);
+
+    llama_backend_free();
+
+    printf("\n");
+    printf("Tests %s\n", success ? "passed" : "failed");
+
+    return success ? 0 : 3;
+}