vocab : refactor tokenizer to reduce init overhead (#9449)

* refactor tokenizer

* llama : make llm_tokenizer more private

ggml-ci

* refactor tokenizer

* refactor tokenizer

* llama : make llm_tokenizer more private

ggml-ci

* remove unused files

* remove unused fields to avoid unused-field build error

* avoid symbol linking error

* Update src/llama.cpp

* Update src/llama.cpp

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Author: Zhenwei Jin, 2024-09-28 20:10:58 +08:00, committed by GitHub
parent 9a913110cf
commit 6102037bbb
5 changed files with 238 additions and 141 deletions


@@ -8,6 +8,8 @@
 #include <map>
 #include <set>
 
+struct llm_tokenizer;
+
 struct llama_vocab {
     using id    = llama_token;
     using token = std::string;
@@ -65,7 +67,14 @@ struct llama_vocab {
 
     std::vector<char> precompiled_charsmap;
 
+    llm_tokenizer * tokenizer = nullptr;
+
+    llama_vocab() = default;
+    ~llama_vocab();
+
     int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
+
+    void init_tokenizer();
 };
 
 //
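
For illustration, below is a minimal, self-contained sketch of the pattern this header change introduces: the public header only forward-declares llm_tokenizer, and llama_vocab owns a single tokenizer instance that init_tokenizer() builds once, so later tokenization calls can reuse it instead of constructing a fresh tokenizer each time (the init overhead the commit title refers to). Only the names taken from the diff above (llm_tokenizer, llama_vocab, tokenizer, init_tokenizer, the destructor) come from the actual change; the simplified token table, the tokenizer body, and main() are assumptions made for the sketch, not the real llama.cpp implementation.

#include <cstddef>
#include <cstdio>
#include <string>
#include <vector>

// What the header exposes: only a forward declaration of the tokenizer type.
struct llm_tokenizer;

struct llama_vocab {
    std::vector<std::string> id_to_token;   // simplified stand-in for the real token table

    llm_tokenizer * tokenizer = nullptr;    // owned, built once by init_tokenizer()

    llama_vocab() = default;
    ~llama_vocab();                         // out-of-line: deletes the tokenizer where its type is complete

    void init_tokenizer();
};

// What would normally live in the implementation file, hidden from header users.
struct llm_tokenizer {
    // hypothetical precomputed state (merge ranks, tries, ...) that would otherwise
    // be rebuilt on every tokenization call
    explicit llm_tokenizer(const llama_vocab & vocab) : n_tokens(vocab.id_to_token.size()) {}

    std::size_t n_tokens;
};

llama_vocab::~llama_vocab() {
    delete tokenizer;                        // note: the sketch does not handle copies of llama_vocab
}

void llama_vocab::init_tokenizer() {
    if (tokenizer == nullptr) {
        tokenizer = new llm_tokenizer(*this);   // pay the construction cost exactly once
    }
}

int main() {
    llama_vocab vocab;
    vocab.id_to_token = { "<s>", "hello", "world" };

    vocab.init_tokenizer();                  // one-time setup after the vocab is loaded

    // later tokenize calls would reuse vocab.tokenizer instead of building a new one
    std::printf("tokenizer ready, %zu tokens known\n", vocab.tokenizer->n_tokens);
    return 0;
}

Keeping the tokenizer behind an opaque pointer also means users of the header never need the tokenizer's definition, which is what "make llm_tokenizer more private" amounts to: the concrete tokenizer presumably lives in the implementation file, and the out-of-line destructor is the usual way to make the delete compile where the type is complete.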