vocab : refactor tokenizer to reduce init overhead (#9449)

* refactor tokenizer

* llama : make llm_tokenizer more private

ggml-ci

* refactor tokenizer

* refactor tokenizer

* llama : make llm_tokenizer more private

ggml-ci

* remove unused files

* remove unused fields to avoid unused-field build error

* avoid symbol linking error

* Update src/llama.cpp

* Update src/llama.cpp

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Author: Zhenwei Jin, 2024-09-28 20:10:58 +08:00, committed by GitHub
parent 9a913110cf
commit 6102037bbb
5 changed files with 238 additions and 141 deletions


@@ -8,6 +8,8 @@
 #include <map>
 #include <set>
 
+struct llm_tokenizer;
+
 struct llama_vocab {
     using id    = llama_token;
     using token = std::string;
@@ -65,7 +67,14 @@ struct llama_vocab {
 
     std::vector<char> precompiled_charsmap;
 
+    llm_tokenizer * tokenizer = nullptr;
+
+    llama_vocab() = default;
+    ~llama_vocab();
+
     int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
+
+    void init_tokenizer();
 };
 
 //
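
For illustration, below is a minimal, self-contained sketch of the pattern this header change introduces: the public header only forward-declares llm_tokenizer, and llama_vocab owns a single tokenizer instance that init_tokenizer() builds once, so later tokenization calls can reuse it instead of constructing a fresh tokenizer each time (the init overhead the commit title refers to). Only the names taken from the diff above (llm_tokenizer, llama_vocab, tokenizer, init_tokenizer, the destructor) come from the actual change; the simplified token table, the tokenizer body, and main() are assumptions made for the sketch, not the real llama.cpp implementation.

#include <cstddef>
#include <cstdio>
#include <string>
#include <vector>

// What the header exposes: only a forward declaration of the tokenizer type.
struct llm_tokenizer;

struct llama_vocab {
    std::vector<std::string> id_to_token;   // simplified stand-in for the real token table

    llm_tokenizer * tokenizer = nullptr;    // owned, built once by init_tokenizer()

    llama_vocab() = default;
    ~llama_vocab();                         // out-of-line: deletes the tokenizer where its type is complete

    void init_tokenizer();
};

// What would normally live in the implementation file, hidden from header users.
struct llm_tokenizer {
    // hypothetical precomputed state (merge ranks, tries, ...) that would otherwise
    // be rebuilt on every tokenization call
    explicit llm_tokenizer(const llama_vocab & vocab) : n_tokens(vocab.id_to_token.size()) {}

    std::size_t n_tokens;
};

llama_vocab::~llama_vocab() {
    delete tokenizer;                        // note: the sketch does not handle copies of llama_vocab
}

void llama_vocab::init_tokenizer() {
    if (tokenizer == nullptr) {
        tokenizer = new llm_tokenizer(*this);   // pay the construction cost exactly once
    }
}

int main() {
    llama_vocab vocab;
    vocab.id_to_token = { "<s>", "hello", "world" };

    vocab.init_tokenizer();                  // one-time setup after the vocab is loaded

    // later tokenize calls would reuse vocab.tokenizer instead of building a new one
    std::printf("tokenizer ready, %zu tokens known\n", vocab.tokenizer->n_tokens);
    return 0;
}

Keeping the tokenizer behind an opaque pointer also means users of the header never need the tokenizer's definition, which is what "make llm_tokenizer more private" amounts to: the concrete tokenizer presumably lives in the implementation file, and the out-of-line destructor is the usual way to make the delete compile where the type is complete.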