vocab : refactor tokenizer to reduce init overhead (#9449)
* refactor tokenizer * llama : make llm_tokenizer more private ggml-ci * refactor tokenizer * refactor tokenizer * llama : make llm_tokenizer more private ggml-ci * remove unused files * remove unused fileds to avoid unused filed build error * avoid symbol link error * Update src/llama.cpp * Update src/llama.cpp --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
9a913110cf
commit
6102037bbb
5 changed files with 238 additions and 141 deletions
|
@ -8,6 +8,8 @@
|
|||
#include <map>
|
||||
#include <set>
|
||||
|
||||
struct llm_tokenizer;
|
||||
|
||||
struct llama_vocab {
|
||||
using id = llama_token;
|
||||
using token = std::string;
|
||||
|
@ -65,7 +67,14 @@ struct llama_vocab {
|
|||
|
||||
std::vector<char> precompiled_charsmap;
|
||||
|
||||
llm_tokenizer * tokenizer = nullptr;
|
||||
|
||||
llama_vocab() = default;
|
||||
~llama_vocab();
|
||||
|
||||
int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
|
||||
|
||||
void init_tokenizer();
|
||||
};
|
||||
|
||||
//
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue