sentencepiece bpe compatible tokenizer (#252)
* potential out of bounds read
* fix quantize
* style
* Update convert-pth-to-ggml.py
* mild cleanup
* don't need the space-prefixing here rn since main.cpp already does it
* new file magic + version header field (sketched below)
* readme notice
* missing newlines

Co-authored-by: slaren <2141330+slaren@users.noreply.github.com>
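For the "new file magic + version header field" item, here is a minimal, hypothetical sketch of how a loader might check such a header. It is not taken from this commit; the constant values and the check_header helper are illustrative assumptions only, and the real magic/version constants live in the repository's loaders.

#include <cstdint>
#include <cstdio>

// Illustrative values only; the actual magic and version used by the
// converted model files are defined in the repository, not here.
static const uint32_t EXAMPLE_FILE_MAGIC   = 0x67676d66; // hypothetical
static const uint32_t EXAMPLE_FILE_VERSION = 1;          // hypothetical

// Read and validate a 4-byte magic followed by a 4-byte version field.
static bool check_header(std::FILE * f) {
    uint32_t magic   = 0;
    uint32_t version = 0;
    if (std::fread(&magic, sizeof(magic), 1, f) != 1) return false;
    if (magic != EXAMPLE_FILE_MAGIC) {
        std::fprintf(stderr, "bad magic: possibly an old, unversioned model file\n");
        return false;
    }
    if (std::fread(&version, sizeof(version), 1, f) != 1) return false;
    if (version != EXAMPLE_FILE_VERSION) {
        std::fprintf(stderr, "unsupported file version %u\n", version);
        return false;
    }
    return true;
}

int main(int argc, char ** argv) {
    if (argc < 2) {
        std::fprintf(stderr, "usage: %s model.bin\n", argv[0]);
        return 1;
    }
    std::FILE * f = std::fopen(argv[1], "rb");
    if (!f) {
        std::fprintf(stderr, "failed to open %s\n", argv[1]);
        return 1;
    }
    const bool ok = check_header(f);
    std::fclose(f);
    return ok ? 0 : 1;
}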
parent 5cb63e2493
commit 074bea2eb1
7 changed files with 180 additions and 44 deletions
utils.h (3 changed lines: 2 additions, 1 deletion)

@@ -58,6 +58,7 @@ struct gpt_vocab {
     std::map<token, id> token_to_id;
     std::map<id, token> id_to_token;
+    std::map<id, float> score;
 };
 
 void replace(std::string & str, const std::string & needle, const std::string & replacement);
 
@@ -79,7 +80,7 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
 
 // TODO: this is probably wrong, but I cannot figure out how this tokenizer works ..
 // ref: https://github.com/google/sentencepiece
-std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos);
+std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, std::string_view text, bool bos);
 
 // load the tokens from encoder.json
 bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
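To make the declarations above concrete, here is a minimal calling sketch (not part of the commit). It assumes this tree's utils.h/utils.cpp are compiled in; the vocabulary file path is hypothetical, and in the real programs the llama vocabulary (including the new per-token score map) is filled in when the converted model file is loaded rather than by a snippet like this.

#include <cstdio>
#include <string_view>
#include <vector>

#include "utils.h"

int main() {
    gpt_vocab vocab;

    // Hypothetical vocabulary file; per the comment in utils.h above,
    // gpt_vocab_init loads the token maps from an encoder.json-style file.
    if (!gpt_vocab_init("vocab/encoder.json", vocab)) {
        std::fprintf(stderr, "failed to load vocab\n");
        return 1;
    }

    // The updated declaration takes a std::string_view; bos = true asks for
    // a leading beginning-of-sentence token.
    std::vector<gpt_vocab::id> tokens = llama_tokenize(vocab, std::string_view("Hello world"), true);

    for (gpt_vocab::id id : tokens) {
        std::printf("%d -> '%s'\n", id, vocab.id_to_token.at(id).c_str());
    }

    return 0;
}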