Add llama_tokens_to_string() to utils.cpp

- Also add a single-token converter
Thomas Antony 2023-03-17 18:46:27 -07:00
parent 912e6246d6
commit 05224ed472
2 changed files with 15 additions and 0 deletions

utils.cpp

@@ -347,6 +347,16 @@ std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::st
    return res;
}

std::string llama_tokens_to_string(const gpt_vocab & vocab, const std::vector<gpt_vocab::id> & tokens) {
    std::string res;
    for (auto t : tokens) {
        res += vocab.id_to_token.at(t);
    }
    return res;
}

std::string single_llama_token_to_string(const gpt_vocab & vocab, const gpt_vocab::id & tokens) {
    return vocab.id_to_token.at(tokens);
}

bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
    printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());
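Taken together with llama_tokenize(), the new helper allows a round trip from text to token ids and back. A minimal sketch of how it might be exercised, assuming this repo's utils.h and the signatures shown in the hunk above; the vocab file path is hypothetical:

// round_trip.cpp - detokenization sketch built on utils.h
#include "utils.h"

#include <cstdio>
#include <string>
#include <vector>

int main() {
    gpt_vocab vocab;
    if (!gpt_vocab_init("models/vocab.json", vocab)) {  // hypothetical vocab file
        return 1;
    }

    // tokenize (without BOS), then map the ids straight back to text
    std::vector<gpt_vocab::id> ids = llama_tokenize(vocab, "Hello world", false);
    std::string text = llama_tokens_to_string(vocab, ids);
    printf("%s\n", text.c_str());
    return 0;
}

Note that both converters go through vocab.id_to_token.at(), so an id missing from the map throws std::out_of_range rather than failing silently.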

utils.h

@@ -81,6 +81,11 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
// ref: https://github.com/google/sentencepiece
std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos);
// convert tokens to string
// opposite of llama_tokenize
std::string llama_tokens_to_string(const gpt_vocab & vocab, const std::vector<gpt_vocab::id> & tokens);
inline std::string single_llama_token_to_string(const gpt_vocab & vocab, const gpt_vocab::id & tokens);
// load the tokens from encoder.json
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
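The single-token variant fits per-token streaming of generated output. A sketch under the same assumptions as above (the caller that produces the ids is not part of this commit):

#include "utils.h"

#include <cstdio>
#include <vector>

// print generated ids in order, as they would appear in the output stream
void print_tokens(const gpt_vocab & vocab, const std::vector<gpt_vocab::id> & ids) {
    for (const gpt_vocab::id & id : ids) {
        // single_llama_token_to_string() is a plain id -> token lookup
        printf("%s", single_llama_token_to_string(vocab, id).c_str());
    }
    fflush(stdout);
}

One caveat: the header declares single_llama_token_to_string() as inline while its only definition lives in utils.cpp, so calls from other translation units can fail to link; dropping the inline keyword or moving the body into the header keeps the two consistent.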