Add llama_tokens_to_string() to utils.cpp

- Also add a single-token converter
Thomas Antony 2023-03-17 18:46:27 -07:00
parent 912e6246d6
commit 05224ed472
2 changed files with 15 additions and 0 deletions

utils.cpp

@@ -347,6 +347,16 @@ std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::st
    return res;
}

std::string llama_tokens_to_string(const gpt_vocab & vocab, const std::vector<gpt_vocab::id> & tokens) {
    std::string res;
    for (auto t : tokens) {
        res += vocab.id_to_token.at(t);
    }
    return res;
}

std::string single_llama_token_to_string(const gpt_vocab & vocab, const gpt_vocab::id & tokens) {
    return vocab.id_to_token.at(tokens);
}

bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
    printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());
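Taken together with llama_tokenize(), the new helper allows a round trip from text to token ids and back. A minimal sketch of how it might be exercised, assuming this repo's utils.h and the signatures shown in the hunk above; the vocab file path is hypothetical:

// round_trip.cpp - detokenization sketch built on utils.h
#include "utils.h"

#include <cstdio>
#include <string>
#include <vector>

int main() {
    gpt_vocab vocab;
    if (!gpt_vocab_init("models/vocab.json", vocab)) {  // hypothetical vocab file
        return 1;
    }

    // tokenize (without BOS), then map the ids straight back to text
    std::vector<gpt_vocab::id> ids = llama_tokenize(vocab, "Hello world", false);
    std::string text = llama_tokens_to_string(vocab, ids);
    printf("%s\n", text.c_str());
    return 0;
}

Note that both converters go through vocab.id_to_token.at(), so an id missing from the map throws std::out_of_range rather than failing silently.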

utils.h

@@ -81,6 +81,11 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
// ref: https://github.com/google/sentencepiece
std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos);
// convert tokens to string
// opposite of llama_tokenize
std::string llama_tokens_to_string(const gpt_vocab & vocab, const std::vector<gpt_vocab::id> & tokens);
inline std::string single_llama_token_to_string(const gpt_vocab & vocab, const gpt_vocab::id & tokens);
// load the tokens from encoder.json
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
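The single-token variant fits per-token streaming of generated output. A sketch under the same assumptions as above (the caller that produces the ids is not part of this commit):

#include "utils.h"

#include <cstdio>
#include <vector>

// print generated ids in order, as they would appear in the output stream
void print_tokens(const gpt_vocab & vocab, const std::vector<gpt_vocab::id> & ids) {
    for (const gpt_vocab::id & id : ids) {
        // single_llama_token_to_string() is a plain id -> token lookup
        printf("%s", single_llama_token_to_string(vocab, id).c_str());
    }
    fflush(stdout);
}

One caveat: the header declares single_llama_token_to_string() as inline while its only definition lives in utils.cpp, so calls from other translation units can fail to link; dropping the inline keyword or moving the body into the header keeps the two consistent.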