From 05224ed4725314715700be37087b2a6be0fffe96 Mon Sep 17 00:00:00 2001
From: Thomas Antony
Date: Fri, 17 Mar 2023 18:46:27 -0700
Subject: [PATCH] Add llama_tokens_to_string() to utils.cpp

- Also single token converter
---
 utils.cpp | 13 +++++++++++++
 utils.h   |  5 +++++
 2 files changed, 18 insertions(+)

diff --git a/utils.cpp b/utils.cpp
index 08d5c6ba6..f810ad01f 100644
--- a/utils.cpp
+++ b/utils.cpp
@@ -347,5 +347,18 @@ std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) {
     return res;
 }
 
+// Concatenate the text of each token id; inverse of llama_tokenize().
+// NOTE: map::at() throws std::out_of_range if an id is missing from the vocab.
+std::string llama_tokens_to_string(const gpt_vocab & vocab, const std::vector<gpt_vocab::id> & tokens) {
+    std::string res;
+    for (auto t : tokens) {
+        res += vocab.id_to_token.at(t);
+    }
+    return res;
+}
+
+std::string single_llama_token_to_string(const gpt_vocab & vocab, const gpt_vocab::id & token) {
+    return vocab.id_to_token.at(token);
+}
 bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
     printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());
diff --git a/utils.h b/utils.h
index 49658f7d9..3a1cc58a0 100644
--- a/utils.h
+++ b/utils.h
@@ -81,5 +81,10 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
 // ref: https://github.com/google/sentencepiece
 std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos);
 
+// convert tokens to string
+// opposite llama_tokenize
+std::string llama_tokens_to_string(const gpt_vocab & vocab, const std::vector<gpt_vocab::id> & tokens);
+std::string single_llama_token_to_string(const gpt_vocab & vocab, const gpt_vocab::id & token);
+
 // load the tokens from encoder.json
 bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);