From 05224ed4725314715700be37087b2a6be0fffe96 Mon Sep 17 00:00:00 2001
From: Thomas Antony
Date: Fri, 17 Mar 2023 18:46:27 -0700
Subject: [PATCH] Add llama_tokens_to_string() to utils.cpp

- Also single token converter
---
 utils.cpp | 13 +++++++++++++
 utils.h   |  5 +++++
 2 files changed, 18 insertions(+)

diff --git a/utils.cpp b/utils.cpp
index 08d5c6ba6..f810ad01f 100644
--- a/utils.cpp
+++ b/utils.cpp
@@ -347,5 +347,18 @@ std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) {
     return res;
 }
 
+// Concatenate the text of each token id; inverse of llama_tokenize().
+// NOTE: map::at() throws std::out_of_range if an id is missing from the vocab.
+std::string llama_tokens_to_string(const gpt_vocab & vocab, const std::vector<gpt_vocab::id> & tokens) {
+    std::string res;
+    for (auto t : tokens) {
+        res += vocab.id_to_token.at(t);
+    }
+    return res;
+}
+
+std::string single_llama_token_to_string(const gpt_vocab & vocab, const gpt_vocab::id & token) {
+    return vocab.id_to_token.at(token);
+}
 bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
     printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());
diff --git a/utils.h b/utils.h
index 49658f7d9..3a1cc58a0 100644
--- a/utils.h
+++ b/utils.h
@@ -81,5 +81,10 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
 // ref: https://github.com/google/sentencepiece
 std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos);
 
+// convert tokens to string
+// opposite llama_tokenize
+std::string llama_tokens_to_string(const gpt_vocab & vocab, const std::vector<gpt_vocab::id> & tokens);
+std::string single_llama_token_to_string(const gpt_vocab & vocab, const gpt_vocab::id & token);
+
 // load the tokens from encoder.json
 bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);