From 1b5442923ac017ee63cd405ffcedc6af1df1aaa0 Mon Sep 17 00:00:00 2001
From: goerch
Date: Sun, 6 Aug 2023 07:47:55 +0200
Subject: [PATCH] Fix tokenizer regression in convert.py and improve CPP
 interface for llama_tokenize

---
 convert.py                                    |  1 +
 examples/common.cpp                           | 11 ------
 examples/common.h                             |  6 ---
 examples/save-load-state/save-load-state.cpp  |  5 +--
 .../train-text-from-scratch.cpp               |  9 ++---
 llama.cpp                                     | 38 ++++++++++++++++++-
 llama.h                                       | 14 +++++++
 tests/test-tokenizer-0.cpp                    | 10 ++---
 tests/test-tokenizer-1.cpp                    | 18 ++++-----
 9 files changed, 69 insertions(+), 43 deletions(-)

diff --git a/convert.py b/convert.py
index dbd9366b9..d4750df9f 100755
--- a/convert.py
+++ b/convert.py
@@ -276,6 +276,7 @@ class SentencePieceVocab:
             piece = tokenizer.id_to_piece(i)
             text: bytes = piece.encode("utf-8")
             score: float = tokenizer.get_score(i)
+            yield text, score
 
     def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
         for text in self.added_tokens_list:
diff --git a/examples/common.cpp b/examples/common.cpp
index 9f1c0d904..d2fa68ad9 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -608,17 +608,6 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
     return "The";
 }
 
-// TODO: not great allocating this every time
-std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
-    // initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
-    std::vector<llama_token> res(text.size() + (int) add_bos + 1);
-    const int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
-    assert(n >= 0);
-    res.resize(n);
-
-    return res;
-}
-
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
     auto lparams = llama_context_default_params();
 
diff --git a/examples/common.h b/examples/common.h
index 75f622c77..cea1fee02 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -102,12 +102,6 @@ void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
 
 std::string gpt_random_prompt(std::mt19937 & rng);
 
-//
-// Vocab utils
-//
-
-std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);
-
 //
 // Model utils
 //
diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
index d09c27dae..3db61b754 100644
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -44,9 +44,8 @@ int main(int argc, char ** argv) {
         llama_free_model(model);
         return 1;
     }
-    auto tokens = std::vector<llama_token>(params.n_ctx);
-    auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), int(tokens.size()), true);
-
+    auto tokens = llama_tokenize(ctx, params.prompt.c_str(), true);
+    auto n_prompt_tokens = tokens.size();
     if (n_prompt_tokens < 1) {
         fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
         llama_free(ctx);
diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index 6e25265d7..d446c6ea5 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -2189,11 +2189,10 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vector<llama_token>& out) {
     f.read_raw(buf.data(), f.size);
     buf[f.size] = '\0';
 
-    out.resize(buf.size());
-
-    int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), buf.size(), false);
-    if (n_tokens >= 0) {
-        out.resize(n_tokens);
+    int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), out.size(), false);
+    if (n_tokens < 0) {
+        out.resize(-n_tokens);
+        llama_tokenize(lctx, buf.data(), out.data(), out.size(), false);
     }
 
     bool verify = false;
diff --git a/llama.cpp b/llama.cpp
index c5ca25dfc..53f73f6b8 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4057,7 +4057,6 @@ int llama_tokenize_with_model(
     auto res = llama_tokenize(model->vocab, text, add_bos, true);
 
     if (n_max_tokens < (int) res.size()) {
-        fprintf(stderr, "%s: too many tokens\n", __func__);
         return -((int) res.size());
     }
 
@@ -4077,6 +4076,24 @@ int llama_tokenize(
     return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
 }
 
+std::vector<llama_token> llama_tokenize(
+        struct llama_context * ctx,
+           const std::string & text,
+                        bool   add_bos) {
+    int length = text.length() + add_bos;
+    std::vector<llama_token> result(length);
+    length = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
+    if (length < 0) {
+        result.resize(-length);
+        int check = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
+        assert(check == -length);
+        GGML_UNUSED(check);
+    } else {
+        result.resize(length);
+    }
+    return result;
+}
+
 int llama_tokenize_bpe(
         struct llama_context * ctx,
                  const char * text,
@@ -4086,7 +4103,6 @@ int llama_tokenize_bpe(
     auto res = llama_tokenize(ctx->model.vocab, text, add_bos, false);
 
     if (n_max_tokens < (int) res.size()) {
-        fprintf(stderr, "%s: too many tokens\n", __func__);
         return -((int) res.size());
     }
 
@@ -4097,6 +4113,24 @@ int llama_tokenize_bpe(
     return res.size();
 }
 
+std::vector<llama_token> llama_tokenize_bpe(
+        struct llama_context * ctx,
+           const std::string & text,
+                        bool   add_bos) {
+    int length = text.length() + add_bos;
+    std::vector<llama_token> result(length);
+    length = llama_tokenize_bpe(ctx, text.c_str(), result.data(), result.size(), add_bos);
+    if (length < 0) {
+        result.resize(-length);
+        int check = llama_tokenize_bpe(ctx, text.c_str(), result.data(), result.size(), add_bos);
+        assert(check == -length);
+        GGML_UNUSED(check);
+    } else {
+        result.resize(length);
+    }
+    return result;
+}
+
 int llama_n_vocab_from_model(const struct llama_model * model) {
     return model->vocab.id_to_token.size();
 }
diff --git a/llama.h b/llama.h
index 435ce7ba7..b67867ae3 100644
--- a/llama.h
+++ b/llama.h
@@ -476,6 +476,20 @@ extern "C" {
 #include <vector>
 #include <string>
 
+//
+// Vocab utils
+//
+
+std::vector<llama_token> llama_tokenize(
+    struct llama_context * ctx,
+    const std::string & text,
+    bool   add_bos);
+
+std::vector<llama_token> llama_tokenize_bpe(
+    struct llama_context * ctx,
+    const std::string & text,
+    bool   add_bos);
+
 std::string llama_token_to_str(
     const struct llama_context * ctx,
     llama_token token);
diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp
index f8642996a..a523c320c 100644
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@@ -6,9 +6,9 @@
 #include <string>
 #include <vector>
-static std::string unescape_whitespace(llama_context* ctx, const llama_token* tokens, int count) {
+static std::string unescape_whitespace(llama_context* ctx, const std::vector<llama_token>& tokens) {
     std::string result;
-    for (int i = 0; i < count; ++i) {
+    for (int i = 0; i < tokens.size(); ++i) {
         result += llama_token_to_str(ctx, tokens[i]);
     }
     return result;
@@ -90,11 +90,9 @@ int main(int argc, char **argv) {
     }
 
     for (const auto & test_kv : k_tests()) {
-        std::vector<llama_token> res(test_kv.first.size() + 2);
-        const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), int(res.size()), true);
+        std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first.c_str(), true);
         fprintf(stderr, "%s : '%s' tokenized to '%s'\n",
-            __func__, test_kv.first.c_str(), unescape_whitespace(ctx, res.data(), n).c_str());
-        res.resize(n);
+            __func__, test_kv.first.c_str(), unescape_whitespace(ctx, res).c_str());
 
         bool correct = res.size() == test_kv.second.size();
 
diff --git a/tests/test-tokenizer-1.cpp b/tests/test-tokenizer-1.cpp
index cde7a203b..3803dad8d 100644
--- a/tests/test-tokenizer-1.cpp
+++ b/tests/test-tokenizer-1.cpp
@@ -28,9 +28,9 @@ static std::string escape_whitespace(const std::string& text) {
     return result;
 }
 
-static std::string unescape_whitespace(llama_context* ctx, const llama_token* tokens, int count) {
+static std::string unescape_whitespace(llama_context* ctx, const std::vector<llama_token>& tokens) {
     std::string result;
-    for (int i = 0; i < count; ++i) {
+    for (int i = 0; i < tokens.size(); ++i) {
         result += llama_token_to_str(ctx, tokens[i]);
     }
     return result;
@@ -84,9 +84,8 @@ int main(int argc, char **argv) {
 
     for (int i = 0; i < n_vocab; ++i) {
         std::string forward = llama_token_to_str_bpe(ctx, i);
-        std::vector<llama_token> tokens(forward.length());
-        int n = llama_tokenize_bpe(ctx, forward.c_str(), tokens.data(), forward.length(), false);
-        if (n == 1) {
+        std::vector<llama_token> tokens = llama_tokenize_bpe(ctx, forward, false);
+        if (tokens.size() == 1) {
             if (i != tokens[0]) {
                 std::string backward = llama_token_to_str(ctx, tokens[0]);
                 fprintf(stderr, "%s : error: token %d is string %s but bpe returns token %d %s\n",
@@ -96,10 +95,10 @@ int main(int argc, char **argv) {
         } else {
            if (i <= 258) {
                fprintf(stderr, "%s : info: token %d is string %s and bpe returns tokens %s\n",
-                    __func__, i, llama_token_to_str(ctx, i).c_str(), unescape_whitespace(ctx, tokens.data(), n).c_str());
+                    __func__, i, llama_token_to_str(ctx, i).c_str(), unescape_whitespace(ctx, tokens).c_str());
            } else {
                fprintf(stderr, "%s : error: token %d is string %s but bpe returns tokens %s\n",
-                    __func__, i, llama_token_to_str(ctx, i).c_str(), unescape_whitespace(ctx, tokens.data(), n).c_str());
+                    __func__, i, llama_token_to_str(ctx, i).c_str(), unescape_whitespace(ctx, tokens).c_str());
                return 3;
            }
         }
@@ -109,9 +108,8 @@ int main(int argc, char **argv) {
     for (wchar_t ch = 0x0000; ch < 0xffff; ++ch) {
         std::wstring wstr(1, ch);
         std::string str = converter.to_bytes(wstr);
-        std::vector<llama_token> tokens(str.length() + 1);
-        auto n = llama_tokenize(ctx, escape_whitespace(str).c_str(), tokens.data(), str.length() + 1, false);
-        if (n == 1) {
+        std::vector<llama_token> tokens = llama_tokenize(ctx, escape_whitespace(str).c_str(), false);
+        if (tokens.size() == 1) {
            fprintf(stderr, "%s : info: %s tokenized to %d \n",
                __func__, str.c_str(), tokens[0]);
        }
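
Usage note (editorial addition, not part of the patch): a minimal sketch of how the new vector-returning overload is intended to be called. It assumes ctx is a llama_context that was created and loaded elsewhere; the prompt string and the fprintf loop are illustrative only.

    // sketch: tokenize a prompt without sizing the token buffer by hand
    std::string prompt = "Hello world";
    std::vector<llama_token> tokens = llama_tokenize(ctx, prompt, true /* add_bos */);
    for (llama_token id : tokens) {
        // llama_token_to_str returns std::string per the declaration added to llama.h
        fprintf(stderr, "%d -> '%s'\n", id, llama_token_to_str(ctx, id).c_str());
    }

The overload first allocates text.length() + add_bos entries as an upper bound; if the C-style llama_tokenize instead reports the required count as a negative return value, the vector is resized and the call retried, so callers no longer need to guess a buffer size up front.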