From 8f5b0eaa8a2a612c426dfc2f8e11434a0c3292de Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Tue, 26 Sep 2023 23:23:59 +0200
Subject: [PATCH] llama.cpp : add llama_get_model; common : add llama_tokenize
 from model

---
 common/common.cpp | 13 ++++++++++---
 common/common.h   |  7 ++++++-
 llama.cpp         |  4 ++++
 llama.h           |  2 ++
 4 files changed, 22 insertions(+), 4 deletions(-)
diff --git a/common/common.cpp b/common/common.cpp
index 0053d3c33..77eb985b5 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -821,16 +821,23 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 //
 
 std::vector<llama_token> llama_tokenize(
-        struct llama_context * ctx,
+  const struct llama_context * ctx, // only used to look up the model via llama_get_model()
+           const std::string & text,
+                        bool   add_bos) {
+    return llama_tokenize(llama_get_model(ctx), text, add_bos); // delegate to the overload that takes a llama_model; text and add_bos are forwarded unchanged
+}
+
+std::vector<llama_token> llama_tokenize(
+    const struct llama_model * model,
            const std::string & text,
                         bool   add_bos) {
     // upper limit for the number of tokens
     int n_tokens = text.length() + add_bos;
     std::vector<llama_token> result(n_tokens);
-    n_tokens = llama_tokenize(ctx, text.data(), text.length(), result.data(), result.size(), add_bos);
+    n_tokens = llama_tokenize_with_model(model, text.data(), text.length(), result.data(), result.size(), add_bos);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_tokenize(ctx, text.data(), text.length(), result.data(), result.size(), add_bos);
+        int check = llama_tokenize_with_model(model, text.data(), text.length(), result.data(), result.size(), add_bos);
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
diff --git a/common/common.h b/common/common.h
index 8625d0341..8bed066b4 100644
--- a/common/common.h
+++ b/common/common.h
@@ -143,7 +143,12 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
 // tokenizes a string into a vector of tokens
 // should work similar to Python's `tokenizer.encode`
 std::vector<llama_token> llama_tokenize(
-        struct llama_context * ctx,
+  const struct llama_context * ctx, // read-only: the context is taken as pointer-to-const
+           const std::string & text,
+                        bool   add_bos);
+
+std::vector<llama_token> llama_tokenize(
+    const struct llama_model * model, // overload that takes the model directly, so no llama_context is needed
             const std::string & text,
                         bool   add_bos);
 
diff --git a/llama.cpp b/llama.cpp
index c561451ab..0dca84b1b 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6427,6 +6427,10 @@ void llama_free(struct llama_context * ctx) {
     delete ctx;
 }
 
+const llama_model * llama_get_model(const struct llama_context * ctx) { // ctx is dereferenced without a null check
+    return &ctx->model; // pointer to the model reached through ctx->model; no copy is made
+}
+
 int llama_n_vocab(const struct llama_context * ctx) {
     return llama_model_n_vocab(&ctx->model);
 }
diff --git a/llama.h b/llama.h
index 14c19747a..c2d296a66 100644
--- a/llama.h
+++ b/llama.h
@@ -251,6 +251,8 @@ extern "C" {
     LLAMA_API bool llama_mmap_supported (void);
     LLAMA_API bool llama_mlock_supported(void);
 
+    LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx); // returns a read-only pointer to the model that ctx refers to
+
     LLAMA_API int llama_n_vocab    (const struct llama_context * ctx);
     LLAMA_API int llama_n_ctx      (const struct llama_context * ctx);
     LLAMA_API int llama_n_ctx_train(const struct llama_context * ctx);