From 981c9131f0f20c10099735c1e353534b5bfe1e59 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Wed, 23 Aug 2023 16:07:07 +0800
Subject: [PATCH] gguf for llama is working

---
 gpttype_adapter.cpp    | 151 ++++++++++++++++++++------
 model_adapter.h        |   3 +-
 otherarch/llama_v3.cpp | 235 ++++++++++++++++++++++-------------------
 3 files changed, 247 insertions(+), 142 deletions(-)
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index a4449521e..9eb7ede2c 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -14,6 +14,7 @@
 //for easier compilation
 //concat source files into one file for compilation purposes
 #include "llama_v2.cpp"
+#include "llama_v3.cpp"
 #include "llama.cpp"
 #include "utils.cpp"
 #include "gptj_v1.cpp"
@@ -59,10 +60,9 @@ static mpt_model mpt_ctx_v3;
 
 static rwkv_v2_context * rwkv_ctx_v2;
 static rwkv_context * rwkv_ctx_v3;
-static llama_v2_context_params llama_ctx_params_v2;
-static llama_context_params llama_ctx_params;
 static llama_v2_context * llama_ctx_v2;
-static llama_context * llama_ctx_v3;
+static llama_v3_context * llama_ctx_v3;
+static llama_context * llama_ctx_v4;
 
 static gpt_params params;
 static int n_past = 0;
@@ -324,9 +324,13 @@ static std::string FileFormatTokenizeID(int id, FileFormat file_format)
     {
         return std::string(llama_v2_token_to_str(llama_ctx_v2, id));
     }
-    else if (file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA)
+    else if (file_format == FileFormat::GGJT_3)
     {
-        return std::string(llama_token_to_str(llama_ctx_v3, id));
+        return std::string(llama_v3_token_to_str(llama_ctx_v3, id));
+    }
+    else if( file_format == FileFormat::GGUF_LLAMA)
+    {
+        return std::string(llama_token_to_str(llama_ctx_v4, id));
     }
     else
     {
@@ -423,8 +427,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     {
         //newer format has bit unshuffling
         SetQuantsUnshuffled(file_format == FileFormat::GGJT_2);
-
-        llama_ctx_params_v2 = llama_v2_context_default_params();
+        llama_v2_context_params llama_ctx_params_v2 = llama_v2_context_default_params();
         llama_ctx_params_v2.n_ctx = inputs.max_context_length;
         //llama_ctx_params.n_parts = -1;
         llama_ctx_params_v2.seed = -1;
@@ -470,9 +473,9 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         llama_v2_eval(llama_ctx_v2, tmp.data(), tmp.size(), 0, params.n_threads);
         return ModelLoadResult::SUCCESS;
     }
-    else if(file_format == FileFormat::GGJT_3 || file_format==FileFormat::GGUF_LLAMA)
+    else if(file_format == FileFormat::GGJT_3)
     {
-        llama_ctx_params = llama_context_default_params();
+        llama_v3_context_params llama_ctx_params = llama_v3_context_default_params();
         llama_ctx_params.n_ctx = inputs.max_context_length;
         //llama_ctx_paran_parts = -1;
         llama_ctx_params.seed = -1;
@@ -503,7 +506,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         }
         #endif
 
-        llama_ctx_v3 = llama_init_from_file(modelname.c_str(), llama_ctx_params);
+        llama_ctx_v3 = llama_v3_init_from_file(modelname.c_str(), llama_ctx_params);
 
         if (llama_ctx_v3 == NULL)
         {
@@ -520,7 +523,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
                 lora_base_arg = lora_base.c_str();
             }
 
-            int err = llama_apply_lora_from_file(llama_ctx_v3,
+            int err = llama_v3_apply_lora_from_file(llama_ctx_v3,
                                                  lora_filename.c_str(),
                                                  lora_base_arg,
                                                  n_threads);
@@ -533,7 +536,77 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 
         //determine mem per token
         const std::vector<int> tmp = {1, 2, 3, 4};
-        auto er = llama_eval(llama_ctx_v3, tmp.data(), tmp.size(), 0, params.n_threads);
+        auto er = llama_v3_eval(llama_ctx_v3, tmp.data(), tmp.size(), 0, params.n_threads);
+        if(er!=0)
+        {
+            printf("\nLLAMA EVAL returned nonzero!\n");
+        }
+        return ModelLoadResult::SUCCESS;
+    }
+    else if(file_format==FileFormat::GGUF_LLAMA)
+    {
+        llama_context_params llama_ctx_params = llama_context_default_params();
+        llama_ctx_params.n_ctx = inputs.max_context_length;
+        //llama_ctx_params.n_parts = -1;
+        llama_ctx_params.seed = -1;
+        llama_ctx_params.f16_kv = inputs.f16_kv;
+        llama_ctx_params.low_vram = inputs.low_vram;
+        llama_ctx_params.mul_mat_q = inputs.use_mmq;
+        llama_ctx_params.logits_all = false;
+        llama_ctx_params.use_mmap = inputs.use_mmap;
+        llama_ctx_params.use_mlock = inputs.use_mlock;
+        llama_ctx_params.n_gpu_layers = inputs.gpulayers;
+        llama_ctx_params.main_gpu = cu_parseinfo_maindevice;
+        llama_ctx_params.rope_freq_base = rope_freq_base;
+        llama_ctx_params.rope_freq_scale = rope_freq_scale;
+        llama_ctx_params.n_batch = blasbatchsize;
+
+        #if defined(GGML_USE_CUBLAS)
+        bool ts_all_zero = true;
+        for (int i = 0; i < tensor_split_max; ++i) {
+            if (inputs.tensor_split[i] != 0.0f) {
+                ts_all_zero = false;
+                break;
+            }
+        }
+        if(!ts_all_zero)
+        {
+            llama_ctx_params.tensor_split = inputs.tensor_split;
+            printf("CUBLAS: Applying Custom Tensor Split!\n");
+        }
+        #endif
+
+        llama_ctx_v4 = llama_init_from_file(modelname.c_str(), llama_ctx_params);
+
+        if (llama_ctx_v4 == NULL)
+        {
+            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, modelname.c_str());
+            return ModelLoadResult::FAIL;
+        }
+        if (lora_filename != "")
+        {
+            printf("\nAttempting to apply LORA adapter: %s\n", lora_filename.c_str());
+
+            const char * lora_base_arg = NULL;
+            if (lora_base != "") {
+                printf("Using LORA base model: %s\n", lora_base.c_str());
+                lora_base_arg = lora_base.c_str();
+            }
+
+            int err = llama_apply_lora_from_file(llama_ctx_v4,
+                                                 lora_filename.c_str(),
+                                                 lora_base_arg,
+                                                 n_threads);
+            if (err != 0)
+            {
+                fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+                return ModelLoadResult::FAIL;
+            }
+        }
+
+        //determine mem per token
+        const std::vector<int> tmp = {1, 2, 3, 4};
+        auto er = llama_eval(llama_ctx_v4, tmp.data(), tmp.size(), 0, params.n_threads);
         if(er!=0)
         {
             printf("\nLLAMA EVAL returned nonzero!\n");
@@ -949,7 +1022,6 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
 
     if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2  || file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA)
     {
-        params.prompt.insert(0, 1, ' ');
         if(file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 )
         {
             embd_inp = ::llama_v2_tokenize(llama_ctx_v2, params.prompt, true);
@@ -958,9 +1030,13 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
         {
             embd_inp = ::legacy_llama_v2_tokenize(llama_ctx_v2, params.prompt, true);
         }
+        else if (file_format == FileFormat::GGJT_3)
+        {
+            embd_inp = ::llama_v3_tokenize(llama_ctx_v3, params.prompt, true);
+        }
         else
         {
-            embd_inp = ::llama_tokenize(llama_ctx_v3, params.prompt, true);
+            embd_inp = ::llama_tokenize(llama_ctx_v4, params.prompt, true);
         }
     }
     else
@@ -1067,9 +1143,13 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     {
         n_vocab = llama_v2_n_vocab(llama_ctx_v2);
     }
-    else if(file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA)
+    else if(file_format == FileFormat::GGJT_3)
     {
-        n_vocab = llama_n_vocab(llama_ctx_v3);
+        n_vocab = llama_v3_n_vocab(llama_ctx_v3);
+    }
+    else if(file_format == FileFormat::GGUF_LLAMA)
+    {
+        n_vocab = llama_n_vocab(llama_ctx_v4);
     }
     else if (file_format == FileFormat::GPTJ_1 || file_format == FileFormat::GPTJ_2)
     {
@@ -1214,9 +1294,13 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
             {
                 evalres = (llama_v2_eval(llama_ctx_v2, embd.data(), embdsize, n_past, params.n_threads)==0);
             }
-            else if(file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA)
+            else if(file_format == FileFormat::GGJT_3)
             {
-                evalres = (llama_eval(llama_ctx_v3, embd.data(), embdsize, n_past, params.n_threads)==0);
+                evalres = (llama_v3_eval(llama_ctx_v3, embd.data(), embdsize, n_past, params.n_threads)==0);
+            }
+            else if(file_format == FileFormat::GGUF_LLAMA)
+            {
+                evalres = (llama_eval(llama_ctx_v4, embd.data(), embdsize, n_past, params.n_threads)==0);
             }
             else if(file_format==FileFormat::RWKV_1 || file_format==FileFormat::RWKV_2)
             {
@@ -1324,28 +1408,33 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
             int btsize = banned_token_ids.size();
             if(file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA)
             {
-                if(file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA)
+                if(file_format == FileFormat::GGUF_LLAMA)
                 {
-                    logitsPtr = llama_get_logits(llama_ctx_v3);
+                    logitsPtr = llama_get_logits(llama_ctx_v4);
+                    eosID = llama_token_eos(llama_ctx_v4);
+                }
+                else if(file_format == FileFormat::GGJT_3)
+                {
+                    logitsPtr = llama_v3_get_logits(llama_ctx_v3);
+                    eosID = llama_v3_token_eos();
                 }
                 else
                 {
                     logitsPtr = llama_v2_get_logits(llama_ctx_v2);
+                    eosID = llama_v3_token_eos();
                 }
 
-                eosID = llama_token_eos(llama_ctx_v3);
-
                 if (!unbanTokens)
                 {
-                    // set the logit of the eos token (2) to zero to avoid sampling it
-                    logitsPtr[eosID] = 0;
+                    // set the logit of the eos token (2) to -INF to avoid sampling it
+                    logitsPtr[eosID] = -INFINITY;
                 }
 
                 if(btsize>0)
                 {
                     for(int t=0;t<btsize;++t)
                     {
-                        logitsPtr[banned_token_ids[t]]=0;
+                        logitsPtr[banned_token_ids[t]]=-INFINITY;
                     }
                 }
             }
@@ -1369,8 +1458,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
                         eosID = 50256;
                         if(logits.size() > eosID)
                         {
-                            int topid = std::min_element(logits.begin(),logits.end())-logits.begin();
-                            logits[eosID] = (logits[topid] < 0 ? logits[topid] : 0);
+                            logits[eosID] = -INFINITY;
                         }
                         else
                         {
@@ -1378,8 +1466,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
                             if (file_format == FileFormat::GPT2_3 || file_format == FileFormat::GPT2_4)
                             {
                                 eosID = 0;
-                                int topid = std::min_element(logits.begin(), logits.end()) - logits.begin();
-                                logits[eosID] = (logits[topid] < 0 ? logits[topid] : 0);
+                                logits[eosID] = -INFINITY;
                             }
                         }
                     }
@@ -1397,17 +1484,15 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
                          file_format == FileFormat::MPT_1)
                     {
                         eosID = 0;
-                        int topid = std::min_element(logits.begin(),logits.end())-logits.begin();
-                        logits[eosID] = (logits[topid] < 0 ? logits[topid] : 0);
+                        logits[eosID] = -INFINITY;
                     }
                 }
 
                 if(btsize>0)
                 {
-                    int topid = std::min_element(logits.begin(), logits.end()) - logits.begin();
                     for (int t = 0; t < btsize; ++t)
                     {
-                        logits[banned_token_ids[t]] = (logits[topid] < 0 ? logits[topid] : 0);
+                        logits[banned_token_ids[t]] = -INFINITY;
                     }
                 }
             }
diff --git a/model_adapter.h b/model_adapter.h
index 2974d3455..f4e8a7034 100644
--- a/model_adapter.h
+++ b/model_adapter.h
@@ -21,6 +21,7 @@ enum FileFormat
     GGJT=3, // 3=(llama ggjt)
     GGJT_2=4, //newer llama format unshuffled
     GGJT_3=5, //using 16bit scalar
+    GGUF_LLAMA=6, //GGUF (llama newest ver)
 
     GPTJ_1=100, //the very first super old GPTJ format
     GPTJ_2=101, //pygmalion, uses old ggml lib
@@ -47,7 +48,7 @@ enum FileFormat
     MPT_1=500, //first supported mpt version
 
 
-    GGUF_LLAMA=1000, //GGUF (llama newest ver)
+
 };
 
 enum ModelLoadResult
diff --git a/otherarch/llama_v3.cpp b/otherarch/llama_v3.cpp
index f40d3e742..bfe05cd53 100644
--- a/otherarch/llama_v3.cpp
+++ b/otherarch/llama_v3.cpp
@@ -74,18 +74,18 @@ static void llama_v3_log_callback_default(llama_v3_log_level level, const char *
 
 
 // available llama models
-enum e_model {
-    MODEL_UNKNOWN,
-    MODEL_3B,
-    MODEL_7B,
-    MODEL_13B,
-    MODEL_30B,
-    MODEL_65B,
-    MODEL_70B,
+enum e_model3 {
+    MODEL_UNKNOWN_3,
+    MODEL_3B_3,
+    MODEL_7B_3,
+    MODEL_13B_3,
+    MODEL_30B_3,
+    MODEL_65B_3,
+    MODEL_70B_3,
 };
 
-static const size_t kB = 1024;
-static const size_t MB = 1024*1024;
+static const size_t kB3 = 1024;
+static const size_t MB3 = 1024*1024;
 
 // computed for n_ctx == 2048
 // TODO: dynamically determine these sizes
@@ -101,7 +101,7 @@ void llama_v3_nop(struct ggml_tensor * tensor) { // don't offload by default
 // ggml helpers
 //
 
-static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+static void llv3_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
     struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
 
     if (plan.work_size > 0) {
@@ -112,76 +112,77 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
     ggml_graph_compute(graph, &plan);
 }
 
+
 //
 // memory sizes (calculated for n_batch == 512)
 //
 
-static std::map<e_model, size_t> MEM_REQ_SCRATCH0(int n_ctx)
+static std::map<e_model3, size_t> MEM_REQ_SCRATCH0_3(int n_ctx)
 {
-    std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,   ((size_t) n_ctx / 16ull + 156ull) * MB },
-        { MODEL_7B,   ((size_t) n_ctx / 16ull + 164ull) * MB },
-        { MODEL_13B,  ((size_t) n_ctx / 12ull + 184ull) * MB },
-        { MODEL_30B,  ((size_t) n_ctx /  9ull + 224ull) * MB },
-        { MODEL_65B,  ((size_t) n_ctx /  6ull + 320ull) * MB }, // guess
-        { MODEL_70B,  ((size_t) n_ctx /  7ull + 320ull) * MB },
+    std::map<e_model3, size_t> k_sizes = {
+        { MODEL_3B_3,   ((size_t) n_ctx / 16ull + 156ull) * MB3 },
+        { MODEL_7B_3,   ((size_t) n_ctx / 16ull + 164ull) * MB3 },
+        { MODEL_13B_3,  ((size_t) n_ctx / 12ull + 184ull) * MB3 },
+        { MODEL_30B_3,  ((size_t) n_ctx /  9ull + 224ull) * MB3 },
+        { MODEL_65B_3,  ((size_t) n_ctx /  6ull + 320ull) * MB3 }, // guess
+        { MODEL_70B_3,  ((size_t) n_ctx /  7ull + 320ull) * MB3 },
     };
     return k_sizes;
 }
 
-static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
+static const std::map<e_model3, size_t> & MEM_REQ_SCRATCH1_3()
 {
-    static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,  192ull * MB },
-        { MODEL_7B,  224ull * MB },
-        { MODEL_13B, 256ull * MB },
-        { MODEL_30B, 320ull * MB },
-        { MODEL_65B, 448ull * MB }, // guess
-        { MODEL_70B, 448ull * MB },
+    static std::map<e_model3, size_t> k_sizes = {
+        { MODEL_3B_3,  192ull * MB3 },
+        { MODEL_7B_3,  224ull * MB3 },
+        { MODEL_13B_3, 256ull * MB3 },
+        { MODEL_30B_3, 320ull * MB3 },
+        { MODEL_65B_3, 448ull * MB3 }, // guess
+        { MODEL_70B_3, 448ull * MB3 },
     };
     return k_sizes;
 }
 
 // used to store the compute graph tensors + non-scratch data
-static const std::map<e_model, size_t> & MEM_REQ_EVAL()
+static const std::map<e_model3, size_t> & MEM_REQ_EVAL_3()
 {
-    static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,  16ull * MB },
-        { MODEL_7B,  20ull * MB },
-        { MODEL_13B, 24ull * MB },
-        { MODEL_30B, 32ull * MB },
-        { MODEL_65B, 48ull * MB }, // guess
-        { MODEL_70B, 48ull * MB },
+    static std::map<e_model3, size_t> k_sizes = {
+        { MODEL_3B_3,  16ull * MB3 },
+        { MODEL_7B_3,  20ull * MB3 },
+        { MODEL_13B_3, 24ull * MB3 },
+        { MODEL_30B_3, 32ull * MB3 },
+        { MODEL_65B_3, 48ull * MB3 }, // guess
+        { MODEL_70B_3, 48ull * MB3 },
     };
     return k_sizes;
 }
 
 // amount of VRAM needed per batch size to hold temporary results
 // the values for 3b are not derived from testing but instead chosen conservatively
-static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
+static const std::map<e_model3, size_t> & VRAM_REQ_SCRATCH_BASE_3()
 {
-    static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,   512ull * kB },
-        { MODEL_7B,   512ull * kB },
-        { MODEL_13B,  640ull * kB },
-        { MODEL_30B,  768ull * kB },
-        { MODEL_65B, 1360ull * kB },
-        { MODEL_70B, 1360ull * kB },
+    static std::map<e_model3, size_t> k_sizes = {
+        { MODEL_3B_3,   512ull * kB3 },
+        { MODEL_7B_3,   512ull * kB3 },
+        { MODEL_13B_3,  640ull * kB3 },
+        { MODEL_30B_3,  768ull * kB3 },
+        { MODEL_65B_3, 1360ull * kB3 },
+        { MODEL_70B_3, 1360ull * kB3 },
     };
     return k_sizes;
 }
 
 // amount of VRAM needed per batch size and context to hold temporary results
 // the values for 3b are not derived from testing but instead chosen conservatively
-static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
+static const std::map<e_model3, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT_3()
 {
-    static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,  128ull },
-        { MODEL_7B,  128ull },
-        { MODEL_13B, 160ull },
-        { MODEL_30B, 208ull },
-        { MODEL_65B, 320ull },
-        { MODEL_70B, 320ull },
+    static std::map<e_model3, size_t> k_sizes = {
+        { MODEL_3B_3,  128ull },
+        { MODEL_7B_3,  128ull },
+        { MODEL_13B_3, 160ull },
+        { MODEL_30B_3, 208ull },
+        { MODEL_65B_3, 320ull },
+        { MODEL_70B_3, 320ull },
     };
     return k_sizes;
 }
@@ -288,7 +289,7 @@ struct llama_v3_vocab {
 };
 
 struct llama_v3_model {
-    e_model type = MODEL_UNKNOWN;
+    e_model3 type = MODEL_UNKNOWN_3;
 
     llama_v3_hparams hparams;
 
@@ -452,13 +453,13 @@ struct llama_v3_state {
     void * log_callback_user_data = nullptr;
 };
 // global state
-static llama_v3_state g_state;
+static llama_v3_state llv3_g_state;
 
 template <typename T>
 static T checked_mul(T a, T b) {
     T ret = a * b;
     if (a != 0 && ret / a != b) {
-        throw std::runtime_error(format("overflow multiplying %llu * %llu",
+        throw std::runtime_error(format_old("overflow multiplying %llu * %llu",
                      (unsigned long long) a, (unsigned long long) b));
     }
     return ret;
@@ -466,7 +467,7 @@ static T checked_mul(T a, T b) {
 
 static size_t checked_div(size_t a, size_t b) {
     if (b == 0 || a % b != 0) {
-        throw std::runtime_error(format("error dividing %zu / %zu", a, b));
+        throw std::runtime_error(format_old("error dividing %zu / %zu", a, b));
     }
     return a / b;
 }
@@ -550,7 +551,7 @@ struct llama_v3_file_loader {
                 }
         }
 
-        throw std::runtime_error(format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
+        throw std::runtime_error(format_old("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
                      magic, version));
     }
     void read_hparams() {
@@ -593,7 +594,7 @@ struct llama_v3_file_loader {
             file.read_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * n_dims);
             std::string name = file.read_string(name_len);
             if (n_dims < 1 || n_dims > 2) {
-                throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims));
+                throw std::runtime_error(format_old("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims));
             }
             switch (tensor.type) {
                 case GGML_TYPE_F32:
@@ -610,7 +611,7 @@ struct llama_v3_file_loader {
                 case GGML_TYPE_Q6_K:
                     break;
                 default: {
-                    throw std::runtime_error(format("unrecognized tensor type %u\n", tensor.type));
+                    throw std::runtime_error(format_old("unrecognized tensor type %u\n", tensor.type));
                 }
             }
 
@@ -721,11 +722,11 @@ struct llama_v3_model_loader {
     struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
         auto it = tensors_map.name_to_idx.find(name);
         if (it == tensors_map.name_to_idx.end()) {
-            throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str())));
+            throw std::runtime_error(std::runtime_error(format_old("llama.cpp: tensor '%s' is missing from model", name.c_str())));
         }
         llama_v3_load_tensor & lt = tensors_map.tensors.at(it->second);
         if (lt.ne != ne) {
-            throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
+            throw std::runtime_error(format_old("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
                          name.c_str(), llama_v3_format_tensor_shape(ne).c_str(), llama_v3_format_tensor_shape(lt.ne).c_str()));
         }
 
@@ -869,7 +870,7 @@ static bool kv_cache_init(
     const int64_t n_mem      = n_layer*n_ctx;
     const int64_t n_elements = n_embd*n_mem;
 
-    cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
+    cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB3);
     cache.n = 0;
 
     struct ggml_init_params params;
@@ -952,7 +953,7 @@ bool llama_v3_mlock_supported() {
     return llama_v3_mlock::SUPPORTED;
 }
 
-int get_blas_batch_mul(int batch)
+int get_blas_batch_mul3(int batch)
 {
     return (batch>512?(batch>1024?4:2):1);
 }
@@ -1027,14 +1028,14 @@ const char * llama_v3_ftype_name(enum llama_v3_ftype ftype) {
     }
 }
 
-static const char * llama_v3_model_type_name(e_model type) {
+static const char * llama_v3_model_type_name(e_model3 type) {
     switch (type) {
-        case MODEL_3B: return "3B";
-        case MODEL_7B: return "7B";
-        case MODEL_13B: return "13B";
-        case MODEL_30B: return "30B";
-        case MODEL_65B: return "65B";
-        case MODEL_70B: return "70B";
+        case MODEL_3B_3: return "3B";
+        case MODEL_7B_3: return "7B";
+        case MODEL_13B_3: return "13B";
+        case MODEL_30B_3: return "30B";
+        case MODEL_65B_3: return "65B";
+        case MODEL_70B_3: return "70B";
         default: LLAMA_V3_ASSERT(false);
     }
 }
@@ -1062,7 +1063,7 @@ static void llama_v3_model_load_internal(
         void * progress_callback_user_data) {
 
     model.t_start_us = ggml_time_us();
-    size_t blasbatchmul = get_blas_batch_mul(n_batch);
+    size_t blasbatchmul = get_blas_batch_mul3(n_batch);
 
     std::unique_ptr<llama_v3_model_loader> ml(new llama_v3_model_loader(fname, use_mmap));
 
@@ -1078,15 +1079,15 @@ static void llama_v3_model_load_internal(
 
     {
         switch (hparams.n_layer) {
-            case 26: model.type = e_model::MODEL_3B; break;
-            case 32: model.type = e_model::MODEL_7B; break;
-            case 40: model.type = e_model::MODEL_13B; break;
-            case 60: model.type = e_model::MODEL_30B; break;
-            case 80: model.type = e_model::MODEL_65B; break;
+            case 26: model.type = e_model3::MODEL_3B_3; break;
+            case 32: model.type = e_model3::MODEL_7B_3; break;
+            case 40: model.type = e_model3::MODEL_13B_3; break;
+            case 60: model.type = e_model3::MODEL_30B_3; break;
+            case 80: model.type = e_model3::MODEL_65B_3; break;
             default:
                 {
                     if (hparams.n_layer < 32) {
-                        model.type = e_model::MODEL_7B;
+                        model.type = e_model3::MODEL_7B_3;
                     }
                 } break;
         }
@@ -1096,15 +1097,15 @@ static void llama_v3_model_load_internal(
         // LLaMAv2
         // TODO: temporary until GGUF
         //patch for llama2 gqa
-        if (model.type == e_model::MODEL_65B && (hparams.n_mult >= 4096 && hparams.n_mult != 5504)) {
+        if (model.type == e_model3::MODEL_65B_3 && (hparams.n_mult >= 4096 && hparams.n_mult != 5504)) {
             fprintf(stderr, "%s: Applying KCPP Patch for 70B model, setting GQA to 8\n", __func__);
             n_gqa = 8;
         }
         LLAMA_V3_ASSERT(hparams.n_head % n_gqa == 0);
         hparams.n_head_kv = hparams.n_head / n_gqa;
-        if (model.type == e_model::MODEL_65B && n_gqa == 8) {
+        if (model.type == e_model3::MODEL_65B_3 && n_gqa == 8) {
             LLAMA_V3_LOG_WARN("%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
-            model.type = e_model::MODEL_70B;
+            model.type = e_model3::MODEL_70B_3;
             hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model
         }
 
@@ -1180,7 +1181,7 @@ static void llama_v3_model_load_internal(
 
         model.ctx = ggml_init(params);
         if (!model.ctx) {
-            throw std::runtime_error(format("ggml_init() failed"));
+            throw std::runtime_error(format_old("ggml_init() failed"));
         }
     }
 
@@ -1289,9 +1290,9 @@ static void llama_v3_model_load_internal(
 
 #ifndef LLAMA_V3_USE_ALLOCATOR
         mem_required +=
-            blasbatchmul*MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
-            blasbatchmul*MEM_REQ_SCRATCH1().at(model.type) +
-            blasbatchmul*MEM_REQ_EVAL().at(model.type);
+            blasbatchmul*MEM_REQ_SCRATCH0_3(hparams.n_ctx).at(model.type) +
+            blasbatchmul*MEM_REQ_SCRATCH1_3().at(model.type) +
+            blasbatchmul*MEM_REQ_EVAL_3().at(model.type);
 #endif
 
         // this is the memory required by one llama_v3_state
@@ -1308,8 +1309,8 @@ static void llama_v3_model_load_internal(
             LLAMA_V3_LOG_INFO("%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
             ggml_cuda_set_scratch_size(0); // disable scratch
         } else {
-            const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE().at(model.type);
-            const size_t vram_scratch_per_context = VRAM_REQ_SCRATCH_PER_CONTEXT().at(model.type);
+            const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE_3().at(model.type);
+            const size_t vram_scratch_per_context = VRAM_REQ_SCRATCH_PER_CONTEXT_3().at(model.type);
             vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context);
             ggml_cuda_set_scratch_size(vram_scratch);
             if (n_gpu_layers > 0) {
@@ -1872,10 +1873,10 @@ static bool llama_v3_eval_internal(
             ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
         }
     } else {
-        ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
+        llv3_graph_compute_helper(lctx.work_buffer, gf, n_threads);
     }
 #else
-    ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
+    llv3_graph_compute_helper(lctx.work_buffer, gf, n_threads);
 #endif
 
 #if GGML_USE_MPI
@@ -1939,7 +1940,7 @@ static bool llama_v3_eval_internal(
 // tokenizer
 //
 
-static size_t utf8_len(char src) {
+static size_t utf8_len3(char src) {
     const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
     uint8_t highbits = static_cast<uint8_t>(src) >> 4;
     return lookup[highbits];
@@ -1980,7 +1981,7 @@ struct llama_v3_tokenizer {
         size_t offs = 0;
         while (offs < text.size()) {
             llama_v3_sp_symbol sym;
-            size_t char_len = std::min(text.size() - offs, utf8_len(text[offs]));
+            size_t char_len = std::min(text.size() - offs, utf8_len3(text[offs]));
             sym.text = text.c_str() + offs;
             sym.n = char_len;
             offs += char_len;
@@ -2076,6 +2077,24 @@ private:
     llama_v3_sp_bigram::queue work_queue_;
 };
 
+// Convenience overload: runs the buffer-based llama_v3_tokenize on `text` and returns the token ids as a vector.
+// A negative count from the first call is taken as the (negated) size to resize to before a second attempt.
+std::vector<llama_token> llama_v3_tokenize(struct llama_v3_context * ctx, const std::string & text, bool add_bos) {
+    // initial capacity: one slot per byte of text, plus one when add_bos is set
+    const int capacity = text.length() + add_bos;
+    std::vector<llama_token> tokens(capacity);
+    const int n_tokens = llama_v3_tokenize(ctx, text.c_str(), tokens.data(), tokens.size(), add_bos);
+    if (n_tokens >= 0) {
+        tokens.resize(n_tokens);
+        return tokens;
+    }
+    // negative count: grow to the size it reports and run the tokenizer a second time
+    tokens.resize(-n_tokens);
+    const int check = llama_v3_tokenize(ctx, text.c_str(), tokens.data(), tokens.size(), add_bos);
+    GGML_ASSERT(check == -n_tokens);
+    return tokens;
+}
+
 static std::vector<llama_v3_vocab::id> llama_v3_tokenize(const llama_v3_vocab & vocab, const std::string & text, bool bos) {
     llama_v3_tokenizer tokenizer(vocab);
     std::vector<llama_v3_vocab::id> output;
@@ -3010,10 +3029,10 @@ static void llama_v3_convert_tensor_internal(const llama_v3_load_tensor & tensor
     if (ggml_is_quantized(tensor.type)) {
         qtype = ggml_internal_get_type_traits(tensor.type);
         if (qtype.to_float == NULL) {
-            throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
+            throw std::runtime_error(format_old("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
         }
     } else if (tensor.type != GGML_TYPE_F16) {
-        throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor.type)));
+        throw std::runtime_error(format_old("cannot dequantize/convert tensor type %s", ggml_type_name(tensor.type)));
     }
 
     if (nthread < 2) {
@@ -3084,7 +3103,7 @@ static void llama_v3_model_quantize_internal(const std::string & fname_inp, cons
         case LLAMA_V3_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
         case LLAMA_V3_FTYPE_MOSTLY_Q6_K:   quantized_type = GGML_TYPE_Q6_K; break;
 #endif
-        default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
+        default: throw std::runtime_error(format_old("invalid output file type %d\n", ftype));
     }
 
     if (nthread <= 0) {
@@ -3209,7 +3228,7 @@ static void llama_v3_model_quantize_internal(const std::string & fname_inp, cons
             if (tensor.type == GGML_TYPE_F32) {
                 f32_data = (float *) tensor.data;
             } else if (ggml_is_quantized(tensor.type) && !params->allow_requantize) {
-                throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor.type)));
+                throw std::runtime_error(format_old("requantizing from type %s is disabled", ggml_type_name(tensor.type)));
             } else {
                 llama_v3_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
                 f32_data = (float *) f32_conv_buf.addr;
@@ -3348,7 +3367,7 @@ struct llama_v3_context * llama_v3_new_context_with_model(
         params.seed = time(NULL);
     }
 
-    size_t blasbatchmul = get_blas_batch_mul(params.n_batch);
+    size_t blasbatchmul = get_blas_batch_mul3(params.n_batch);
 
     unsigned cur_percentage = 0;
     if (params.progress_callback == NULL) {
@@ -3430,9 +3449,9 @@ struct llama_v3_context * llama_v3_new_context_with_model(
 
             // debug - for comparison with scratch buffer
             //size_t prev_req =
-            //    MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type) +
-            //    MEM_REQ_SCRATCH1().at(ctx->model.type) +
-            //    MEM_REQ_EVAL().at(ctx->model.type);
+            //    MEM_REQ_SCRATCH0_3(hparams.n_ctx).at(ctx->model.type) +
+            //    MEM_REQ_SCRATCH1_3().at(ctx->model.type) +
+            //    MEM_REQ_EVAL_3().at(ctx->model.type);
             //LLAMA_V3_LOG_INFO("%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0);
 
             // recreate allocator with exact memory requirements
@@ -3447,12 +3466,12 @@ struct llama_v3_context * llama_v3_new_context_with_model(
 #endif
         }
 #else
-        ctx->buf_compute.resize(blasbatchmul*MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
+        ctx->buf_compute.resize(blasbatchmul*MEM_REQ_EVAL_3().at(ctx->model.type) + ggml_graph_overhead());
 #endif
 
 #ifdef LLAMA_V3_USE_SCRATCH
-        ctx->buf_scratch[0].resize(blasbatchmul*MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
-        ctx->buf_scratch[1].resize(blasbatchmul*MEM_REQ_SCRATCH1().at(ctx->model.type));
+        ctx->buf_scratch[0].resize(blasbatchmul*MEM_REQ_SCRATCH0_3(hparams.n_ctx).at(ctx->model.type));
+        ctx->buf_scratch[1].resize(blasbatchmul*MEM_REQ_SCRATCH1_3().at(ctx->model.type));
 #endif
     }
 
@@ -3711,7 +3730,7 @@ int llama_v3_apply_lora_from_file_internal(const struct llama_v3_model & model,
 #ifdef GGML_USE_CUBLAS
             if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) {
                 if (dest_t->type != GGML_TYPE_F16) {
-                    throw std::runtime_error(format(
+                    throw std::runtime_error(format_old(
                         "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models", __func__));
                 }
                 offload_func = ggml_cuda_assign_buffers;
@@ -3791,7 +3810,7 @@ int llama_v3_apply_lora_from_file_internal(const struct llama_v3_model & model,
 
             struct ggml_cgraph gf = ggml_build_forward(r);
 
-            ggml_graph_compute_helper(work_buffer, &gf, n_threads);
+            llv3_graph_compute_helper(work_buffer, &gf, n_threads);
 
             // we won't need these tensors again, reset the context to save memory
             ggml_free(lora_ctx);
@@ -3977,7 +3996,7 @@ void llama_v3_copy_state_data_internal(struct llama_v3_context * ctx, llama_v3_d
 
             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
-            ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
+            llv3_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
 
             ggml_free(cpy_ctx);
 
@@ -4087,7 +4106,7 @@ size_t llama_v3_set_state_data(struct llama_v3_context * ctx, uint8_t * src) {
 
             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
-            ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
+            llv3_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
 
             ggml_free(cpy_ctx);
         }
@@ -4419,8 +4438,8 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_v3_intern
 
 
 void llama_v3_log_set(llama_v3_log_callback log_callback, void * user_data) {
-    g_state.log_callback = log_callback ? log_callback : llama_v3_log_callback_default;
-    g_state.log_callback_user_data = user_data;
+    llv3_g_state.log_callback = log_callback ? log_callback : llama_v3_log_callback_default; // a null callback selects llama_v3_log_callback_default
+    llv3_g_state.log_callback_user_data = user_data; // stored as-is; llama_v3_log_internal_v passes it back to the callback
 }
 
 #if defined(_MSC_VER) && !defined(vsnprintf)
@@ -4433,12 +4452,12 @@ static void llama_v3_log_internal_v(llama_v3_log_level level, const char * forma
     char buffer[128];
     int len = vsnprintf(buffer, 128, format, args);
     if (len < 128) {
-        g_state.log_callback(level, buffer, g_state.log_callback_user_data);
+        llv3_g_state.log_callback(level, buffer, llv3_g_state.log_callback_user_data);
     } else {
         char* buffer2 = new char[len+1];
         vsnprintf(buffer2, len+1, format, args_copy);
         buffer2[len] = 0;
-        g_state.log_callback(level, buffer2, g_state.log_callback_user_data);
+        llv3_g_state.log_callback(level, buffer2, llv3_g_state.log_callback_user_data);
         delete[] buffer2;
     }
     va_end(args_copy);