gguf for llama is working

Concedo 2023-08-23 16:07:07 +08:00
parent 39cc83e8c9
commit 981c9131f0
3 changed files with 247 additions and 142 deletions
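Editor's note, not part of the commit: the patch keeps three generations of the llama loader side by side. A frozen copy of the pre-GGUF implementation is added as llama_v3.cpp (all its symbols renamed with a _3 / llv3_ suffix so it can be compiled next to the current llama.cpp), the current llama.cpp becomes GGUF-only behind a new llama_ctx_v4 context, and GGUF_LLAMA moves from enum value 1000 to 6. The sketch below is an illustrative stand-in for the dispatch shape, with simplified placeholder names rather than the real llama.cpp types:

// Illustrative sketch only, assuming simplified stand-in names.
#include <cstdio>

enum class FileFormat { GGJT_2, GGJT_3, GGUF_LLAMA };

// Each format generation is served by its own backend/context in the adapter.
static const char * backend_for(FileFormat ff) {
    if (ff == FileFormat::GGUF_LLAMA) return "llama.cpp (llama_ctx_v4)";
    if (ff == FileFormat::GGJT_3)     return "llama_v3.cpp (llama_ctx_v3)";
    return "llama_v2.cpp (llama_ctx_v2)";
}

int main() {
    std::printf("GGUF model  -> %s\n", backend_for(FileFormat::GGUF_LLAMA));
    std::printf("GGJT_3 model -> %s\n", backend_for(FileFormat::GGJT_3));
    return 0;
}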

[changed file 1 of 3]

@@ -14,6 +14,7 @@
 //for easier compilation
 //concat source files into one file for compilation purposes
 #include "llama_v2.cpp"
+#include "llama_v3.cpp"
 #include "llama.cpp"
 #include "utils.cpp"
 #include "gptj_v1.cpp"
@@ -59,10 +60,9 @@ static mpt_model mpt_ctx_v3;
 static rwkv_v2_context * rwkv_ctx_v2;
 static rwkv_context * rwkv_ctx_v3;
-static llama_v2_context_params llama_ctx_params_v2;
-static llama_context_params llama_ctx_params;
 static llama_v2_context * llama_ctx_v2;
-static llama_context * llama_ctx_v3;
+static llama_v3_context * llama_ctx_v3;
+static llama_context * llama_ctx_v4;
 static gpt_params params;
 static int n_past = 0;
@@ -324,9 +324,13 @@ static std::string FileFormatTokenizeID(int id, FileFormat file_format)
 {
 return std::string(llama_v2_token_to_str(llama_ctx_v2, id));
 }
-else if (file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA)
+else if (file_format == FileFormat::GGJT_3)
 {
-return std::string(llama_token_to_str(llama_ctx_v3, id));
+return std::string(llama_v3_token_to_str(llama_ctx_v3, id));
+}
+else if( file_format == FileFormat::GGUF_LLAMA)
+{
+return std::string(llama_token_to_str(llama_ctx_v4, id));
 }
 else
 {
@@ -423,8 +427,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 {
 //newer format has bit unshuffling
 SetQuantsUnshuffled(file_format == FileFormat::GGJT_2);
-llama_ctx_params_v2 = llama_v2_context_default_params();
+llama_v2_context_params llama_ctx_params_v2 = llama_v2_context_default_params();
 llama_ctx_params_v2.n_ctx = inputs.max_context_length;
 //llama_ctx_params.n_parts = -1;
 llama_ctx_params_v2.seed = -1;
@@ -470,9 +473,9 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 llama_v2_eval(llama_ctx_v2, tmp.data(), tmp.size(), 0, params.n_threads);
 return ModelLoadResult::SUCCESS;
 }
-else if(file_format == FileFormat::GGJT_3 || file_format==FileFormat::GGUF_LLAMA)
+else if(file_format == FileFormat::GGJT_3)
 {
-llama_ctx_params = llama_context_default_params();
+llama_v3_context_params llama_ctx_params = llama_v3_context_default_params();
 llama_ctx_params.n_ctx = inputs.max_context_length;
 //llama_ctx_paran_parts = -1;
 llama_ctx_params.seed = -1;
@@ -503,7 +506,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 }
 #endif
-llama_ctx_v3 = llama_init_from_file(modelname.c_str(), llama_ctx_params);
+llama_ctx_v3 = llama_v3_init_from_file(modelname.c_str(), llama_ctx_params);
 if (llama_ctx_v3 == NULL)
 {
@@ -520,7 +523,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 lora_base_arg = lora_base.c_str();
 }
-int err = llama_apply_lora_from_file(llama_ctx_v3,
+int err = llama_v3_apply_lora_from_file(llama_ctx_v3,
 lora_filename.c_str(),
 lora_base_arg,
 n_threads);
@@ -533,7 +536,77 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 //determine mem per token
 const std::vector<int> tmp = {1, 2, 3, 4};
-auto er = llama_eval(llama_ctx_v3, tmp.data(), tmp.size(), 0, params.n_threads);
+auto er = llama_v3_eval(llama_ctx_v3, tmp.data(), tmp.size(), 0, params.n_threads);
+if(er!=0)
+{
+printf("\nLLAMA EVAL returned nonzero!\n");
+}
+return ModelLoadResult::SUCCESS;
+}
+else if(file_format==FileFormat::GGUF_LLAMA)
+{
+llama_context_params llama_ctx_params = llama_context_default_params();
+llama_ctx_params.n_ctx = inputs.max_context_length;
+//llama_ctx_paran_parts = -1;
+llama_ctx_params.seed = -1;
+llama_ctx_params.f16_kv = inputs.f16_kv;
+llama_ctx_params.low_vram = inputs.low_vram;
+llama_ctx_params.mul_mat_q = inputs.use_mmq;
+llama_ctx_params.logits_all = false;
+llama_ctx_params.use_mmap = inputs.use_mmap;
+llama_ctx_params.use_mlock = inputs.use_mlock;
+llama_ctx_params.n_gpu_layers = inputs.gpulayers;
+llama_ctx_params.main_gpu = cu_parseinfo_maindevice;
+llama_ctx_params.rope_freq_base = rope_freq_base;
+llama_ctx_params.rope_freq_scale = rope_freq_scale;
+llama_ctx_params.n_batch = blasbatchsize;
+#if defined(GGML_USE_CUBLAS)
+bool ts_all_zero = true;
+for (int i = 0; i < tensor_split_max; ++i) {
+if (inputs.tensor_split[i] != 0.0f) {
+ts_all_zero = false;
+break;
+}
+}
+if(!ts_all_zero)
+{
+llama_ctx_params.tensor_split = inputs.tensor_split;
+printf("CUBLAS: Applying Custom Tensor Split!\n");
+}
+#endif
+llama_ctx_v4 = llama_init_from_file(modelname.c_str(), llama_ctx_params);
+if (llama_ctx_v4 == NULL)
+{
+fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, modelname.c_str());
+return ModelLoadResult::FAIL;
+}
+if (lora_filename != "")
+{
+printf("\nAttempting to apply LORA adapter: %s\n", lora_filename.c_str());
+const char * lora_base_arg = NULL;
+if (lora_base != "") {
+printf("Using LORA base model: %s\n", lora_base.c_str());
+lora_base_arg = lora_base.c_str();
+}
+int err = llama_apply_lora_from_file(llama_ctx_v4,
+lora_filename.c_str(),
+lora_base_arg,
+n_threads);
+if (err != 0)
+{
+fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+return ModelLoadResult::FAIL;
+}
+}
+//determine mem per token
+const std::vector<int> tmp = {1, 2, 3, 4};
+auto er = llama_eval(llama_ctx_v4, tmp.data(), tmp.size(), 0, params.n_threads);
 if(er!=0)
 {
 printf("\nLLAMA EVAL returned nonzero!\n");
@@ -949,7 +1022,6 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
 if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA)
 {
-params.prompt.insert(0, 1, ' ');
 if(file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 )
 {
 embd_inp = ::llama_v2_tokenize(llama_ctx_v2, params.prompt, true);
@@ -958,9 +1030,13 @@
 {
 embd_inp = ::legacy_llama_v2_tokenize(llama_ctx_v2, params.prompt, true);
 }
+else if (file_format == FileFormat::GGJT_3)
+{
+embd_inp = ::llama_v3_tokenize(llama_ctx_v3, params.prompt, true);
+}
 else
 {
-embd_inp = ::llama_tokenize(llama_ctx_v3, params.prompt, true);
+embd_inp = ::llama_tokenize(llama_ctx_v4, params.prompt, true);
 }
 }
 else
@@ -1067,9 +1143,13 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
 {
 n_vocab = llama_v2_n_vocab(llama_ctx_v2);
 }
-else if(file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA)
+else if(file_format == FileFormat::GGJT_3)
 {
-n_vocab = llama_n_vocab(llama_ctx_v3);
+n_vocab = llama_v3_n_vocab(llama_ctx_v3);
+}
+else if(file_format == FileFormat::GGUF_LLAMA)
+{
+n_vocab = llama_n_vocab(llama_ctx_v4);
 }
 else if (file_format == FileFormat::GPTJ_1 || file_format == FileFormat::GPTJ_2)
 {
@@ -1214,9 +1294,13 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
 {
 evalres = (llama_v2_eval(llama_ctx_v2, embd.data(), embdsize, n_past, params.n_threads)==0);
 }
-else if(file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA)
+else if(file_format == FileFormat::GGJT_3)
 {
-evalres = (llama_eval(llama_ctx_v3, embd.data(), embdsize, n_past, params.n_threads)==0);
+evalres = (llama_v3_eval(llama_ctx_v3, embd.data(), embdsize, n_past, params.n_threads)==0);
+}
+else if(file_format == FileFormat::GGUF_LLAMA)
+{
+evalres = (llama_eval(llama_ctx_v4, embd.data(), embdsize, n_past, params.n_threads)==0);
 }
 else if(file_format==FileFormat::RWKV_1 || file_format==FileFormat::RWKV_2)
 {
@@ -1324,28 +1408,33 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
 int btsize = banned_token_ids.size();
 if(file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA)
 {
-if(file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA)
+if(file_format == FileFormat::GGUF_LLAMA)
 {
-logitsPtr = llama_get_logits(llama_ctx_v3);
+logitsPtr = llama_get_logits(llama_ctx_v4);
+eosID = llama_token_eos(llama_ctx_v4);
+}
+else if(file_format == FileFormat::GGJT_3)
+{
+logitsPtr = llama_v3_get_logits(llama_ctx_v3);
+eosID = llama_v3_token_eos();
 }
 else
 {
 logitsPtr = llama_v2_get_logits(llama_ctx_v2);
+eosID = llama_v3_token_eos();
 }
-eosID = llama_token_eos(llama_ctx_v3);
 if (!unbanTokens)
 {
-// set the logit of the eos token (2) to zero to avoid sampling it
-logitsPtr[eosID] = 0;
+// set the logit of the eos token (2) to -INF to avoid sampling it
+logitsPtr[eosID] = -INFINITY;
 }
 if(btsize>0)
 {
 for(int t=0;t<btsize;++t)
 {
-logitsPtr[banned_token_ids[t]]=0;
+logitsPtr[banned_token_ids[t]]=-INFINITY;
 }
 }
 }
@@ -1369,8 +1458,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
 eosID = 50256;
 if(logits.size() > eosID)
 {
-int topid = std::min_element(logits.begin(),logits.end())-logits.begin();
-logits[eosID] = (logits[topid] < 0 ? logits[topid] : 0);
+logits[eosID] = -INFINITY;
 }
 else
 {
@@ -1378,8 +1466,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
 if (file_format == FileFormat::GPT2_3 || file_format == FileFormat::GPT2_4)
 {
 eosID = 0;
-int topid = std::min_element(logits.begin(), logits.end()) - logits.begin();
-logits[eosID] = (logits[topid] < 0 ? logits[topid] : 0);
+logits[eosID] = -INFINITY;
 }
 }
 }
@@ -1397,17 +1484,15 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
 file_format == FileFormat::MPT_1)
 {
 eosID = 0;
-int topid = std::min_element(logits.begin(),logits.end())-logits.begin();
-logits[eosID] = (logits[topid] < 0 ? logits[topid] : 0);
+logits[eosID] = -INFINITY;
 }
 }
 if(btsize>0)
 {
-int topid = std::min_element(logits.begin(), logits.end()) - logits.begin();
 for (int t = 0; t < btsize; ++t)
 {
-logits[banned_token_ids[t]] = (logits[topid] < 0 ? logits[topid] : 0);
+logits[banned_token_ids[t]] = -INFINITY;
 }
 }
 }
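Editor's note, not from the commit: the hunks above replace the old "clamp the logit to 0 or to the current minimum" trick with an -INFINITY mask for the EOS and banned tokens. A minimal sketch of why -INFINITY is the cleaner mask: after softmax it yields exactly zero probability, whereas a clamped finite value only makes the token unlikely. Names below are illustrative only.

// Sketch: masking a logit with -INFINITY removes the token from sampling entirely.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

static std::vector<float> softmax(std::vector<float> logits) {
    float maxv = -INFINITY;
    for (float v : logits) maxv = std::max(maxv, v);   // subtract max for numerical stability
    float sum = 0.0f;
    for (float &v : logits) { v = std::exp(v - maxv); sum += v; }
    for (float &v : logits) v /= sum;
    return logits;
}

int main() {
    std::vector<float> logits = {1.5f, -0.2f, 3.0f};
    logits[2] = -INFINITY;                              // ban token 2 (e.g. the EOS id)
    for (float p : softmax(logits)) std::printf("%.4f ", p);   // token 2 prints 0.0000
    std::printf("\n");
    return 0;
}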

[changed file 2 of 3]

@@ -21,6 +21,7 @@ enum FileFormat
 GGJT=3, // 3=(llama ggjt)
 GGJT_2=4, //newer llama format unshuffled
 GGJT_3=5, //using 16bit scalar
+GGUF_LLAMA=6, //GGUF (llama newest ver)
 GPTJ_1=100, //the very first super old GPTJ format
 GPTJ_2=101, //pygmalion, uses old ggml lib
@@ -47,7 +48,7 @@ enum FileFormat
 MPT_1=500, //first supported mpt version
-GGUF_LLAMA=1000, //GGUF (llama newest ver)
 };
 enum ModelLoadResult

[changed file 3 of 3]

@@ -74,18 +74,18 @@ static void llama_v3_log_callback_default(llama_v3_log_level level, const char *
 // available llama models
-enum e_model {
-MODEL_UNKNOWN,
-MODEL_3B,
-MODEL_7B,
-MODEL_13B,
-MODEL_30B,
-MODEL_65B,
-MODEL_70B,
+enum e_model3 {
+MODEL_UNKNOWN_3,
+MODEL_3B_3,
+MODEL_7B_3,
+MODEL_13B_3,
+MODEL_30B_3,
+MODEL_65B_3,
+MODEL_70B_3,
 };
-static const size_t kB = 1024;
-static const size_t MB = 1024*1024;
+static const size_t kB3 = 1024;
+static const size_t MB3 = 1024*1024;
 // computed for n_ctx == 2048
 // TODO: dynamically determine these sizes
@@ -101,7 +101,7 @@ void llama_v3_nop(struct ggml_tensor * tensor) { // don't offload by default
 // ggml helpers
 //
-static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+static void llv3_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
 struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
 if (plan.work_size > 0) {
@@ -112,76 +112,77 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 ggml_graph_compute(graph, &plan);
 }
 //
 // memory sizes (calculated for n_batch == 512)
 //
-static std::map<e_model, size_t> MEM_REQ_SCRATCH0(int n_ctx)
+static std::map<e_model3, size_t> MEM_REQ_SCRATCH0_3(int n_ctx)
 {
-std::map<e_model, size_t> k_sizes = {
-{ MODEL_3B, ((size_t) n_ctx / 16ull + 156ull) * MB },
-{ MODEL_7B, ((size_t) n_ctx / 16ull + 164ull) * MB },
-{ MODEL_13B, ((size_t) n_ctx / 12ull + 184ull) * MB },
-{ MODEL_30B, ((size_t) n_ctx / 9ull + 224ull) * MB },
-{ MODEL_65B, ((size_t) n_ctx / 6ull + 320ull) * MB }, // guess
-{ MODEL_70B, ((size_t) n_ctx / 7ull + 320ull) * MB },
+std::map<e_model3, size_t> k_sizes = {
+{ MODEL_3B_3, ((size_t) n_ctx / 16ull + 156ull) * MB3 },
+{ MODEL_7B_3, ((size_t) n_ctx / 16ull + 164ull) * MB3 },
+{ MODEL_13B_3, ((size_t) n_ctx / 12ull + 184ull) * MB3 },
+{ MODEL_30B_3, ((size_t) n_ctx / 9ull + 224ull) * MB3 },
+{ MODEL_65B_3, ((size_t) n_ctx / 6ull + 320ull) * MB3 }, // guess
+{ MODEL_70B_3, ((size_t) n_ctx / 7ull + 320ull) * MB3 },
 };
 return k_sizes;
 }
-static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
+static const std::map<e_model3, size_t> & MEM_REQ_SCRATCH1_3()
 {
-static std::map<e_model, size_t> k_sizes = {
-{ MODEL_3B, 192ull * MB },
-{ MODEL_7B, 224ull * MB },
-{ MODEL_13B, 256ull * MB },
-{ MODEL_30B, 320ull * MB },
-{ MODEL_65B, 448ull * MB }, // guess
-{ MODEL_70B, 448ull * MB },
+static std::map<e_model3, size_t> k_sizes = {
+{ MODEL_3B_3, 192ull * MB3 },
+{ MODEL_7B_3, 224ull * MB3 },
+{ MODEL_13B_3, 256ull * MB3 },
+{ MODEL_30B_3, 320ull * MB3 },
+{ MODEL_65B_3, 448ull * MB3 }, // guess
+{ MODEL_70B_3, 448ull * MB3 },
 };
 return k_sizes;
 }
 // used to store the compute graph tensors + non-scratch data
-static const std::map<e_model, size_t> & MEM_REQ_EVAL()
+static const std::map<e_model3, size_t> & MEM_REQ_EVAL_3()
 {
-static std::map<e_model, size_t> k_sizes = {
-{ MODEL_3B, 16ull * MB },
-{ MODEL_7B, 20ull * MB },
-{ MODEL_13B, 24ull * MB },
-{ MODEL_30B, 32ull * MB },
-{ MODEL_65B, 48ull * MB }, // guess
-{ MODEL_70B, 48ull * MB },
+static std::map<e_model3, size_t> k_sizes = {
+{ MODEL_3B_3, 16ull * MB3 },
+{ MODEL_7B_3, 20ull * MB3 },
+{ MODEL_13B_3, 24ull * MB3 },
+{ MODEL_30B_3, 32ull * MB3 },
+{ MODEL_65B_3, 48ull * MB3 }, // guess
+{ MODEL_70B_3, 48ull * MB3 },
 };
 return k_sizes;
 }
 // amount of VRAM needed per batch size to hold temporary results
 // the values for 3b are not derived from testing but instead chosen conservatively
-static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
+static const std::map<e_model3, size_t> & VRAM_REQ_SCRATCH_BASE_3()
 {
-static std::map<e_model, size_t> k_sizes = {
-{ MODEL_3B, 512ull * kB },
-{ MODEL_7B, 512ull * kB },
-{ MODEL_13B, 640ull * kB },
-{ MODEL_30B, 768ull * kB },
-{ MODEL_65B, 1360ull * kB },
-{ MODEL_70B, 1360ull * kB },
+static std::map<e_model3, size_t> k_sizes = {
+{ MODEL_3B_3, 512ull * kB3 },
+{ MODEL_7B_3, 512ull * kB3 },
+{ MODEL_13B_3, 640ull * kB3 },
+{ MODEL_30B_3, 768ull * kB3 },
+{ MODEL_65B_3, 1360ull * kB3 },
+{ MODEL_70B_3, 1360ull * kB3 },
 };
 return k_sizes;
 }
 // amount of VRAM needed per batch size and context to hold temporary results
 // the values for 3b are not derived from testing but instead chosen conservatively
-static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
+static const std::map<e_model3, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT_3()
 {
-static std::map<e_model, size_t> k_sizes = {
-{ MODEL_3B, 128ull },
-{ MODEL_7B, 128ull },
-{ MODEL_13B, 160ull },
-{ MODEL_30B, 208ull },
-{ MODEL_65B, 320ull },
-{ MODEL_70B, 320ull },
+static std::map<e_model3, size_t> k_sizes = {
+{ MODEL_3B_3, 128ull },
+{ MODEL_7B_3, 128ull },
+{ MODEL_13B_3, 160ull },
+{ MODEL_30B_3, 208ull },
+{ MODEL_65B_3, 320ull },
+{ MODEL_70B_3, 320ull },
 };
 return k_sizes;
 }
@@ -288,7 +289,7 @@ struct llama_v3_vocab {
 };
 struct llama_v3_model {
-e_model type = MODEL_UNKNOWN;
+e_model3 type = MODEL_UNKNOWN_3;
 llama_v3_hparams hparams;
@@ -452,13 +453,13 @@ struct llama_v3_state {
 void * log_callback_user_data = nullptr;
 };
 // global state
-static llama_v3_state g_state;
+static llama_v3_state llv3_g_state;
 template <typename T>
 static T checked_mul(T a, T b) {
 T ret = a * b;
 if (a != 0 && ret / a != b) {
-throw std::runtime_error(format("overflow multiplying %llu * %llu",
+throw std::runtime_error(format_old("overflow multiplying %llu * %llu",
 (unsigned long long) a, (unsigned long long) b));
 }
 return ret;
@@ -466,7 +467,7 @@ static T checked_mul(T a, T b) {
 static size_t checked_div(size_t a, size_t b) {
 if (b == 0 || a % b != 0) {
-throw std::runtime_error(format("error dividing %zu / %zu", a, b));
+throw std::runtime_error(format_old("error dividing %zu / %zu", a, b));
 }
 return a / b;
 }
@@ -550,7 +551,7 @@ struct llama_v3_file_loader {
 }
 }
-throw std::runtime_error(format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
+throw std::runtime_error(format_old("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
 magic, version));
 }
 void read_hparams() {
@@ -593,7 +594,7 @@
 file.read_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * n_dims);
 std::string name = file.read_string(name_len);
 if (n_dims < 1 || n_dims > 2) {
-throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims));
+throw std::runtime_error(format_old("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims));
 }
 switch (tensor.type) {
 case GGML_TYPE_F32:
@@ -610,7 +611,7 @@
 case GGML_TYPE_Q6_K:
 break;
 default: {
-throw std::runtime_error(format("unrecognized tensor type %u\n", tensor.type));
+throw std::runtime_error(format_old("unrecognized tensor type %u\n", tensor.type));
 }
 }
@@ -721,11 +722,11 @@
 struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
 auto it = tensors_map.name_to_idx.find(name);
 if (it == tensors_map.name_to_idx.end()) {
-throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str())));
+throw std::runtime_error(std::runtime_error(format_old("llama.cpp: tensor '%s' is missing from model", name.c_str())));
 }
 llama_v3_load_tensor & lt = tensors_map.tensors.at(it->second);
 if (lt.ne != ne) {
-throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
+throw std::runtime_error(format_old("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
 name.c_str(), llama_v3_format_tensor_shape(ne).c_str(), llama_v3_format_tensor_shape(lt.ne).c_str()));
 }
@@ -869,7 +870,7 @@ static bool kv_cache_init(
 const int64_t n_mem = n_layer*n_ctx;
 const int64_t n_elements = n_embd*n_mem;
-cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
+cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB3);
 cache.n = 0;
 struct ggml_init_params params;
@@ -952,7 +953,7 @@ bool llama_v3_mlock_supported() {
 return llama_v3_mlock::SUPPORTED;
 }
-int get_blas_batch_mul(int batch)
+int get_blas_batch_mul3(int batch)
 {
 return (batch>512?(batch>1024?4:2):1);
 }
@@ -1027,14 +1028,14 @@ const char * llama_v3_ftype_name(enum llama_v3_ftype ftype) {
 }
 }
-static const char * llama_v3_model_type_name(e_model type) {
+static const char * llama_v3_model_type_name(e_model3 type) {
 switch (type) {
-case MODEL_3B: return "3B";
-case MODEL_7B: return "7B";
-case MODEL_13B: return "13B";
-case MODEL_30B: return "30B";
-case MODEL_65B: return "65B";
-case MODEL_70B: return "70B";
+case MODEL_3B_3: return "3B";
+case MODEL_7B_3: return "7B";
+case MODEL_13B_3: return "13B";
+case MODEL_30B_3: return "30B";
+case MODEL_65B_3: return "65B";
+case MODEL_70B_3: return "70B";
 default: LLAMA_V3_ASSERT(false);
 }
 }
@@ -1062,7 +1063,7 @@ static void llama_v3_model_load_internal(
 void * progress_callback_user_data) {
 model.t_start_us = ggml_time_us();
-size_t blasbatchmul = get_blas_batch_mul(n_batch);
+size_t blasbatchmul = get_blas_batch_mul3(n_batch);
 std::unique_ptr<llama_v3_model_loader> ml(new llama_v3_model_loader(fname, use_mmap));
@@ -1078,15 +1079,15 @@
 {
 switch (hparams.n_layer) {
-case 26: model.type = e_model::MODEL_3B; break;
-case 32: model.type = e_model::MODEL_7B; break;
-case 40: model.type = e_model::MODEL_13B; break;
-case 60: model.type = e_model::MODEL_30B; break;
-case 80: model.type = e_model::MODEL_65B; break;
+case 26: model.type = e_model3::MODEL_3B_3; break;
+case 32: model.type = e_model3::MODEL_7B_3; break;
+case 40: model.type = e_model3::MODEL_13B_3; break;
+case 60: model.type = e_model3::MODEL_30B_3; break;
+case 80: model.type = e_model3::MODEL_65B_3; break;
 default:
 {
 if (hparams.n_layer < 32) {
-model.type = e_model::MODEL_7B;
+model.type = e_model3::MODEL_7B_3;
 }
 } break;
 }
@@ -1096,15 +1097,15 @@
 // LLaMAv2
 // TODO: temporary until GGUF
 //patch for llama2 gqa
-if (model.type == e_model::MODEL_65B && (hparams.n_mult >= 4096 && hparams.n_mult != 5504)) {
+if (model.type == e_model3::MODEL_65B_3 && (hparams.n_mult >= 4096 && hparams.n_mult != 5504)) {
 fprintf(stderr, "%s: Applying KCPP Patch for 70B model, setting GQA to 8\n", __func__);
 n_gqa = 8;
 }
 LLAMA_V3_ASSERT(hparams.n_head % n_gqa == 0);
 hparams.n_head_kv = hparams.n_head / n_gqa;
-if (model.type == e_model::MODEL_65B && n_gqa == 8) {
+if (model.type == e_model3::MODEL_65B_3 && n_gqa == 8) {
 LLAMA_V3_LOG_WARN("%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
-model.type = e_model::MODEL_70B;
+model.type = e_model3::MODEL_70B_3;
 hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model
 }
@@ -1180,7 +1181,7 @@
 model.ctx = ggml_init(params);
 if (!model.ctx) {
-throw std::runtime_error(format("ggml_init() failed"));
+throw std::runtime_error(format_old("ggml_init() failed"));
 }
 }
@@ -1289,9 +1290,9 @@
 #ifndef LLAMA_V3_USE_ALLOCATOR
 mem_required +=
-blasbatchmul*MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
-blasbatchmul*MEM_REQ_SCRATCH1().at(model.type) +
-blasbatchmul*MEM_REQ_EVAL().at(model.type);
+blasbatchmul*MEM_REQ_SCRATCH0_3(hparams.n_ctx).at(model.type) +
+blasbatchmul*MEM_REQ_SCRATCH1_3().at(model.type) +
+blasbatchmul*MEM_REQ_EVAL_3().at(model.type);
 #endif
 // this is the memory required by one llama_v3_state
@@ -1308,8 +1309,8 @@
 LLAMA_V3_LOG_INFO("%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
 ggml_cuda_set_scratch_size(0); // disable scratch
 } else {
-const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE().at(model.type);
-const size_t vram_scratch_per_context = VRAM_REQ_SCRATCH_PER_CONTEXT().at(model.type);
+const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE_3().at(model.type);
+const size_t vram_scratch_per_context = VRAM_REQ_SCRATCH_PER_CONTEXT_3().at(model.type);
 vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context);
 ggml_cuda_set_scratch_size(vram_scratch);
 if (n_gpu_layers > 0) {
@@ -1872,10 +1873,10 @@ static bool llama_v3_eval_internal(
 ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
 }
 } else {
-ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
+llv3_graph_compute_helper(lctx.work_buffer, gf, n_threads);
 }
 #else
-ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
+llv3_graph_compute_helper(lctx.work_buffer, gf, n_threads);
 #endif
 #if GGML_USE_MPI
@@ -1939,7 +1940,7 @@ static bool llama_v3_eval_internal(
 // tokenizer
 //
-static size_t utf8_len(char src) {
+static size_t utf8_len3(char src) {
 const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
 uint8_t highbits = static_cast<uint8_t>(src) >> 4;
 return lookup[highbits];
@@ -1980,7 +1981,7 @@ struct llama_v3_tokenizer {
 size_t offs = 0;
 while (offs < text.size()) {
 llama_v3_sp_symbol sym;
-size_t char_len = std::min(text.size() - offs, utf8_len(text[offs]));
+size_t char_len = std::min(text.size() - offs, utf8_len3(text[offs]));
 sym.text = text.c_str() + offs;
 sym.n = char_len;
 offs += char_len;
@@ -2076,6 +2077,24 @@ private:
 llama_v3_sp_bigram::queue work_queue_;
 };
+std::vector<llama_token> llama_v3_tokenize(
+struct llama_v3_context * ctx,
+const std::string & text,
+bool add_bos) {
+// upper limit for the number of tokens
+int n_tokens = text.length() + add_bos;
+std::vector<llama_token> result(n_tokens);
+n_tokens = llama_v3_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
+if (n_tokens < 0) {
+result.resize(-n_tokens);
+int check = llama_v3_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
+GGML_ASSERT(check == -n_tokens);
+} else {
+result.resize(n_tokens);
+}
+return result;
+}
 static std::vector<llama_v3_vocab::id> llama_v3_tokenize(const llama_v3_vocab & vocab, const std::string & text, bool bos) {
 llama_v3_tokenizer tokenizer(vocab);
 std::vector<llama_v3_vocab::id> output;
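Editor's note, not from the commit: the llama_v3_tokenize wrapper added in the hunk above uses the common "probe, then resize" idiom for C-style APIs: call once with a guessed buffer, and if the call reports a negative value (the required count), resize and call again. A self-contained sketch of the same idiom; tokenize_into() is a made-up stand-in, not the real C API.

// Sketch of the probe-then-resize tokenization pattern (illustrative names only).
#include <cassert>
#include <cstdio>
#include <string>
#include <vector>

// Fake C-style tokenizer: writes up to n_max ids and returns the count,
// or -needed when the buffer is too small.
static int tokenize_into(const std::string &text, int *out, int n_max) {
    int needed = (int) text.size() / 4 + 1;   // pretend "token" count
    if (needed > n_max) return -needed;
    for (int i = 0; i < needed; ++i) out[i] = i;
    return needed;
}

static std::vector<int> tokenize(const std::string &text) {
    std::vector<int> result(8);               // optimistic first guess
    int n = tokenize_into(text, result.data(), (int) result.size());
    if (n < 0) {
        result.resize(-n);                    // grow to the reported size
        int check = tokenize_into(text, result.data(), (int) result.size());
        assert(check == -n);                  // second pass must succeed exactly
        n = check;
    }
    result.resize(n);
    return result;
}

int main() {
    std::printf("%zu tokens\n", tokenize(std::string(100, 'x')).size());
    return 0;
}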
@@ -3010,10 +3029,10 @@ static void llama_v3_convert_tensor_internal(const llama_v3_load_tensor & tensor
 if (ggml_is_quantized(tensor.type)) {
 qtype = ggml_internal_get_type_traits(tensor.type);
 if (qtype.to_float == NULL) {
-throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
+throw std::runtime_error(format_old("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
 }
 } else if (tensor.type != GGML_TYPE_F16) {
-throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor.type)));
+throw std::runtime_error(format_old("cannot dequantize/convert tensor type %s", ggml_type_name(tensor.type)));
 }
 if (nthread < 2) {
@@ -3084,7 +3103,7 @@ static void llama_v3_model_quantize_internal(const std::string & fname_inp, cons
 case LLAMA_V3_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
 case LLAMA_V3_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
 #endif
-default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
+default: throw std::runtime_error(format_old("invalid output file type %d\n", ftype));
 }
 if (nthread <= 0) {
@@ -3209,7 +3228,7 @@ static void llama_v3_model_quantize_internal(const std::string & fname_inp, cons
 if (tensor.type == GGML_TYPE_F32) {
 f32_data = (float *) tensor.data;
 } else if (ggml_is_quantized(tensor.type) && !params->allow_requantize) {
-throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor.type)));
+throw std::runtime_error(format_old("requantizing from type %s is disabled", ggml_type_name(tensor.type)));
 } else {
 llama_v3_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
 f32_data = (float *) f32_conv_buf.addr;
@@ -3348,7 +3367,7 @@ struct llama_v3_context * llama_v3_new_context_with_model(
 params.seed = time(NULL);
 }
-size_t blasbatchmul = get_blas_batch_mul(params.n_batch);
+size_t blasbatchmul = get_blas_batch_mul3(params.n_batch);
 unsigned cur_percentage = 0;
 if (params.progress_callback == NULL) {
@@ -3430,9 +3449,9 @@
 // debug - for comparison with scratch buffer
 //size_t prev_req =
-// MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type) +
-// MEM_REQ_SCRATCH1().at(ctx->model.type) +
-// MEM_REQ_EVAL().at(ctx->model.type);
+// MEM_REQ_SCRATCH0_3(hparams.n_ctx).at(ctx->model.type) +
+// MEM_REQ_SCRATCH1_3().at(ctx->model.type) +
+// MEM_REQ_EVAL_3().at(ctx->model.type);
 //LLAMA_V3_LOG_INFO("%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0);
 // recreate allocator with exact memory requirements
@@ -3447,12 +3466,12 @@
 #endif
 }
 #else
-ctx->buf_compute.resize(blasbatchmul*MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
+ctx->buf_compute.resize(blasbatchmul*MEM_REQ_EVAL_3().at(ctx->model.type) + ggml_graph_overhead());
 #endif
 #ifdef LLAMA_V3_USE_SCRATCH
-ctx->buf_scratch[0].resize(blasbatchmul*MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
-ctx->buf_scratch[1].resize(blasbatchmul*MEM_REQ_SCRATCH1().at(ctx->model.type));
+ctx->buf_scratch[0].resize(blasbatchmul*MEM_REQ_SCRATCH0_3(hparams.n_ctx).at(ctx->model.type));
+ctx->buf_scratch[1].resize(blasbatchmul*MEM_REQ_SCRATCH1_3().at(ctx->model.type));
 #endif
 }
@@ -3711,7 +3730,7 @@ int llama_v3_apply_lora_from_file_internal(const struct llama_v3_model & model,
 #ifdef GGML_USE_CUBLAS
 if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) {
 if (dest_t->type != GGML_TYPE_F16) {
-throw std::runtime_error(format(
+throw std::runtime_error(format_old(
 "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models", __func__));
 }
 offload_func = ggml_cuda_assign_buffers;
@@ -3791,7 +3810,7 @@
 struct ggml_cgraph gf = ggml_build_forward(r);
-ggml_graph_compute_helper(work_buffer, &gf, n_threads);
+llv3_graph_compute_helper(work_buffer, &gf, n_threads);
 // we won't need these tensors again, reset the context to save memory
 ggml_free(lora_ctx);
@@ -3977,7 +3996,7 @@ void llama_v3_copy_state_data_internal(struct llama_v3_context * ctx, llama_v3_d
 ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
 ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
-ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
+llv3_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
 ggml_free(cpy_ctx);
@@ -4087,7 +4106,7 @@ size_t llama_v3_set_state_data(struct llama_v3_context * ctx, uint8_t * src) {
 ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
 ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
-ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
+llv3_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
 ggml_free(cpy_ctx);
 }
@@ -4419,8 +4438,8 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_v3_intern
 void llama_v3_log_set(llama_v3_log_callback log_callback, void * user_data) {
-g_state.log_callback = log_callback ? log_callback : llama_v3_log_callback_default;
-g_state.log_callback_user_data = user_data;
+llv3_g_state.log_callback = log_callback ? log_callback : llama_v3_log_callback_default;
+llv3_g_state.log_callback_user_data = user_data;
 }
 #if defined(_MSC_VER) && !defined(vsnprintf)
@@ -4433,12 +4452,12 @@ static void llama_v3_log_internal_v(llama_v3_log_level level, const char * forma
 char buffer[128];
 int len = vsnprintf(buffer, 128, format, args);
 if (len < 128) {
-g_state.log_callback(level, buffer, g_state.log_callback_user_data);
+llv3_g_state.log_callback(level, buffer, llv3_g_state.log_callback_user_data);
 } else {
 char* buffer2 = new char[len+1];
 vsnprintf(buffer2, len+1, format, args_copy);
 buffer2[len] = 0;
-g_state.log_callback(level, buffer2, g_state.log_callback_user_data);
+llv3_g_state.log_callback(level, buffer2, llv3_g_state.log_callback_user_data);
 delete[] buffer2;
 }
 va_end(args_copy);