From 981c9131f0f20c10099735c1e353534b5bfe1e59 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Wed, 23 Aug 2023 16:07:07 +0800 Subject: [PATCH] gguf for llama is working --- gpttype_adapter.cpp | 151 ++++++++++++++++++++------ model_adapter.h | 3 +- otherarch/llama_v3.cpp | 235 ++++++++++++++++++++++------------------- 3 files changed, 247 insertions(+), 142 deletions(-) diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index a4449521e..9eb7ede2c 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -14,6 +14,7 @@ //for easier compilation //concat source files into one file for compilation purposes #include "llama_v2.cpp" +#include "llama_v3.cpp" #include "llama.cpp" #include "utils.cpp" #include "gptj_v1.cpp" @@ -59,10 +60,9 @@ static mpt_model mpt_ctx_v3; static rwkv_v2_context * rwkv_ctx_v2; static rwkv_context * rwkv_ctx_v3; -static llama_v2_context_params llama_ctx_params_v2; -static llama_context_params llama_ctx_params; static llama_v2_context * llama_ctx_v2; -static llama_context * llama_ctx_v3; +static llama_v3_context * llama_ctx_v3; +static llama_context * llama_ctx_v4; static gpt_params params; static int n_past = 0; @@ -324,9 +324,13 @@ static std::string FileFormatTokenizeID(int id, FileFormat file_format) { return std::string(llama_v2_token_to_str(llama_ctx_v2, id)); } - else if (file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA) + else if (file_format == FileFormat::GGJT_3) { - return std::string(llama_token_to_str(llama_ctx_v3, id)); + return std::string(llama_v3_token_to_str(llama_ctx_v3, id)); + } + else if( file_format == FileFormat::GGUF_LLAMA) + { + return std::string(llama_token_to_str(llama_ctx_v4, id)); } else { @@ -423,8 +427,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in { //newer format has bit unshuffling SetQuantsUnshuffled(file_format == FileFormat::GGJT_2); - - llama_ctx_params_v2 = llama_v2_context_default_params(); + llama_v2_context_params llama_ctx_params_v2 = llama_v2_context_default_params(); llama_ctx_params_v2.n_ctx = inputs.max_context_length; //llama_ctx_params.n_parts = -1; llama_ctx_params_v2.seed = -1; @@ -470,9 +473,9 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in llama_v2_eval(llama_ctx_v2, tmp.data(), tmp.size(), 0, params.n_threads); return ModelLoadResult::SUCCESS; } - else if(file_format == FileFormat::GGJT_3 || file_format==FileFormat::GGUF_LLAMA) + else if(file_format == FileFormat::GGJT_3) { - llama_ctx_params = llama_context_default_params(); + llama_v3_context_params llama_ctx_params = llama_v3_context_default_params(); llama_ctx_params.n_ctx = inputs.max_context_length; //llama_ctx_paran_parts = -1; llama_ctx_params.seed = -1; @@ -503,7 +506,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in } #endif - llama_ctx_v3 = llama_init_from_file(modelname.c_str(), llama_ctx_params); + llama_ctx_v3 = llama_v3_init_from_file(modelname.c_str(), llama_ctx_params); if (llama_ctx_v3 == NULL) { @@ -520,7 +523,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in lora_base_arg = lora_base.c_str(); } - int err = llama_apply_lora_from_file(llama_ctx_v3, + int err = llama_v3_apply_lora_from_file(llama_ctx_v3, lora_filename.c_str(), lora_base_arg, n_threads); @@ -533,7 +536,77 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in //determine mem per token const std::vector tmp = {1, 2, 3, 4}; - auto er = llama_eval(llama_ctx_v3, tmp.data(), tmp.size(), 0, params.n_threads); + auto er = llama_v3_eval(llama_ctx_v3, tmp.data(), tmp.size(), 0, params.n_threads); + if(er!=0) + { + printf("\nLLAMA EVAL returned nonzero!\n"); + } + return ModelLoadResult::SUCCESS; + } + else if(file_format==FileFormat::GGUF_LLAMA) + { + llama_context_params llama_ctx_params = llama_context_default_params(); + llama_ctx_params.n_ctx = inputs.max_context_length; + //llama_ctx_paran_parts = -1; + llama_ctx_params.seed = -1; + llama_ctx_params.f16_kv = inputs.f16_kv; + llama_ctx_params.low_vram = inputs.low_vram; + llama_ctx_params.mul_mat_q = inputs.use_mmq; + llama_ctx_params.logits_all = false; + llama_ctx_params.use_mmap = inputs.use_mmap; + llama_ctx_params.use_mlock = inputs.use_mlock; + llama_ctx_params.n_gpu_layers = inputs.gpulayers; + llama_ctx_params.main_gpu = cu_parseinfo_maindevice; + llama_ctx_params.rope_freq_base = rope_freq_base; + llama_ctx_params.rope_freq_scale = rope_freq_scale; + llama_ctx_params.n_batch = blasbatchsize; + + #if defined(GGML_USE_CUBLAS) + bool ts_all_zero = true; + for (int i = 0; i < tensor_split_max; ++i) { + if (inputs.tensor_split[i] != 0.0f) { + ts_all_zero = false; + break; + } + } + if(!ts_all_zero) + { + llama_ctx_params.tensor_split = inputs.tensor_split; + printf("CUBLAS: Applying Custom Tensor Split!\n"); + } + #endif + + llama_ctx_v4 = llama_init_from_file(modelname.c_str(), llama_ctx_params); + + if (llama_ctx_v4 == NULL) + { + fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, modelname.c_str()); + return ModelLoadResult::FAIL; + } + if (lora_filename != "") + { + printf("\nAttempting to apply LORA adapter: %s\n", lora_filename.c_str()); + + const char * lora_base_arg = NULL; + if (lora_base != "") { + printf("Using LORA base model: %s\n", lora_base.c_str()); + lora_base_arg = lora_base.c_str(); + } + + int err = llama_apply_lora_from_file(llama_ctx_v4, + lora_filename.c_str(), + lora_base_arg, + n_threads); + if (err != 0) + { + fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); + return ModelLoadResult::FAIL; + } + } + + //determine mem per token + const std::vector tmp = {1, 2, 3, 4}; + auto er = llama_eval(llama_ctx_v4, tmp.data(), tmp.size(), 0, params.n_threads); if(er!=0) { printf("\nLLAMA EVAL returned nonzero!\n"); @@ -949,7 +1022,6 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA) { - params.prompt.insert(0, 1, ' '); if(file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 ) { embd_inp = ::llama_v2_tokenize(llama_ctx_v2, params.prompt, true); @@ -958,9 +1030,13 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o { embd_inp = ::legacy_llama_v2_tokenize(llama_ctx_v2, params.prompt, true); } + else if (file_format == FileFormat::GGJT_3) + { + embd_inp = ::llama_v3_tokenize(llama_ctx_v3, params.prompt, true); + } else { - embd_inp = ::llama_tokenize(llama_ctx_v3, params.prompt, true); + embd_inp = ::llama_tokenize(llama_ctx_v4, params.prompt, true); } } else @@ -1067,9 +1143,13 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o { n_vocab = llama_v2_n_vocab(llama_ctx_v2); } - else if(file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA) + else if(file_format == FileFormat::GGJT_3) { - n_vocab = llama_n_vocab(llama_ctx_v3); + n_vocab = llama_v3_n_vocab(llama_ctx_v3); + } + else if(file_format == FileFormat::GGUF_LLAMA) + { + n_vocab = llama_n_vocab(llama_ctx_v4); } else if (file_format == FileFormat::GPTJ_1 || file_format == FileFormat::GPTJ_2) { @@ -1214,9 +1294,13 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o { evalres = (llama_v2_eval(llama_ctx_v2, embd.data(), embdsize, n_past, params.n_threads)==0); } - else if(file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA) + else if(file_format == FileFormat::GGJT_3) { - evalres = (llama_eval(llama_ctx_v3, embd.data(), embdsize, n_past, params.n_threads)==0); + evalres = (llama_v3_eval(llama_ctx_v3, embd.data(), embdsize, n_past, params.n_threads)==0); + } + else if(file_format == FileFormat::GGUF_LLAMA) + { + evalres = (llama_eval(llama_ctx_v4, embd.data(), embdsize, n_past, params.n_threads)==0); } else if(file_format==FileFormat::RWKV_1 || file_format==FileFormat::RWKV_2) { @@ -1324,28 +1408,33 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o int btsize = banned_token_ids.size(); if(file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA) { - if(file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA) + if(file_format == FileFormat::GGUF_LLAMA) { - logitsPtr = llama_get_logits(llama_ctx_v3); + logitsPtr = llama_get_logits(llama_ctx_v4); + eosID = llama_token_eos(llama_ctx_v4); + } + else if(file_format == FileFormat::GGJT_3) + { + logitsPtr = llama_v3_get_logits(llama_ctx_v3); + eosID = llama_v3_token_eos(); } else { logitsPtr = llama_v2_get_logits(llama_ctx_v2); + eosID = llama_v3_token_eos(); } - eosID = llama_token_eos(llama_ctx_v3); - if (!unbanTokens) { - // set the logit of the eos token (2) to zero to avoid sampling it - logitsPtr[eosID] = 0; + // set the logit of the eos token (2) to -INF to avoid sampling it + logitsPtr[eosID] = -INFINITY; } if(btsize>0) { for(int t=0;t eosID) { - int topid = std::min_element(logits.begin(),logits.end())-logits.begin(); - logits[eosID] = (logits[topid] < 0 ? logits[topid] : 0); + logits[eosID] = -INFINITY; } else { @@ -1378,8 +1466,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o if (file_format == FileFormat::GPT2_3 || file_format == FileFormat::GPT2_4) { eosID = 0; - int topid = std::min_element(logits.begin(), logits.end()) - logits.begin(); - logits[eosID] = (logits[topid] < 0 ? logits[topid] : 0); + logits[eosID] = -INFINITY; } } } @@ -1397,17 +1484,15 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o file_format == FileFormat::MPT_1) { eosID = 0; - int topid = std::min_element(logits.begin(),logits.end())-logits.begin(); - logits[eosID] = (logits[topid] < 0 ? logits[topid] : 0); + logits[eosID] = -INFINITY; } } if(btsize>0) { - int topid = std::min_element(logits.begin(), logits.end()) - logits.begin(); for (int t = 0; t < btsize; ++t) { - logits[banned_token_ids[t]] = (logits[topid] < 0 ? logits[topid] : 0); + logits[banned_token_ids[t]] = -INFINITY; } } } diff --git a/model_adapter.h b/model_adapter.h index 2974d3455..f4e8a7034 100644 --- a/model_adapter.h +++ b/model_adapter.h @@ -21,6 +21,7 @@ enum FileFormat GGJT=3, // 3=(llama ggjt) GGJT_2=4, //newer llama format unshuffled GGJT_3=5, //using 16bit scalar + GGUF_LLAMA=6, //GGUF (llama newest ver) GPTJ_1=100, //the very first super old GPTJ format GPTJ_2=101, //pygmalion, uses old ggml lib @@ -47,7 +48,7 @@ enum FileFormat MPT_1=500, //first supported mpt version - GGUF_LLAMA=1000, //GGUF (llama newest ver) + }; enum ModelLoadResult diff --git a/otherarch/llama_v3.cpp b/otherarch/llama_v3.cpp index f40d3e742..bfe05cd53 100644 --- a/otherarch/llama_v3.cpp +++ b/otherarch/llama_v3.cpp @@ -74,18 +74,18 @@ static void llama_v3_log_callback_default(llama_v3_log_level level, const char * // available llama models -enum e_model { - MODEL_UNKNOWN, - MODEL_3B, - MODEL_7B, - MODEL_13B, - MODEL_30B, - MODEL_65B, - MODEL_70B, +enum e_model3 { + MODEL_UNKNOWN_3, + MODEL_3B_3, + MODEL_7B_3, + MODEL_13B_3, + MODEL_30B_3, + MODEL_65B_3, + MODEL_70B_3, }; -static const size_t kB = 1024; -static const size_t MB = 1024*1024; +static const size_t kB3 = 1024; +static const size_t MB3 = 1024*1024; // computed for n_ctx == 2048 // TODO: dynamically determine these sizes @@ -101,7 +101,7 @@ void llama_v3_nop(struct ggml_tensor * tensor) { // don't offload by default // ggml helpers // -static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { +static void llv3_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); if (plan.work_size > 0) { @@ -112,76 +112,77 @@ static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * ggml_graph_compute(graph, &plan); } + // // memory sizes (calculated for n_batch == 512) // -static std::map MEM_REQ_SCRATCH0(int n_ctx) +static std::map MEM_REQ_SCRATCH0_3(int n_ctx) { - std::map k_sizes = { - { MODEL_3B, ((size_t) n_ctx / 16ull + 156ull) * MB }, - { MODEL_7B, ((size_t) n_ctx / 16ull + 164ull) * MB }, - { MODEL_13B, ((size_t) n_ctx / 12ull + 184ull) * MB }, - { MODEL_30B, ((size_t) n_ctx / 9ull + 224ull) * MB }, - { MODEL_65B, ((size_t) n_ctx / 6ull + 320ull) * MB }, // guess - { MODEL_70B, ((size_t) n_ctx / 7ull + 320ull) * MB }, + std::map k_sizes = { + { MODEL_3B_3, ((size_t) n_ctx / 16ull + 156ull) * MB3 }, + { MODEL_7B_3, ((size_t) n_ctx / 16ull + 164ull) * MB3 }, + { MODEL_13B_3, ((size_t) n_ctx / 12ull + 184ull) * MB3 }, + { MODEL_30B_3, ((size_t) n_ctx / 9ull + 224ull) * MB3 }, + { MODEL_65B_3, ((size_t) n_ctx / 6ull + 320ull) * MB3 }, // guess + { MODEL_70B_3, ((size_t) n_ctx / 7ull + 320ull) * MB3 }, }; return k_sizes; } -static const std::map & MEM_REQ_SCRATCH1() +static const std::map & MEM_REQ_SCRATCH1_3() { - static std::map k_sizes = { - { MODEL_3B, 192ull * MB }, - { MODEL_7B, 224ull * MB }, - { MODEL_13B, 256ull * MB }, - { MODEL_30B, 320ull * MB }, - { MODEL_65B, 448ull * MB }, // guess - { MODEL_70B, 448ull * MB }, + static std::map k_sizes = { + { MODEL_3B_3, 192ull * MB3 }, + { MODEL_7B_3, 224ull * MB3 }, + { MODEL_13B_3, 256ull * MB3 }, + { MODEL_30B_3, 320ull * MB3 }, + { MODEL_65B_3, 448ull * MB3 }, // guess + { MODEL_70B_3, 448ull * MB3 }, }; return k_sizes; } // used to store the compute graph tensors + non-scratch data -static const std::map & MEM_REQ_EVAL() +static const std::map & MEM_REQ_EVAL_3() { - static std::map k_sizes = { - { MODEL_3B, 16ull * MB }, - { MODEL_7B, 20ull * MB }, - { MODEL_13B, 24ull * MB }, - { MODEL_30B, 32ull * MB }, - { MODEL_65B, 48ull * MB }, // guess - { MODEL_70B, 48ull * MB }, + static std::map k_sizes = { + { MODEL_3B_3, 16ull * MB3 }, + { MODEL_7B_3, 20ull * MB3 }, + { MODEL_13B_3, 24ull * MB3 }, + { MODEL_30B_3, 32ull * MB3 }, + { MODEL_65B_3, 48ull * MB3 }, // guess + { MODEL_70B_3, 48ull * MB3 }, }; return k_sizes; } // amount of VRAM needed per batch size to hold temporary results // the values for 3b are not derived from testing but instead chosen conservatively -static const std::map & VRAM_REQ_SCRATCH_BASE() +static const std::map & VRAM_REQ_SCRATCH_BASE_3() { - static std::map k_sizes = { - { MODEL_3B, 512ull * kB }, - { MODEL_7B, 512ull * kB }, - { MODEL_13B, 640ull * kB }, - { MODEL_30B, 768ull * kB }, - { MODEL_65B, 1360ull * kB }, - { MODEL_70B, 1360ull * kB }, + static std::map k_sizes = { + { MODEL_3B_3, 512ull * kB3 }, + { MODEL_7B_3, 512ull * kB3 }, + { MODEL_13B_3, 640ull * kB3 }, + { MODEL_30B_3, 768ull * kB3 }, + { MODEL_65B_3, 1360ull * kB3 }, + { MODEL_70B_3, 1360ull * kB3 }, }; return k_sizes; } // amount of VRAM needed per batch size and context to hold temporary results // the values for 3b are not derived from testing but instead chosen conservatively -static const std::map & VRAM_REQ_SCRATCH_PER_CONTEXT() +static const std::map & VRAM_REQ_SCRATCH_PER_CONTEXT_3() { - static std::map k_sizes = { - { MODEL_3B, 128ull }, - { MODEL_7B, 128ull }, - { MODEL_13B, 160ull }, - { MODEL_30B, 208ull }, - { MODEL_65B, 320ull }, - { MODEL_70B, 320ull }, + static std::map k_sizes = { + { MODEL_3B_3, 128ull }, + { MODEL_7B_3, 128ull }, + { MODEL_13B_3, 160ull }, + { MODEL_30B_3, 208ull }, + { MODEL_65B_3, 320ull }, + { MODEL_70B_3, 320ull }, }; return k_sizes; } @@ -288,7 +289,7 @@ struct llama_v3_vocab { }; struct llama_v3_model { - e_model type = MODEL_UNKNOWN; + e_model3 type = MODEL_UNKNOWN_3; llama_v3_hparams hparams; @@ -452,13 +453,13 @@ struct llama_v3_state { void * log_callback_user_data = nullptr; }; // global state -static llama_v3_state g_state; +static llama_v3_state llv3_g_state; template static T checked_mul(T a, T b) { T ret = a * b; if (a != 0 && ret / a != b) { - throw std::runtime_error(format("overflow multiplying %llu * %llu", + throw std::runtime_error(format_old("overflow multiplying %llu * %llu", (unsigned long long) a, (unsigned long long) b)); } return ret; @@ -466,7 +467,7 @@ static T checked_mul(T a, T b) { static size_t checked_div(size_t a, size_t b) { if (b == 0 || a % b != 0) { - throw std::runtime_error(format("error dividing %zu / %zu", a, b)); + throw std::runtime_error(format_old("error dividing %zu / %zu", a, b)); } return a / b; } @@ -550,7 +551,7 @@ struct llama_v3_file_loader { } } - throw std::runtime_error(format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?", + throw std::runtime_error(format_old("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?", magic, version)); } void read_hparams() { @@ -593,7 +594,7 @@ struct llama_v3_file_loader { file.read_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * n_dims); std::string name = file.read_string(name_len); if (n_dims < 1 || n_dims > 2) { - throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims)); + throw std::runtime_error(format_old("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims)); } switch (tensor.type) { case GGML_TYPE_F32: @@ -610,7 +611,7 @@ struct llama_v3_file_loader { case GGML_TYPE_Q6_K: break; default: { - throw std::runtime_error(format("unrecognized tensor type %u\n", tensor.type)); + throw std::runtime_error(format_old("unrecognized tensor type %u\n", tensor.type)); } } @@ -721,11 +722,11 @@ struct llama_v3_model_loader { struct ggml_tensor * get_tensor(const std::string & name, const std::vector & ne, ggml_backend backend) { auto it = tensors_map.name_to_idx.find(name); if (it == tensors_map.name_to_idx.end()) { - throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str()))); + throw std::runtime_error(std::runtime_error(format_old("llama.cpp: tensor '%s' is missing from model", name.c_str()))); } llama_v3_load_tensor & lt = tensors_map.tensors.at(it->second); if (lt.ne != ne) { - throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s", + throw std::runtime_error(format_old("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s", name.c_str(), llama_v3_format_tensor_shape(ne).c_str(), llama_v3_format_tensor_shape(lt.ne).c_str())); } @@ -869,7 +870,7 @@ static bool kv_cache_init( const int64_t n_mem = n_layer*n_ctx; const int64_t n_elements = n_embd*n_mem; - cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB); + cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB3); cache.n = 0; struct ggml_init_params params; @@ -952,7 +953,7 @@ bool llama_v3_mlock_supported() { return llama_v3_mlock::SUPPORTED; } -int get_blas_batch_mul(int batch) +int get_blas_batch_mul3(int batch) { return (batch>512?(batch>1024?4:2):1); } @@ -1027,14 +1028,14 @@ const char * llama_v3_ftype_name(enum llama_v3_ftype ftype) { } } -static const char * llama_v3_model_type_name(e_model type) { +static const char * llama_v3_model_type_name(e_model3 type) { switch (type) { - case MODEL_3B: return "3B"; - case MODEL_7B: return "7B"; - case MODEL_13B: return "13B"; - case MODEL_30B: return "30B"; - case MODEL_65B: return "65B"; - case MODEL_70B: return "70B"; + case MODEL_3B_3: return "3B"; + case MODEL_7B_3: return "7B"; + case MODEL_13B_3: return "13B"; + case MODEL_30B_3: return "30B"; + case MODEL_65B_3: return "65B"; + case MODEL_70B_3: return "70B"; default: LLAMA_V3_ASSERT(false); } } @@ -1062,7 +1063,7 @@ static void llama_v3_model_load_internal( void * progress_callback_user_data) { model.t_start_us = ggml_time_us(); - size_t blasbatchmul = get_blas_batch_mul(n_batch); + size_t blasbatchmul = get_blas_batch_mul3(n_batch); std::unique_ptr ml(new llama_v3_model_loader(fname, use_mmap)); @@ -1078,15 +1079,15 @@ static void llama_v3_model_load_internal( { switch (hparams.n_layer) { - case 26: model.type = e_model::MODEL_3B; break; - case 32: model.type = e_model::MODEL_7B; break; - case 40: model.type = e_model::MODEL_13B; break; - case 60: model.type = e_model::MODEL_30B; break; - case 80: model.type = e_model::MODEL_65B; break; + case 26: model.type = e_model3::MODEL_3B_3; break; + case 32: model.type = e_model3::MODEL_7B_3; break; + case 40: model.type = e_model3::MODEL_13B_3; break; + case 60: model.type = e_model3::MODEL_30B_3; break; + case 80: model.type = e_model3::MODEL_65B_3; break; default: { if (hparams.n_layer < 32) { - model.type = e_model::MODEL_7B; + model.type = e_model3::MODEL_7B_3; } } break; } @@ -1096,15 +1097,15 @@ static void llama_v3_model_load_internal( // LLaMAv2 // TODO: temporary until GGUF //patch for llama2 gqa - if (model.type == e_model::MODEL_65B && (hparams.n_mult >= 4096 && hparams.n_mult != 5504)) { + if (model.type == e_model3::MODEL_65B_3 && (hparams.n_mult >= 4096 && hparams.n_mult != 5504)) { fprintf(stderr, "%s: Applying KCPP Patch for 70B model, setting GQA to 8\n", __func__); n_gqa = 8; } LLAMA_V3_ASSERT(hparams.n_head % n_gqa == 0); hparams.n_head_kv = hparams.n_head / n_gqa; - if (model.type == e_model::MODEL_65B && n_gqa == 8) { + if (model.type == e_model3::MODEL_65B_3 && n_gqa == 8) { LLAMA_V3_LOG_WARN("%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa); - model.type = e_model::MODEL_70B; + model.type = e_model3::MODEL_70B_3; hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model } @@ -1180,7 +1181,7 @@ static void llama_v3_model_load_internal( model.ctx = ggml_init(params); if (!model.ctx) { - throw std::runtime_error(format("ggml_init() failed")); + throw std::runtime_error(format_old("ggml_init() failed")); } } @@ -1289,9 +1290,9 @@ static void llama_v3_model_load_internal( #ifndef LLAMA_V3_USE_ALLOCATOR mem_required += - blasbatchmul*MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) + - blasbatchmul*MEM_REQ_SCRATCH1().at(model.type) + - blasbatchmul*MEM_REQ_EVAL().at(model.type); + blasbatchmul*MEM_REQ_SCRATCH0_3(hparams.n_ctx).at(model.type) + + blasbatchmul*MEM_REQ_SCRATCH1_3().at(model.type) + + blasbatchmul*MEM_REQ_EVAL_3().at(model.type); #endif // this is the memory required by one llama_v3_state @@ -1308,8 +1309,8 @@ static void llama_v3_model_load_internal( LLAMA_V3_LOG_INFO("%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__); ggml_cuda_set_scratch_size(0); // disable scratch } else { - const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE().at(model.type); - const size_t vram_scratch_per_context = VRAM_REQ_SCRATCH_PER_CONTEXT().at(model.type); + const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE_3().at(model.type); + const size_t vram_scratch_per_context = VRAM_REQ_SCRATCH_PER_CONTEXT_3().at(model.type); vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context); ggml_cuda_set_scratch_size(vram_scratch); if (n_gpu_layers > 0) { @@ -1872,10 +1873,10 @@ static bool llama_v3_eval_internal( ggml_metal_get_tensor(lctx.ctx_metal, embeddings); } } else { - ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads); + llv3_graph_compute_helper(lctx.work_buffer, gf, n_threads); } #else - ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads); + llv3_graph_compute_helper(lctx.work_buffer, gf, n_threads); #endif #if GGML_USE_MPI @@ -1939,7 +1940,7 @@ static bool llama_v3_eval_internal( // tokenizer // -static size_t utf8_len(char src) { +static size_t utf8_len3(char src) { const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; uint8_t highbits = static_cast(src) >> 4; return lookup[highbits]; @@ -1980,7 +1981,7 @@ struct llama_v3_tokenizer { size_t offs = 0; while (offs < text.size()) { llama_v3_sp_symbol sym; - size_t char_len = std::min(text.size() - offs, utf8_len(text[offs])); + size_t char_len = std::min(text.size() - offs, utf8_len3(text[offs])); sym.text = text.c_str() + offs; sym.n = char_len; offs += char_len; @@ -2076,6 +2077,24 @@ private: llama_v3_sp_bigram::queue work_queue_; }; +std::vector llama_v3_tokenize( + struct llama_v3_context * ctx, + const std::string & text, + bool add_bos) { + // upper limit for the number of tokens + int n_tokens = text.length() + add_bos; + std::vector result(n_tokens); + n_tokens = llama_v3_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos); + if (n_tokens < 0) { + result.resize(-n_tokens); + int check = llama_v3_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos); + GGML_ASSERT(check == -n_tokens); + } else { + result.resize(n_tokens); + } + return result; +} + static std::vector llama_v3_tokenize(const llama_v3_vocab & vocab, const std::string & text, bool bos) { llama_v3_tokenizer tokenizer(vocab); std::vector output; @@ -3010,10 +3029,10 @@ static void llama_v3_convert_tensor_internal(const llama_v3_load_tensor & tensor if (ggml_is_quantized(tensor.type)) { qtype = ggml_internal_get_type_traits(tensor.type); if (qtype.to_float == NULL) { - throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type))); + throw std::runtime_error(format_old("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type))); } } else if (tensor.type != GGML_TYPE_F16) { - throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor.type))); + throw std::runtime_error(format_old("cannot dequantize/convert tensor type %s", ggml_type_name(tensor.type))); } if (nthread < 2) { @@ -3084,7 +3103,7 @@ static void llama_v3_model_quantize_internal(const std::string & fname_inp, cons case LLAMA_V3_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break; case LLAMA_V3_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break; #endif - default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); + default: throw std::runtime_error(format_old("invalid output file type %d\n", ftype)); } if (nthread <= 0) { @@ -3209,7 +3228,7 @@ static void llama_v3_model_quantize_internal(const std::string & fname_inp, cons if (tensor.type == GGML_TYPE_F32) { f32_data = (float *) tensor.data; } else if (ggml_is_quantized(tensor.type) && !params->allow_requantize) { - throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor.type))); + throw std::runtime_error(format_old("requantizing from type %s is disabled", ggml_type_name(tensor.type))); } else { llama_v3_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread); f32_data = (float *) f32_conv_buf.addr; @@ -3348,7 +3367,7 @@ struct llama_v3_context * llama_v3_new_context_with_model( params.seed = time(NULL); } - size_t blasbatchmul = get_blas_batch_mul(params.n_batch); + size_t blasbatchmul = get_blas_batch_mul3(params.n_batch); unsigned cur_percentage = 0; if (params.progress_callback == NULL) { @@ -3430,9 +3449,9 @@ struct llama_v3_context * llama_v3_new_context_with_model( // debug - for comparison with scratch buffer //size_t prev_req = - // MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type) + - // MEM_REQ_SCRATCH1().at(ctx->model.type) + - // MEM_REQ_EVAL().at(ctx->model.type); + // MEM_REQ_SCRATCH0_3(hparams.n_ctx).at(ctx->model.type) + + // MEM_REQ_SCRATCH1_3().at(ctx->model.type) + + // MEM_REQ_EVAL_3().at(ctx->model.type); //LLAMA_V3_LOG_INFO("%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0); // recreate allocator with exact memory requirements @@ -3447,12 +3466,12 @@ struct llama_v3_context * llama_v3_new_context_with_model( #endif } #else - ctx->buf_compute.resize(blasbatchmul*MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead()); + ctx->buf_compute.resize(blasbatchmul*MEM_REQ_EVAL_3().at(ctx->model.type) + ggml_graph_overhead()); #endif #ifdef LLAMA_V3_USE_SCRATCH - ctx->buf_scratch[0].resize(blasbatchmul*MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type)); - ctx->buf_scratch[1].resize(blasbatchmul*MEM_REQ_SCRATCH1().at(ctx->model.type)); + ctx->buf_scratch[0].resize(blasbatchmul*MEM_REQ_SCRATCH0_3(hparams.n_ctx).at(ctx->model.type)); + ctx->buf_scratch[1].resize(blasbatchmul*MEM_REQ_SCRATCH1_3().at(ctx->model.type)); #endif } @@ -3711,7 +3730,7 @@ int llama_v3_apply_lora_from_file_internal(const struct llama_v3_model & model, #ifdef GGML_USE_CUBLAS if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) { if (dest_t->type != GGML_TYPE_F16) { - throw std::runtime_error(format( + throw std::runtime_error(format_old( "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models", __func__)); } offload_func = ggml_cuda_assign_buffers; @@ -3791,7 +3810,7 @@ int llama_v3_apply_lora_from_file_internal(const struct llama_v3_model & model, struct ggml_cgraph gf = ggml_build_forward(r); - ggml_graph_compute_helper(work_buffer, &gf, n_threads); + llv3_graph_compute_helper(work_buffer, &gf, n_threads); // we won't need these tensors again, reset the context to save memory ggml_free(lora_ctx); @@ -3977,7 +3996,7 @@ void llama_v3_copy_state_data_internal(struct llama_v3_context * ctx, llama_v3_d ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d)); ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d)); - ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1); + llv3_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1); ggml_free(cpy_ctx); @@ -4087,7 +4106,7 @@ size_t llama_v3_set_state_data(struct llama_v3_context * ctx, uint8_t * src) { ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d)); ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d)); - ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1); + llv3_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1); ggml_free(cpy_ctx); } @@ -4419,8 +4438,8 @@ const std::vector>& llama_v3_intern void llama_v3_log_set(llama_v3_log_callback log_callback, void * user_data) { - g_state.log_callback = log_callback ? log_callback : llama_v3_log_callback_default; - g_state.log_callback_user_data = user_data; + llv3_g_state.log_callback = log_callback ? log_callback : llama_v3_log_callback_default; + llv3_g_state.log_callback_user_data = user_data; } #if defined(_MSC_VER) && !defined(vsnprintf) @@ -4433,12 +4452,12 @@ static void llama_v3_log_internal_v(llama_v3_log_level level, const char * forma char buffer[128]; int len = vsnprintf(buffer, 128, format, args); if (len < 128) { - g_state.log_callback(level, buffer, g_state.log_callback_user_data); + llv3_g_state.log_callback(level, buffer, llv3_g_state.log_callback_user_data); } else { char* buffer2 = new char[len+1]; vsnprintf(buffer2, len+1, format, args_copy); buffer2[len] = 0; - g_state.log_callback(level, buffer2, g_state.log_callback_user_data); + llv3_g_state.log_callback(level, buffer2, llv3_g_state.log_callback_user_data); delete[] buffer2; } va_end(args_copy);