gguf for llama is working

Concedo 2023-08-23 16:07:07 +08:00
parent 39cc83e8c9
commit 981c9131f0
3 changed files with 247 additions and 142 deletions
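Editor's note, not part of the commit: the patch keeps three generations of the llama loader side by side. A frozen copy of the pre-GGUF implementation is added as llama_v3.cpp (all its symbols renamed with a _3 / llv3_ suffix so it can be compiled next to the current llama.cpp), the current llama.cpp becomes GGUF-only behind a new llama_ctx_v4 context, and GGUF_LLAMA moves from enum value 1000 to 6. The sketch below is an illustrative stand-in for the dispatch shape, with simplified placeholder names rather than the real llama.cpp types:

// Illustrative sketch only, assuming simplified stand-in names.
#include <cstdio>

enum class FileFormat { GGJT_2, GGJT_3, GGUF_LLAMA };

// Each format generation is served by its own backend/context in the adapter.
static const char * backend_for(FileFormat ff) {
    if (ff == FileFormat::GGUF_LLAMA) return "llama.cpp (llama_ctx_v4)";
    if (ff == FileFormat::GGJT_3)     return "llama_v3.cpp (llama_ctx_v3)";
    return "llama_v2.cpp (llama_ctx_v2)";
}

int main() {
    std::printf("GGUF model  -> %s\n", backend_for(FileFormat::GGUF_LLAMA));
    std::printf("GGJT_3 model -> %s\n", backend_for(FileFormat::GGJT_3));
    return 0;
}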

[changed file 1 of 3]

@@ -14,6 +14,7 @@
 //for easier compilation
 //concat source files into one file for compilation purposes
 #include "llama_v2.cpp"
+#include "llama_v3.cpp"
 #include "llama.cpp"
 #include "utils.cpp"
 #include "gptj_v1.cpp"
@@ -59,10 +60,9 @@ static mpt_model mpt_ctx_v3;
 static rwkv_v2_context * rwkv_ctx_v2;
 static rwkv_context * rwkv_ctx_v3;
-static llama_v2_context_params llama_ctx_params_v2;
-static llama_context_params llama_ctx_params;
 static llama_v2_context * llama_ctx_v2;
-static llama_context * llama_ctx_v3;
+static llama_v3_context * llama_ctx_v3;
+static llama_context * llama_ctx_v4;
 static gpt_params params;
 static int n_past = 0;
@@ -324,9 +324,13 @@ static std::string FileFormatTokenizeID(int id, FileFormat file_format)
 {
 return std::string(llama_v2_token_to_str(llama_ctx_v2, id));
 }
-else if (file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA)
+else if (file_format == FileFormat::GGJT_3)
 {
-return std::string(llama_token_to_str(llama_ctx_v3, id));
+return std::string(llama_v3_token_to_str(llama_ctx_v3, id));
+}
+else if( file_format == FileFormat::GGUF_LLAMA)
+{
+return std::string(llama_token_to_str(llama_ctx_v4, id));
 }
 else
 {
@@ -423,8 +427,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 {
 //newer format has bit unshuffling
 SetQuantsUnshuffled(file_format == FileFormat::GGJT_2);
-llama_ctx_params_v2 = llama_v2_context_default_params();
+llama_v2_context_params llama_ctx_params_v2 = llama_v2_context_default_params();
 llama_ctx_params_v2.n_ctx = inputs.max_context_length;
 //llama_ctx_params.n_parts = -1;
 llama_ctx_params_v2.seed = -1;
@@ -470,9 +473,9 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 llama_v2_eval(llama_ctx_v2, tmp.data(), tmp.size(), 0, params.n_threads);
 return ModelLoadResult::SUCCESS;
 }
-else if(file_format == FileFormat::GGJT_3 || file_format==FileFormat::GGUF_LLAMA)
+else if(file_format == FileFormat::GGJT_3)
 {
-llama_ctx_params = llama_context_default_params();
+llama_v3_context_params llama_ctx_params = llama_v3_context_default_params();
 llama_ctx_params.n_ctx = inputs.max_context_length;
 //llama_ctx_paran_parts = -1;
 llama_ctx_params.seed = -1;
@@ -503,7 +506,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 }
 #endif
-llama_ctx_v3 = llama_init_from_file(modelname.c_str(), llama_ctx_params);
+llama_ctx_v3 = llama_v3_init_from_file(modelname.c_str(), llama_ctx_params);
 if (llama_ctx_v3 == NULL)
 {
@@ -520,7 +523,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 lora_base_arg = lora_base.c_str();
 }
-int err = llama_apply_lora_from_file(llama_ctx_v3,
+int err = llama_v3_apply_lora_from_file(llama_ctx_v3,
 lora_filename.c_str(),
 lora_base_arg,
 n_threads);
@@ -533,7 +536,77 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 //determine mem per token
 const std::vector<int> tmp = {1, 2, 3, 4};
-auto er = llama_eval(llama_ctx_v3, tmp.data(), tmp.size(), 0, params.n_threads);
+auto er = llama_v3_eval(llama_ctx_v3, tmp.data(), tmp.size(), 0, params.n_threads);
+if(er!=0)
+{
+printf("\nLLAMA EVAL returned nonzero!\n");
+}
+return ModelLoadResult::SUCCESS;
+}
+else if(file_format==FileFormat::GGUF_LLAMA)
+{
+llama_context_params llama_ctx_params = llama_context_default_params();
+llama_ctx_params.n_ctx = inputs.max_context_length;
+//llama_ctx_paran_parts = -1;
+llama_ctx_params.seed = -1;
+llama_ctx_params.f16_kv = inputs.f16_kv;
+llama_ctx_params.low_vram = inputs.low_vram;
+llama_ctx_params.mul_mat_q = inputs.use_mmq;
+llama_ctx_params.logits_all = false;
+llama_ctx_params.use_mmap = inputs.use_mmap;
+llama_ctx_params.use_mlock = inputs.use_mlock;
+llama_ctx_params.n_gpu_layers = inputs.gpulayers;
+llama_ctx_params.main_gpu = cu_parseinfo_maindevice;
+llama_ctx_params.rope_freq_base = rope_freq_base;
+llama_ctx_params.rope_freq_scale = rope_freq_scale;
+llama_ctx_params.n_batch = blasbatchsize;
+#if defined(GGML_USE_CUBLAS)
+bool ts_all_zero = true;
+for (int i = 0; i < tensor_split_max; ++i) {
+if (inputs.tensor_split[i] != 0.0f) {
+ts_all_zero = false;
+break;
+}
+}
+if(!ts_all_zero)
+{
+llama_ctx_params.tensor_split = inputs.tensor_split;
+printf("CUBLAS: Applying Custom Tensor Split!\n");
+}
+#endif
+llama_ctx_v4 = llama_init_from_file(modelname.c_str(), llama_ctx_params);
+if (llama_ctx_v4 == NULL)
+{
+fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, modelname.c_str());
+return ModelLoadResult::FAIL;
+}
+if (lora_filename != "")
+{
+printf("\nAttempting to apply LORA adapter: %s\n", lora_filename.c_str());
+const char * lora_base_arg = NULL;
+if (lora_base != "") {
+printf("Using LORA base model: %s\n", lora_base.c_str());
+lora_base_arg = lora_base.c_str();
+}
+int err = llama_apply_lora_from_file(llama_ctx_v4,
+lora_filename.c_str(),
+lora_base_arg,
+n_threads);
+if (err != 0)
+{
+fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+return ModelLoadResult::FAIL;
+}
+}
+//determine mem per token
+const std::vector<int> tmp = {1, 2, 3, 4};
+auto er = llama_eval(llama_ctx_v4, tmp.data(), tmp.size(), 0, params.n_threads);
 if(er!=0)
 {
 printf("\nLLAMA EVAL returned nonzero!\n");
@@ -949,7 +1022,6 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
 if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA)
 {
-params.prompt.insert(0, 1, ' ');
 if(file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 )
 {
 embd_inp = ::llama_v2_tokenize(llama_ctx_v2, params.prompt, true);
@@ -958,9 +1030,13 @@
 {
 embd_inp = ::legacy_llama_v2_tokenize(llama_ctx_v2, params.prompt, true);
 }
+else if (file_format == FileFormat::GGJT_3)
+{
+embd_inp = ::llama_v3_tokenize(llama_ctx_v3, params.prompt, true);
+}
 else
 {
-embd_inp = ::llama_tokenize(llama_ctx_v3, params.prompt, true);
+embd_inp = ::llama_tokenize(llama_ctx_v4, params.prompt, true);
 }
 }
 else
@@ -1067,9 +1143,13 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
 {
 n_vocab = llama_v2_n_vocab(llama_ctx_v2);
 }
-else if(file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA)
+else if(file_format == FileFormat::GGJT_3)
 {
-n_vocab = llama_n_vocab(llama_ctx_v3);
+n_vocab = llama_v3_n_vocab(llama_ctx_v3);
+}
+else if(file_format == FileFormat::GGUF_LLAMA)
+{
+n_vocab = llama_n_vocab(llama_ctx_v4);
 }
 else if (file_format == FileFormat::GPTJ_1 || file_format == FileFormat::GPTJ_2)
 {
@@ -1214,9 +1294,13 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
 {
 evalres = (llama_v2_eval(llama_ctx_v2, embd.data(), embdsize, n_past, params.n_threads)==0);
 }
-else if(file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA)
+else if(file_format == FileFormat::GGJT_3)
 {
-evalres = (llama_eval(llama_ctx_v3, embd.data(), embdsize, n_past, params.n_threads)==0);
+evalres = (llama_v3_eval(llama_ctx_v3, embd.data(), embdsize, n_past, params.n_threads)==0);
+}
+else if(file_format == FileFormat::GGUF_LLAMA)
+{
+evalres = (llama_eval(llama_ctx_v4, embd.data(), embdsize, n_past, params.n_threads)==0);
 }
 else if(file_format==FileFormat::RWKV_1 || file_format==FileFormat::RWKV_2)
 {
@@ -1324,28 +1408,33 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
 int btsize = banned_token_ids.size();
 if(file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA)
 {
-if(file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA)
+if(file_format == FileFormat::GGUF_LLAMA)
 {
-logitsPtr = llama_get_logits(llama_ctx_v3);
+logitsPtr = llama_get_logits(llama_ctx_v4);
+eosID = llama_token_eos(llama_ctx_v4);
+}
+else if(file_format == FileFormat::GGJT_3)
+{
+logitsPtr = llama_v3_get_logits(llama_ctx_v3);
+eosID = llama_v3_token_eos();
 }
 else
 {
 logitsPtr = llama_v2_get_logits(llama_ctx_v2);
+eosID = llama_v3_token_eos();
 }
-eosID = llama_token_eos(llama_ctx_v3);
 if (!unbanTokens)
 {
-// set the logit of the eos token (2) to zero to avoid sampling it
-logitsPtr[eosID] = 0;
+// set the logit of the eos token (2) to -INF to avoid sampling it
+logitsPtr[eosID] = -INFINITY;
 }
 if(btsize>0)
 {
 for(int t=0;t<btsize;++t)
 {
-logitsPtr[banned_token_ids[t]]=0;
+logitsPtr[banned_token_ids[t]]=-INFINITY;
 }
 }
 }
@@ -1369,8 +1458,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
 eosID = 50256;
 if(logits.size() > eosID)
 {
-int topid = std::min_element(logits.begin(),logits.end())-logits.begin();
-logits[eosID] = (logits[topid] < 0 ? logits[topid] : 0);
+logits[eosID] = -INFINITY;
 }
 else
 {
@@ -1378,8 +1466,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
 if (file_format == FileFormat::GPT2_3 || file_format == FileFormat::GPT2_4)
 {
 eosID = 0;
-int topid = std::min_element(logits.begin(), logits.end()) - logits.begin();
-logits[eosID] = (logits[topid] < 0 ? logits[topid] : 0);
+logits[eosID] = -INFINITY;
 }
 }
 }
@@ -1397,17 +1484,15 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
 file_format == FileFormat::MPT_1)
 {
 eosID = 0;
-int topid = std::min_element(logits.begin(),logits.end())-logits.begin();
-logits[eosID] = (logits[topid] < 0 ? logits[topid] : 0);
+logits[eosID] = -INFINITY;
 }
 }
 if(btsize>0)
 {
-int topid = std::min_element(logits.begin(), logits.end()) - logits.begin();
 for (int t = 0; t < btsize; ++t)
 {
-logits[banned_token_ids[t]] = (logits[topid] < 0 ? logits[topid] : 0);
+logits[banned_token_ids[t]] = -INFINITY;
 }
 }
 }
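Editor's note, not from the commit: the hunks above replace the old "clamp the logit to 0 or to the current minimum" trick with an -INFINITY mask for the EOS and banned tokens. A minimal sketch of why -INFINITY is the cleaner mask: after softmax it yields exactly zero probability, whereas a clamped finite value only makes the token unlikely. Names below are illustrative only.

// Sketch: masking a logit with -INFINITY removes the token from sampling entirely.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

static std::vector<float> softmax(std::vector<float> logits) {
    float maxv = -INFINITY;
    for (float v : logits) maxv = std::max(maxv, v);   // subtract max for numerical stability
    float sum = 0.0f;
    for (float &v : logits) { v = std::exp(v - maxv); sum += v; }
    for (float &v : logits) v /= sum;
    return logits;
}

int main() {
    std::vector<float> logits = {1.5f, -0.2f, 3.0f};
    logits[2] = -INFINITY;                              // ban token 2 (e.g. the EOS id)
    for (float p : softmax(logits)) std::printf("%.4f ", p);   // token 2 prints 0.0000
    std::printf("\n");
    return 0;
}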

[changed file 2 of 3]

@@ -21,6 +21,7 @@ enum FileFormat
 GGJT=3, // 3=(llama ggjt)
 GGJT_2=4, //newer llama format unshuffled
 GGJT_3=5, //using 16bit scalar
+GGUF_LLAMA=6, //GGUF (llama newest ver)
 GPTJ_1=100, //the very first super old GPTJ format
 GPTJ_2=101, //pygmalion, uses old ggml lib
@@ -47,7 +48,7 @@ enum FileFormat
 MPT_1=500, //first supported mpt version
-GGUF_LLAMA=1000, //GGUF (llama newest ver)
 };
 enum ModelLoadResult

[changed file 3 of 3]

@@ -74,18 +74,18 @@ static void llama_v3_log_callback_default(llama_v3_log_level level, const char *
 // available llama models
-enum e_model {
-MODEL_UNKNOWN,
-MODEL_3B,
-MODEL_7B,
-MODEL_13B,
-MODEL_30B,
-MODEL_65B,
-MODEL_70B,
+enum e_model3 {
+MODEL_UNKNOWN_3,
+MODEL_3B_3,
+MODEL_7B_3,
+MODEL_13B_3,
+MODEL_30B_3,
+MODEL_65B_3,
+MODEL_70B_3,
 };
-static const size_t kB = 1024;
-static const size_t MB = 1024*1024;
+static const size_t kB3 = 1024;
+static const size_t MB3 = 1024*1024;
 // computed for n_ctx == 2048
 // TODO: dynamically determine these sizes
@@ -101,7 +101,7 @@ void llama_v3_nop(struct ggml_tensor * tensor) { // don't offload by default
 // ggml helpers
 //
-static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+static void llv3_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
 struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
 if (plan.work_size > 0) {
@@ -112,76 +112,77 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 ggml_graph_compute(graph, &plan);
 }
 //
 // memory sizes (calculated for n_batch == 512)
 //
-static std::map<e_model, size_t> MEM_REQ_SCRATCH0(int n_ctx)
+static std::map<e_model3, size_t> MEM_REQ_SCRATCH0_3(int n_ctx)
 {
-std::map<e_model, size_t> k_sizes = {
-{ MODEL_3B, ((size_t) n_ctx / 16ull + 156ull) * MB },
-{ MODEL_7B, ((size_t) n_ctx / 16ull + 164ull) * MB },
-{ MODEL_13B, ((size_t) n_ctx / 12ull + 184ull) * MB },
-{ MODEL_30B, ((size_t) n_ctx / 9ull + 224ull) * MB },
-{ MODEL_65B, ((size_t) n_ctx / 6ull + 320ull) * MB }, // guess
-{ MODEL_70B, ((size_t) n_ctx / 7ull + 320ull) * MB },
+std::map<e_model3, size_t> k_sizes = {
+{ MODEL_3B_3, ((size_t) n_ctx / 16ull + 156ull) * MB3 },
+{ MODEL_7B_3, ((size_t) n_ctx / 16ull + 164ull) * MB3 },
+{ MODEL_13B_3, ((size_t) n_ctx / 12ull + 184ull) * MB3 },
+{ MODEL_30B_3, ((size_t) n_ctx / 9ull + 224ull) * MB3 },
+{ MODEL_65B_3, ((size_t) n_ctx / 6ull + 320ull) * MB3 }, // guess
+{ MODEL_70B_3, ((size_t) n_ctx / 7ull + 320ull) * MB3 },
 };
 return k_sizes;
 }
-static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
+static const std::map<e_model3, size_t> & MEM_REQ_SCRATCH1_3()
 {
-static std::map<e_model, size_t> k_sizes = {
-{ MODEL_3B, 192ull * MB },
-{ MODEL_7B, 224ull * MB },
-{ MODEL_13B, 256ull * MB },
-{ MODEL_30B, 320ull * MB },
-{ MODEL_65B, 448ull * MB }, // guess
-{ MODEL_70B, 448ull * MB },
+static std::map<e_model3, size_t> k_sizes = {
+{ MODEL_3B_3, 192ull * MB3 },
+{ MODEL_7B_3, 224ull * MB3 },
+{ MODEL_13B_3, 256ull * MB3 },
+{ MODEL_30B_3, 320ull * MB3 },
+{ MODEL_65B_3, 448ull * MB3 }, // guess
+{ MODEL_70B_3, 448ull * MB3 },
 };
 return k_sizes;
 }
 // used to store the compute graph tensors + non-scratch data
-static const std::map<e_model, size_t> & MEM_REQ_EVAL()
+static const std::map<e_model3, size_t> & MEM_REQ_EVAL_3()
 {
-static std::map<e_model, size_t> k_sizes = {
-{ MODEL_3B, 16ull * MB },
-{ MODEL_7B, 20ull * MB },
-{ MODEL_13B, 24ull * MB },
-{ MODEL_30B, 32ull * MB },
-{ MODEL_65B, 48ull * MB }, // guess
-{ MODEL_70B, 48ull * MB },
+static std::map<e_model3, size_t> k_sizes = {
+{ MODEL_3B_3, 16ull * MB3 },
+{ MODEL_7B_3, 20ull * MB3 },
+{ MODEL_13B_3, 24ull * MB3 },
+{ MODEL_30B_3, 32ull * MB3 },
+{ MODEL_65B_3, 48ull * MB3 }, // guess
+{ MODEL_70B_3, 48ull * MB3 },
 };
 return k_sizes;
 }
 // amount of VRAM needed per batch size to hold temporary results
 // the values for 3b are not derived from testing but instead chosen conservatively
-static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
+static const std::map<e_model3, size_t> & VRAM_REQ_SCRATCH_BASE_3()
 {
-static std::map<e_model, size_t> k_sizes = {
-{ MODEL_3B, 512ull * kB },
-{ MODEL_7B, 512ull * kB },
-{ MODEL_13B, 640ull * kB },
-{ MODEL_30B, 768ull * kB },
-{ MODEL_65B, 1360ull * kB },
-{ MODEL_70B, 1360ull * kB },
+static std::map<e_model3, size_t> k_sizes = {
+{ MODEL_3B_3, 512ull * kB3 },
+{ MODEL_7B_3, 512ull * kB3 },
+{ MODEL_13B_3, 640ull * kB3 },
+{ MODEL_30B_3, 768ull * kB3 },
+{ MODEL_65B_3, 1360ull * kB3 },
+{ MODEL_70B_3, 1360ull * kB3 },
 };
 return k_sizes;
 }
 // amount of VRAM needed per batch size and context to hold temporary results
 // the values for 3b are not derived from testing but instead chosen conservatively
-static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
+static const std::map<e_model3, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT_3()
 {
-static std::map<e_model, size_t> k_sizes = {
-{ MODEL_3B, 128ull },
-{ MODEL_7B, 128ull },
-{ MODEL_13B, 160ull },
-{ MODEL_30B, 208ull },
-{ MODEL_65B, 320ull },
-{ MODEL_70B, 320ull },
+static std::map<e_model3, size_t> k_sizes = {
+{ MODEL_3B_3, 128ull },
+{ MODEL_7B_3, 128ull },
+{ MODEL_13B_3, 160ull },
+{ MODEL_30B_3, 208ull },
+{ MODEL_65B_3, 320ull },
+{ MODEL_70B_3, 320ull },
 };
 return k_sizes;
 }
@@ -288,7 +289,7 @@ struct llama_v3_vocab {
 };
 struct llama_v3_model {
-e_model type = MODEL_UNKNOWN;
+e_model3 type = MODEL_UNKNOWN_3;
 llama_v3_hparams hparams;
@@ -452,13 +453,13 @@ struct llama_v3_state {
 void * log_callback_user_data = nullptr;
 };
 // global state
-static llama_v3_state g_state;
+static llama_v3_state llv3_g_state;
 template <typename T>
 static T checked_mul(T a, T b) {
 T ret = a * b;
 if (a != 0 && ret / a != b) {
-throw std::runtime_error(format("overflow multiplying %llu * %llu",
+throw std::runtime_error(format_old("overflow multiplying %llu * %llu",
 (unsigned long long) a, (unsigned long long) b));
 }
 return ret;
@@ -466,7 +467,7 @@ static T checked_mul(T a, T b) {
 static size_t checked_div(size_t a, size_t b) {
 if (b == 0 || a % b != 0) {
-throw std::runtime_error(format("error dividing %zu / %zu", a, b));
+throw std::runtime_error(format_old("error dividing %zu / %zu", a, b));
 }
 return a / b;
 }
@@ -550,7 +551,7 @@ struct llama_v3_file_loader {
 }
 }
-throw std::runtime_error(format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
+throw std::runtime_error(format_old("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
 magic, version));
 }
 void read_hparams() {
@@ -593,7 +594,7 @@
 file.read_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * n_dims);
 std::string name = file.read_string(name_len);
 if (n_dims < 1 || n_dims > 2) {
-throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims));
+throw std::runtime_error(format_old("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims));
 }
 switch (tensor.type) {
 case GGML_TYPE_F32:
@@ -610,7 +611,7 @@
 case GGML_TYPE_Q6_K:
 break;
 default: {
-throw std::runtime_error(format("unrecognized tensor type %u\n", tensor.type));
+throw std::runtime_error(format_old("unrecognized tensor type %u\n", tensor.type));
 }
 }
@@ -721,11 +722,11 @@
 struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
 auto it = tensors_map.name_to_idx.find(name);
 if (it == tensors_map.name_to_idx.end()) {
-throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str())));
+throw std::runtime_error(std::runtime_error(format_old("llama.cpp: tensor '%s' is missing from model", name.c_str())));
 }
 llama_v3_load_tensor & lt = tensors_map.tensors.at(it->second);
 if (lt.ne != ne) {
-throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
+throw std::runtime_error(format_old("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
 name.c_str(), llama_v3_format_tensor_shape(ne).c_str(), llama_v3_format_tensor_shape(lt.ne).c_str()));
 }
@@ -869,7 +870,7 @@ static bool kv_cache_init(
 const int64_t n_mem = n_layer*n_ctx;
 const int64_t n_elements = n_embd*n_mem;
-cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
+cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB3);
 cache.n = 0;
 struct ggml_init_params params;
@@ -952,7 +953,7 @@ bool llama_v3_mlock_supported() {
 return llama_v3_mlock::SUPPORTED;
 }
-int get_blas_batch_mul(int batch)
+int get_blas_batch_mul3(int batch)
 {
 return (batch>512?(batch>1024?4:2):1);
 }
@@ -1027,14 +1028,14 @@ const char * llama_v3_ftype_name(enum llama_v3_ftype ftype) {
 }
 }
-static const char * llama_v3_model_type_name(e_model type) {
+static const char * llama_v3_model_type_name(e_model3 type) {
 switch (type) {
-case MODEL_3B: return "3B";
-case MODEL_7B: return "7B";
-case MODEL_13B: return "13B";
-case MODEL_30B: return "30B";
-case MODEL_65B: return "65B";
-case MODEL_70B: return "70B";
+case MODEL_3B_3: return "3B";
+case MODEL_7B_3: return "7B";
+case MODEL_13B_3: return "13B";
+case MODEL_30B_3: return "30B";
+case MODEL_65B_3: return "65B";
+case MODEL_70B_3: return "70B";
 default: LLAMA_V3_ASSERT(false);
 }
 }
@@ -1062,7 +1063,7 @@ static void llama_v3_model_load_internal(
 void * progress_callback_user_data) {
 model.t_start_us = ggml_time_us();
-size_t blasbatchmul = get_blas_batch_mul(n_batch);
+size_t blasbatchmul = get_blas_batch_mul3(n_batch);
 std::unique_ptr<llama_v3_model_loader> ml(new llama_v3_model_loader(fname, use_mmap));
@@ -1078,15 +1079,15 @@
 {
 switch (hparams.n_layer) {
-case 26: model.type = e_model::MODEL_3B; break;
-case 32: model.type = e_model::MODEL_7B; break;
-case 40: model.type = e_model::MODEL_13B; break;
-case 60: model.type = e_model::MODEL_30B; break;
-case 80: model.type = e_model::MODEL_65B; break;
+case 26: model.type = e_model3::MODEL_3B_3; break;
+case 32: model.type = e_model3::MODEL_7B_3; break;
+case 40: model.type = e_model3::MODEL_13B_3; break;
+case 60: model.type = e_model3::MODEL_30B_3; break;
+case 80: model.type = e_model3::MODEL_65B_3; break;
 default:
 {
 if (hparams.n_layer < 32) {
-model.type = e_model::MODEL_7B;
+model.type = e_model3::MODEL_7B_3;
 }
 } break;
 }
@@ -1096,15 +1097,15 @@
 // LLaMAv2
 // TODO: temporary until GGUF
 //patch for llama2 gqa
-if (model.type == e_model::MODEL_65B && (hparams.n_mult >= 4096 && hparams.n_mult != 5504)) {
+if (model.type == e_model3::MODEL_65B_3 && (hparams.n_mult >= 4096 && hparams.n_mult != 5504)) {
 fprintf(stderr, "%s: Applying KCPP Patch for 70B model, setting GQA to 8\n", __func__);
 n_gqa = 8;
 }
 LLAMA_V3_ASSERT(hparams.n_head % n_gqa == 0);
 hparams.n_head_kv = hparams.n_head / n_gqa;
-if (model.type == e_model::MODEL_65B && n_gqa == 8) {
+if (model.type == e_model3::MODEL_65B_3 && n_gqa == 8) {
 LLAMA_V3_LOG_WARN("%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
-model.type = e_model::MODEL_70B;
+model.type = e_model3::MODEL_70B_3;
 hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model
 }
@@ -1180,7 +1181,7 @@
 model.ctx = ggml_init(params);
 if (!model.ctx) {
-throw std::runtime_error(format("ggml_init() failed"));
+throw std::runtime_error(format_old("ggml_init() failed"));
 }
 }
@@ -1289,9 +1290,9 @@
 #ifndef LLAMA_V3_USE_ALLOCATOR
 mem_required +=
-blasbatchmul*MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
-blasbatchmul*MEM_REQ_SCRATCH1().at(model.type) +
-blasbatchmul*MEM_REQ_EVAL().at(model.type);
+blasbatchmul*MEM_REQ_SCRATCH0_3(hparams.n_ctx).at(model.type) +
+blasbatchmul*MEM_REQ_SCRATCH1_3().at(model.type) +
+blasbatchmul*MEM_REQ_EVAL_3().at(model.type);
 #endif
 // this is the memory required by one llama_v3_state
@@ -1308,8 +1309,8 @@
 LLAMA_V3_LOG_INFO("%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
 ggml_cuda_set_scratch_size(0); // disable scratch
 } else {
-const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE().at(model.type);
-const size_t vram_scratch_per_context = VRAM_REQ_SCRATCH_PER_CONTEXT().at(model.type);
+const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE_3().at(model.type);
+const size_t vram_scratch_per_context = VRAM_REQ_SCRATCH_PER_CONTEXT_3().at(model.type);
 vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context);
 ggml_cuda_set_scratch_size(vram_scratch);
 if (n_gpu_layers > 0) {
@@ -1872,10 +1873,10 @@ static bool llama_v3_eval_internal(
 ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
 }
 } else {
-ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
+llv3_graph_compute_helper(lctx.work_buffer, gf, n_threads);
 }
 #else
-ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
+llv3_graph_compute_helper(lctx.work_buffer, gf, n_threads);
 #endif
 #if GGML_USE_MPI
@@ -1939,7 +1940,7 @@ static bool llama_v3_eval_internal(
 // tokenizer
 //
-static size_t utf8_len(char src) {
+static size_t utf8_len3(char src) {
 const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
 uint8_t highbits = static_cast<uint8_t>(src) >> 4;
 return lookup[highbits];
@@ -1980,7 +1981,7 @@ struct llama_v3_tokenizer {
 size_t offs = 0;
 while (offs < text.size()) {
 llama_v3_sp_symbol sym;
-size_t char_len = std::min(text.size() - offs, utf8_len(text[offs]));
+size_t char_len = std::min(text.size() - offs, utf8_len3(text[offs]));
 sym.text = text.c_str() + offs;
 sym.n = char_len;
 offs += char_len;
@@ -2076,6 +2077,24 @@ private:
 llama_v3_sp_bigram::queue work_queue_;
 };
+std::vector<llama_token> llama_v3_tokenize(
+struct llama_v3_context * ctx,
+const std::string & text,
+bool add_bos) {
+// upper limit for the number of tokens
+int n_tokens = text.length() + add_bos;
+std::vector<llama_token> result(n_tokens);
+n_tokens = llama_v3_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
+if (n_tokens < 0) {
+result.resize(-n_tokens);
+int check = llama_v3_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
+GGML_ASSERT(check == -n_tokens);
+} else {
+result.resize(n_tokens);
+}
+return result;
+}
 static std::vector<llama_v3_vocab::id> llama_v3_tokenize(const llama_v3_vocab & vocab, const std::string & text, bool bos) {
 llama_v3_tokenizer tokenizer(vocab);
 std::vector<llama_v3_vocab::id> output;
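Editor's note, not from the commit: the llama_v3_tokenize wrapper added in the hunk above uses the common "probe, then resize" idiom for C-style APIs: call once with a guessed buffer, and if the call reports a negative value (the required count), resize and call again. A self-contained sketch of the same idiom; tokenize_into() is a made-up stand-in, not the real C API.

// Sketch of the probe-then-resize tokenization pattern (illustrative names only).
#include <cassert>
#include <cstdio>
#include <string>
#include <vector>

// Fake C-style tokenizer: writes up to n_max ids and returns the count,
// or -needed when the buffer is too small.
static int tokenize_into(const std::string &text, int *out, int n_max) {
    int needed = (int) text.size() / 4 + 1;   // pretend "token" count
    if (needed > n_max) return -needed;
    for (int i = 0; i < needed; ++i) out[i] = i;
    return needed;
}

static std::vector<int> tokenize(const std::string &text) {
    std::vector<int> result(8);               // optimistic first guess
    int n = tokenize_into(text, result.data(), (int) result.size());
    if (n < 0) {
        result.resize(-n);                    // grow to the reported size
        int check = tokenize_into(text, result.data(), (int) result.size());
        assert(check == -n);                  // second pass must succeed exactly
        n = check;
    }
    result.resize(n);
    return result;
}

int main() {
    std::printf("%zu tokens\n", tokenize(std::string(100, 'x')).size());
    return 0;
}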
@@ -3010,10 +3029,10 @@ static void llama_v3_convert_tensor_internal(const llama_v3_load_tensor & tensor
 if (ggml_is_quantized(tensor.type)) {
 qtype = ggml_internal_get_type_traits(tensor.type);
 if (qtype.to_float == NULL) {
-throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
+throw std::runtime_error(format_old("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
 }
 } else if (tensor.type != GGML_TYPE_F16) {
-throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor.type)));
+throw std::runtime_error(format_old("cannot dequantize/convert tensor type %s", ggml_type_name(tensor.type)));
 }
 if (nthread < 2) {
@@ -3084,7 +3103,7 @@ static void llama_v3_model_quantize_internal(const std::string & fname_inp, cons
 case LLAMA_V3_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
 case LLAMA_V3_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
 #endif
-default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
+default: throw std::runtime_error(format_old("invalid output file type %d\n", ftype));
 }
 if (nthread <= 0) {
@@ -3209,7 +3228,7 @@ static void llama_v3_model_quantize_internal(const std::string & fname_inp, cons
 if (tensor.type == GGML_TYPE_F32) {
 f32_data = (float *) tensor.data;
 } else if (ggml_is_quantized(tensor.type) && !params->allow_requantize) {
-throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor.type)));
+throw std::runtime_error(format_old("requantizing from type %s is disabled", ggml_type_name(tensor.type)));
 } else {
 llama_v3_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
 f32_data = (float *) f32_conv_buf.addr;
@@ -3348,7 +3367,7 @@ struct llama_v3_context * llama_v3_new_context_with_model(
 params.seed = time(NULL);
 }
-size_t blasbatchmul = get_blas_batch_mul(params.n_batch);
+size_t blasbatchmul = get_blas_batch_mul3(params.n_batch);
 unsigned cur_percentage = 0;
 if (params.progress_callback == NULL) {
@@ -3430,9 +3449,9 @@
 // debug - for comparison with scratch buffer
 //size_t prev_req =
-// MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type) +
-// MEM_REQ_SCRATCH1().at(ctx->model.type) +
-// MEM_REQ_EVAL().at(ctx->model.type);
+// MEM_REQ_SCRATCH0_3(hparams.n_ctx).at(ctx->model.type) +
+// MEM_REQ_SCRATCH1_3().at(ctx->model.type) +
+// MEM_REQ_EVAL_3().at(ctx->model.type);
 //LLAMA_V3_LOG_INFO("%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0);
 // recreate allocator with exact memory requirements
@@ -3447,12 +3466,12 @@
 #endif
 }
 #else
-ctx->buf_compute.resize(blasbatchmul*MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
+ctx->buf_compute.resize(blasbatchmul*MEM_REQ_EVAL_3().at(ctx->model.type) + ggml_graph_overhead());
 #endif
 #ifdef LLAMA_V3_USE_SCRATCH
-ctx->buf_scratch[0].resize(blasbatchmul*MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
-ctx->buf_scratch[1].resize(blasbatchmul*MEM_REQ_SCRATCH1().at(ctx->model.type));
+ctx->buf_scratch[0].resize(blasbatchmul*MEM_REQ_SCRATCH0_3(hparams.n_ctx).at(ctx->model.type));
+ctx->buf_scratch[1].resize(blasbatchmul*MEM_REQ_SCRATCH1_3().at(ctx->model.type));
 #endif
 }
@@ -3711,7 +3730,7 @@ int llama_v3_apply_lora_from_file_internal(const struct llama_v3_model & model,
 #ifdef GGML_USE_CUBLAS
 if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) {
 if (dest_t->type != GGML_TYPE_F16) {
-throw std::runtime_error(format(
+throw std::runtime_error(format_old(
 "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models", __func__));
 }
 offload_func = ggml_cuda_assign_buffers;
@@ -3791,7 +3810,7 @@
 struct ggml_cgraph gf = ggml_build_forward(r);
-ggml_graph_compute_helper(work_buffer, &gf, n_threads);
+llv3_graph_compute_helper(work_buffer, &gf, n_threads);
 // we won't need these tensors again, reset the context to save memory
 ggml_free(lora_ctx);
@@ -3977,7 +3996,7 @@ void llama_v3_copy_state_data_internal(struct llama_v3_context * ctx, llama_v3_d
 ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
 ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
-ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
+llv3_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
 ggml_free(cpy_ctx);
@@ -4087,7 +4106,7 @@ size_t llama_v3_set_state_data(struct llama_v3_context * ctx, uint8_t * src) {
 ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
 ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
-ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
+llv3_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
 ggml_free(cpy_ctx);
 }
@@ -4419,8 +4438,8 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_v3_intern
 void llama_v3_log_set(llama_v3_log_callback log_callback, void * user_data) {
-g_state.log_callback = log_callback ? log_callback : llama_v3_log_callback_default;
-g_state.log_callback_user_data = user_data;
+llv3_g_state.log_callback = log_callback ? log_callback : llama_v3_log_callback_default;
+llv3_g_state.log_callback_user_data = user_data;
 }
 #if defined(_MSC_VER) && !defined(vsnprintf)
@@ -4433,12 +4452,12 @@ static void llama_v3_log_internal_v(llama_v3_log_level level, const char * forma
 char buffer[128];
 int len = vsnprintf(buffer, 128, format, args);
 if (len < 128) {
-g_state.log_callback(level, buffer, g_state.log_callback_user_data);
+llv3_g_state.log_callback(level, buffer, llv3_g_state.log_callback_user_data);
 } else {
 char* buffer2 = new char[len+1];
 vsnprintf(buffer2, len+1, format, args_copy);
 buffer2[len] = 0;
-g_state.log_callback(level, buffer2, g_state.log_callback_user_data);
+llv3_g_state.log_callback(level, buffer2, llv3_g_state.log_callback_user_data);
 delete[] buffer2;
 }
 va_end(args_copy);