gguf for llama is working

commit 981c9131f0 (parent 39cc83e8c9)
3 changed files with 247 additions and 142 deletions
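For orientation, here is a minimal sketch (not part of the diff itself) of the per-format dispatch this commit introduces: the legacy GGJT_3 path keeps using the renamed llama_v3_* API through llama_ctx_v3, while the new GGUF_LLAMA format is served by a separate llama_ctx_v4 created with the current llama.cpp API. The helper name and simplified shape below are assumptions for illustration only; the real routing lives inside gpttype_load_model and gpttype_generate in the hunks that follow.

// Hypothetical summary of the routing pattern used throughout this commit.
// It assumes the globals (llama_ctx_v3, llama_ctx_v4) and the FileFormat enum
// from the diff below; it is not literal code from the repository.
static std::string token_to_text(int id, FileFormat file_format)
{
    if (file_format == FileFormat::GGJT_3)
    {
        // pre-GGUF ggjt v3 models: renamed legacy API, old context
        return std::string(llama_v3_token_to_str(llama_ctx_v3, id));
    }
    else if (file_format == FileFormat::GGUF_LLAMA)
    {
        // GGUF models: new context backed by the current llama.cpp
        return std::string(llama_token_to_str(llama_ctx_v4, id));
    }
    return std::string(); // other formats are handled by their own branches
}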
@@ -14,6 +14,7 @@
 //for easier compilation
 //concat source files into one file for compilation purposes
 #include "llama_v2.cpp"
+#include "llama_v3.cpp"
 #include "llama.cpp"
 #include "utils.cpp"
 #include "gptj_v1.cpp"
@@ -59,10 +60,9 @@ static mpt_model mpt_ctx_v3;

 static rwkv_v2_context * rwkv_ctx_v2;
 static rwkv_context * rwkv_ctx_v3;
-static llama_v2_context_params llama_ctx_params_v2;
-static llama_context_params llama_ctx_params;
 static llama_v2_context * llama_ctx_v2;
-static llama_context * llama_ctx_v3;
+static llama_v3_context * llama_ctx_v3;
+static llama_context * llama_ctx_v4;

 static gpt_params params;
 static int n_past = 0;
@@ -324,9 +324,13 @@ static std::string FileFormatTokenizeID(int id, FileFormat file_format)
 {
 return std::string(llama_v2_token_to_str(llama_ctx_v2, id));
 }
-else if (file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA)
+else if (file_format == FileFormat::GGJT_3)
 {
-return std::string(llama_token_to_str(llama_ctx_v3, id));
+return std::string(llama_v3_token_to_str(llama_ctx_v3, id));
+}
+else if( file_format == FileFormat::GGUF_LLAMA)
+{
+return std::string(llama_token_to_str(llama_ctx_v4, id));
 }
 else
 {
@@ -423,8 +427,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 {
 //newer format has bit unshuffling
 SetQuantsUnshuffled(file_format == FileFormat::GGJT_2);
-llama_ctx_params_v2 = llama_v2_context_default_params();
+llama_v2_context_params llama_ctx_params_v2 = llama_v2_context_default_params();
 llama_ctx_params_v2.n_ctx = inputs.max_context_length;
 //llama_ctx_params.n_parts = -1;
 llama_ctx_params_v2.seed = -1;
@@ -470,9 +473,9 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 llama_v2_eval(llama_ctx_v2, tmp.data(), tmp.size(), 0, params.n_threads);
 return ModelLoadResult::SUCCESS;
 }
-else if(file_format == FileFormat::GGJT_3 || file_format==FileFormat::GGUF_LLAMA)
+else if(file_format == FileFormat::GGJT_3)
 {
-llama_ctx_params = llama_context_default_params();
+llama_v3_context_params llama_ctx_params = llama_v3_context_default_params();
 llama_ctx_params.n_ctx = inputs.max_context_length;
 //llama_ctx_paran_parts = -1;
 llama_ctx_params.seed = -1;
@@ -503,7 +506,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 }
 #endif

-llama_ctx_v3 = llama_init_from_file(modelname.c_str(), llama_ctx_params);
+llama_ctx_v3 = llama_v3_init_from_file(modelname.c_str(), llama_ctx_params);

 if (llama_ctx_v3 == NULL)
 {
@@ -520,7 +523,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 lora_base_arg = lora_base.c_str();
 }

-int err = llama_apply_lora_from_file(llama_ctx_v3,
+int err = llama_v3_apply_lora_from_file(llama_ctx_v3,
 lora_filename.c_str(),
 lora_base_arg,
 n_threads);
@@ -533,7 +536,77 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in

 //determine mem per token
 const std::vector<int> tmp = {1, 2, 3, 4};
-auto er = llama_eval(llama_ctx_v3, tmp.data(), tmp.size(), 0, params.n_threads);
+auto er = llama_v3_eval(llama_ctx_v3, tmp.data(), tmp.size(), 0, params.n_threads);
+if(er!=0)
+{
+printf("\nLLAMA EVAL returned nonzero!\n");
+}
+return ModelLoadResult::SUCCESS;
+}
+else if(file_format==FileFormat::GGUF_LLAMA)
+{
+llama_context_params llama_ctx_params = llama_context_default_params();
+llama_ctx_params.n_ctx = inputs.max_context_length;
+//llama_ctx_paran_parts = -1;
+llama_ctx_params.seed = -1;
+llama_ctx_params.f16_kv = inputs.f16_kv;
+llama_ctx_params.low_vram = inputs.low_vram;
+llama_ctx_params.mul_mat_q = inputs.use_mmq;
+llama_ctx_params.logits_all = false;
+llama_ctx_params.use_mmap = inputs.use_mmap;
+llama_ctx_params.use_mlock = inputs.use_mlock;
+llama_ctx_params.n_gpu_layers = inputs.gpulayers;
+llama_ctx_params.main_gpu = cu_parseinfo_maindevice;
+llama_ctx_params.rope_freq_base = rope_freq_base;
+llama_ctx_params.rope_freq_scale = rope_freq_scale;
+llama_ctx_params.n_batch = blasbatchsize;
+
+#if defined(GGML_USE_CUBLAS)
+bool ts_all_zero = true;
+for (int i = 0; i < tensor_split_max; ++i) {
+if (inputs.tensor_split[i] != 0.0f) {
+ts_all_zero = false;
+break;
+}
+}
+if(!ts_all_zero)
+{
+llama_ctx_params.tensor_split = inputs.tensor_split;
+printf("CUBLAS: Applying Custom Tensor Split!\n");
+}
+#endif
+
+llama_ctx_v4 = llama_init_from_file(modelname.c_str(), llama_ctx_params);
+
+if (llama_ctx_v4 == NULL)
+{
+fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, modelname.c_str());
+return ModelLoadResult::FAIL;
+}
+if (lora_filename != "")
+{
+printf("\nAttempting to apply LORA adapter: %s\n", lora_filename.c_str());
+
+const char * lora_base_arg = NULL;
+if (lora_base != "") {
+printf("Using LORA base model: %s\n", lora_base.c_str());
+lora_base_arg = lora_base.c_str();
+}
+
+int err = llama_apply_lora_from_file(llama_ctx_v4,
+lora_filename.c_str(),
+lora_base_arg,
+n_threads);
+if (err != 0)
+{
+fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+return ModelLoadResult::FAIL;
+}
+}
+
+//determine mem per token
+const std::vector<int> tmp = {1, 2, 3, 4};
+auto er = llama_eval(llama_ctx_v4, tmp.data(), tmp.size(), 0, params.n_threads);
 if(er!=0)
 {
 printf("\nLLAMA EVAL returned nonzero!\n");
@@ -949,7 +1022,6 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o

 if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA)
 {
-params.prompt.insert(0, 1, ' ');
 if(file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 )
 {
 embd_inp = ::llama_v2_tokenize(llama_ctx_v2, params.prompt, true);
@@ -958,9 +1030,13 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
 {
 embd_inp = ::legacy_llama_v2_tokenize(llama_ctx_v2, params.prompt, true);
 }
+else if (file_format == FileFormat::GGJT_3)
+{
+embd_inp = ::llama_v3_tokenize(llama_ctx_v3, params.prompt, true);
+}
 else
 {
-embd_inp = ::llama_tokenize(llama_ctx_v3, params.prompt, true);
+embd_inp = ::llama_tokenize(llama_ctx_v4, params.prompt, true);
 }
 }
 else
@@ -1067,9 +1143,13 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
 {
 n_vocab = llama_v2_n_vocab(llama_ctx_v2);
 }
-else if(file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA)
+else if(file_format == FileFormat::GGJT_3)
 {
-n_vocab = llama_n_vocab(llama_ctx_v3);
+n_vocab = llama_v3_n_vocab(llama_ctx_v3);
+}
+else if(file_format == FileFormat::GGUF_LLAMA)
+{
+n_vocab = llama_n_vocab(llama_ctx_v4);
 }
 else if (file_format == FileFormat::GPTJ_1 || file_format == FileFormat::GPTJ_2)
 {
@@ -1214,9 +1294,13 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
 {
 evalres = (llama_v2_eval(llama_ctx_v2, embd.data(), embdsize, n_past, params.n_threads)==0);
 }
-else if(file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA)
+else if(file_format == FileFormat::GGJT_3)
 {
-evalres = (llama_eval(llama_ctx_v3, embd.data(), embdsize, n_past, params.n_threads)==0);
+evalres = (llama_v3_eval(llama_ctx_v3, embd.data(), embdsize, n_past, params.n_threads)==0);
+}
+else if(file_format == FileFormat::GGUF_LLAMA)
+{
+evalres = (llama_eval(llama_ctx_v4, embd.data(), embdsize, n_past, params.n_threads)==0);
 }
 else if(file_format==FileFormat::RWKV_1 || file_format==FileFormat::RWKV_2)
 {
@@ -1324,28 +1408,33 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
 int btsize = banned_token_ids.size();
 if(file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA)
 {
-if(file_format == FileFormat::GGJT_3 || file_format == FileFormat::GGUF_LLAMA)
+if(file_format == FileFormat::GGUF_LLAMA)
 {
-logitsPtr = llama_get_logits(llama_ctx_v3);
+logitsPtr = llama_get_logits(llama_ctx_v4);
+eosID = llama_token_eos(llama_ctx_v4);
+}
+else if(file_format == FileFormat::GGJT_3)
+{
+logitsPtr = llama_v3_get_logits(llama_ctx_v3);
+eosID = llama_v3_token_eos();
 }
 else
 {
 logitsPtr = llama_v2_get_logits(llama_ctx_v2);
+eosID = llama_v3_token_eos();
 }

-eosID = llama_token_eos(llama_ctx_v3);

 if (!unbanTokens)
 {
-// set the logit of the eos token (2) to zero to avoid sampling it
-logitsPtr[eosID] = 0;
+// set the logit of the eos token (2) to -INF to avoid sampling it
+logitsPtr[eosID] = -INFINITY;
 }

 if(btsize>0)
 {
 for(int t=0;t<btsize;++t)
 {
-logitsPtr[banned_token_ids[t]]=0;
+logitsPtr[banned_token_ids[t]]=-INFINITY;
 }
 }
 }
@@ -1369,8 +1458,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
 eosID = 50256;
 if(logits.size() > eosID)
 {
-int topid = std::min_element(logits.begin(),logits.end())-logits.begin();
-logits[eosID] = (logits[topid] < 0 ? logits[topid] : 0);
+logits[eosID] = -INFINITY;
 }
 else
 {
@@ -1378,8 +1466,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
 if (file_format == FileFormat::GPT2_3 || file_format == FileFormat::GPT2_4)
 {
 eosID = 0;
-int topid = std::min_element(logits.begin(), logits.end()) - logits.begin();
-logits[eosID] = (logits[topid] < 0 ? logits[topid] : 0);
+logits[eosID] = -INFINITY;
 }
 }
 }
@@ -1397,17 +1484,15 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
 file_format == FileFormat::MPT_1)
 {
 eosID = 0;
-int topid = std::min_element(logits.begin(),logits.end())-logits.begin();
-logits[eosID] = (logits[topid] < 0 ? logits[topid] : 0);
+logits[eosID] = -INFINITY;
 }
 }

 if(btsize>0)
 {
-int topid = std::min_element(logits.begin(), logits.end()) - logits.begin();
 for (int t = 0; t < btsize; ++t)
 {
-logits[banned_token_ids[t]] = (logits[topid] < 0 ? logits[topid] : 0);
+logits[banned_token_ids[t]] = -INFINITY;
 }
 }
 }
@@ -21,6 +21,7 @@ enum FileFormat
 GGJT=3, // 3=(llama ggjt)
 GGJT_2=4, //newer llama format unshuffled
 GGJT_3=5, //using 16bit scalar
+GGUF_LLAMA=6, //GGUF (llama newest ver)

 GPTJ_1=100, //the very first super old GPTJ format
 GPTJ_2=101, //pygmalion, uses old ggml lib
@@ -47,7 +48,7 @@ enum FileFormat
 MPT_1=500, //first supported mpt version


-GGUF_LLAMA=1000, //GGUF (llama newest ver)
 };

 enum ModelLoadResult
@@ -74,18 +74,18 @@ static void llama_v3_log_callback_default(llama_v3_log_level level, const char *


 // available llama models
-enum e_model {
-MODEL_UNKNOWN,
-MODEL_3B,
-MODEL_7B,
-MODEL_13B,
-MODEL_30B,
-MODEL_65B,
-MODEL_70B,
+enum e_model3 {
+MODEL_UNKNOWN_3,
+MODEL_3B_3,
+MODEL_7B_3,
+MODEL_13B_3,
+MODEL_30B_3,
+MODEL_65B_3,
+MODEL_70B_3,
 };

-static const size_t kB = 1024;
-static const size_t MB = 1024*1024;
+static const size_t kB3 = 1024;
+static const size_t MB3 = 1024*1024;

 // computed for n_ctx == 2048
 // TODO: dynamically determine these sizes
@@ -101,7 +101,7 @@ void llama_v3_nop(struct ggml_tensor * tensor) { // don't offload by default
 // ggml helpers
 //

-static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+static void llv3_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
 struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);

 if (plan.work_size > 0) {
@@ -112,76 +112,77 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 ggml_graph_compute(graph, &plan);
 }


 //
 // memory sizes (calculated for n_batch == 512)
 //

-static std::map<e_model, size_t> MEM_REQ_SCRATCH0(int n_ctx)
+static std::map<e_model3, size_t> MEM_REQ_SCRATCH0_3(int n_ctx)
 {
-std::map<e_model, size_t> k_sizes = {
-{ MODEL_3B, ((size_t) n_ctx / 16ull + 156ull) * MB },
-{ MODEL_7B, ((size_t) n_ctx / 16ull + 164ull) * MB },
-{ MODEL_13B, ((size_t) n_ctx / 12ull + 184ull) * MB },
-{ MODEL_30B, ((size_t) n_ctx / 9ull + 224ull) * MB },
-{ MODEL_65B, ((size_t) n_ctx / 6ull + 320ull) * MB }, // guess
-{ MODEL_70B, ((size_t) n_ctx / 7ull + 320ull) * MB },
+std::map<e_model3, size_t> k_sizes = {
+{ MODEL_3B_3, ((size_t) n_ctx / 16ull + 156ull) * MB3 },
+{ MODEL_7B_3, ((size_t) n_ctx / 16ull + 164ull) * MB3 },
+{ MODEL_13B_3, ((size_t) n_ctx / 12ull + 184ull) * MB3 },
+{ MODEL_30B_3, ((size_t) n_ctx / 9ull + 224ull) * MB3 },
+{ MODEL_65B_3, ((size_t) n_ctx / 6ull + 320ull) * MB3 }, // guess
+{ MODEL_70B_3, ((size_t) n_ctx / 7ull + 320ull) * MB3 },
 };
 return k_sizes;
 }

-static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
+static const std::map<e_model3, size_t> & MEM_REQ_SCRATCH1_3()
 {
-static std::map<e_model, size_t> k_sizes = {
-{ MODEL_3B, 192ull * MB },
-{ MODEL_7B, 224ull * MB },
-{ MODEL_13B, 256ull * MB },
-{ MODEL_30B, 320ull * MB },
-{ MODEL_65B, 448ull * MB }, // guess
-{ MODEL_70B, 448ull * MB },
+static std::map<e_model3, size_t> k_sizes = {
+{ MODEL_3B_3, 192ull * MB3 },
+{ MODEL_7B_3, 224ull * MB3 },
+{ MODEL_13B_3, 256ull * MB3 },
+{ MODEL_30B_3, 320ull * MB3 },
+{ MODEL_65B_3, 448ull * MB3 }, // guess
+{ MODEL_70B_3, 448ull * MB3 },
 };
 return k_sizes;
 }

 // used to store the compute graph tensors + non-scratch data
-static const std::map<e_model, size_t> & MEM_REQ_EVAL()
+static const std::map<e_model3, size_t> & MEM_REQ_EVAL_3()
 {
-static std::map<e_model, size_t> k_sizes = {
-{ MODEL_3B, 16ull * MB },
-{ MODEL_7B, 20ull * MB },
-{ MODEL_13B, 24ull * MB },
-{ MODEL_30B, 32ull * MB },
-{ MODEL_65B, 48ull * MB }, // guess
-{ MODEL_70B, 48ull * MB },
+static std::map<e_model3, size_t> k_sizes = {
+{ MODEL_3B_3, 16ull * MB3 },
+{ MODEL_7B_3, 20ull * MB3 },
+{ MODEL_13B_3, 24ull * MB3 },
+{ MODEL_30B_3, 32ull * MB3 },
+{ MODEL_65B_3, 48ull * MB3 }, // guess
+{ MODEL_70B_3, 48ull * MB3 },
 };
 return k_sizes;
 }

 // amount of VRAM needed per batch size to hold temporary results
 // the values for 3b are not derived from testing but instead chosen conservatively
-static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
+static const std::map<e_model3, size_t> & VRAM_REQ_SCRATCH_BASE_3()
 {
-static std::map<e_model, size_t> k_sizes = {
-{ MODEL_3B, 512ull * kB },
-{ MODEL_7B, 512ull * kB },
-{ MODEL_13B, 640ull * kB },
-{ MODEL_30B, 768ull * kB },
-{ MODEL_65B, 1360ull * kB },
-{ MODEL_70B, 1360ull * kB },
+static std::map<e_model3, size_t> k_sizes = {
+{ MODEL_3B_3, 512ull * kB3 },
+{ MODEL_7B_3, 512ull * kB3 },
+{ MODEL_13B_3, 640ull * kB3 },
+{ MODEL_30B_3, 768ull * kB3 },
+{ MODEL_65B_3, 1360ull * kB3 },
+{ MODEL_70B_3, 1360ull * kB3 },
 };
 return k_sizes;
 }

 // amount of VRAM needed per batch size and context to hold temporary results
 // the values for 3b are not derived from testing but instead chosen conservatively
-static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
+static const std::map<e_model3, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT_3()
 {
-static std::map<e_model, size_t> k_sizes = {
-{ MODEL_3B, 128ull },
-{ MODEL_7B, 128ull },
-{ MODEL_13B, 160ull },
-{ MODEL_30B, 208ull },
-{ MODEL_65B, 320ull },
-{ MODEL_70B, 320ull },
+static std::map<e_model3, size_t> k_sizes = {
+{ MODEL_3B_3, 128ull },
+{ MODEL_7B_3, 128ull },
+{ MODEL_13B_3, 160ull },
+{ MODEL_30B_3, 208ull },
+{ MODEL_65B_3, 320ull },
+{ MODEL_70B_3, 320ull },
 };
 return k_sizes;
 }
@@ -288,7 +289,7 @@ struct llama_v3_vocab {
 };

 struct llama_v3_model {
-e_model type = MODEL_UNKNOWN;
+e_model3 type = MODEL_UNKNOWN_3;

 llama_v3_hparams hparams;

@@ -452,13 +453,13 @@ struct llama_v3_state {
 void * log_callback_user_data = nullptr;
 };
 // global state
-static llama_v3_state g_state;
+static llama_v3_state llv3_g_state;

 template <typename T>
 static T checked_mul(T a, T b) {
 T ret = a * b;
 if (a != 0 && ret / a != b) {
-throw std::runtime_error(format("overflow multiplying %llu * %llu",
+throw std::runtime_error(format_old("overflow multiplying %llu * %llu",
 (unsigned long long) a, (unsigned long long) b));
 }
 return ret;
@@ -466,7 +467,7 @@ static T checked_mul(T a, T b) {

 static size_t checked_div(size_t a, size_t b) {
 if (b == 0 || a % b != 0) {
-throw std::runtime_error(format("error dividing %zu / %zu", a, b));
+throw std::runtime_error(format_old("error dividing %zu / %zu", a, b));
 }
 return a / b;
 }
@@ -550,7 +551,7 @@ struct llama_v3_file_loader {
 }
 }

-throw std::runtime_error(format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
+throw std::runtime_error(format_old("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
 magic, version));
 }
 void read_hparams() {
@@ -593,7 +594,7 @@ struct llama_v3_file_loader {
 file.read_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * n_dims);
 std::string name = file.read_string(name_len);
 if (n_dims < 1 || n_dims > 2) {
-throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims));
+throw std::runtime_error(format_old("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims));
 }
 switch (tensor.type) {
 case GGML_TYPE_F32:
@@ -610,7 +611,7 @@ struct llama_v3_file_loader {
 case GGML_TYPE_Q6_K:
 break;
 default: {
-throw std::runtime_error(format("unrecognized tensor type %u\n", tensor.type));
+throw std::runtime_error(format_old("unrecognized tensor type %u\n", tensor.type));
 }
 }

@@ -721,11 +722,11 @@ struct llama_v3_model_loader {
 struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
 auto it = tensors_map.name_to_idx.find(name);
 if (it == tensors_map.name_to_idx.end()) {
-throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str())));
+throw std::runtime_error(std::runtime_error(format_old("llama.cpp: tensor '%s' is missing from model", name.c_str())));
 }
 llama_v3_load_tensor & lt = tensors_map.tensors.at(it->second);
 if (lt.ne != ne) {
-throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
+throw std::runtime_error(format_old("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
 name.c_str(), llama_v3_format_tensor_shape(ne).c_str(), llama_v3_format_tensor_shape(lt.ne).c_str()));
 }

@@ -869,7 +870,7 @@ static bool kv_cache_init(
 const int64_t n_mem = n_layer*n_ctx;
 const int64_t n_elements = n_embd*n_mem;

-cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
+cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB3);
 cache.n = 0;

 struct ggml_init_params params;
@@ -952,7 +953,7 @@ bool llama_v3_mlock_supported() {
 return llama_v3_mlock::SUPPORTED;
 }

-int get_blas_batch_mul(int batch)
+int get_blas_batch_mul3(int batch)
 {
 return (batch>512?(batch>1024?4:2):1);
 }
@@ -1027,14 +1028,14 @@ const char * llama_v3_ftype_name(enum llama_v3_ftype ftype) {
 }
 }

-static const char * llama_v3_model_type_name(e_model type) {
+static const char * llama_v3_model_type_name(e_model3 type) {
 switch (type) {
-case MODEL_3B: return "3B";
-case MODEL_7B: return "7B";
-case MODEL_13B: return "13B";
-case MODEL_30B: return "30B";
-case MODEL_65B: return "65B";
-case MODEL_70B: return "70B";
+case MODEL_3B_3: return "3B";
+case MODEL_7B_3: return "7B";
+case MODEL_13B_3: return "13B";
+case MODEL_30B_3: return "30B";
+case MODEL_65B_3: return "65B";
+case MODEL_70B_3: return "70B";
 default: LLAMA_V3_ASSERT(false);
 }
 }
@@ -1062,7 +1063,7 @@ static void llama_v3_model_load_internal(
 void * progress_callback_user_data) {

 model.t_start_us = ggml_time_us();
-size_t blasbatchmul = get_blas_batch_mul(n_batch);
+size_t blasbatchmul = get_blas_batch_mul3(n_batch);

 std::unique_ptr<llama_v3_model_loader> ml(new llama_v3_model_loader(fname, use_mmap));

@@ -1078,15 +1079,15 @@ static void llama_v3_model_load_internal(

 {
 switch (hparams.n_layer) {
-case 26: model.type = e_model::MODEL_3B; break;
-case 32: model.type = e_model::MODEL_7B; break;
-case 40: model.type = e_model::MODEL_13B; break;
-case 60: model.type = e_model::MODEL_30B; break;
-case 80: model.type = e_model::MODEL_65B; break;
+case 26: model.type = e_model3::MODEL_3B_3; break;
+case 32: model.type = e_model3::MODEL_7B_3; break;
+case 40: model.type = e_model3::MODEL_13B_3; break;
+case 60: model.type = e_model3::MODEL_30B_3; break;
+case 80: model.type = e_model3::MODEL_65B_3; break;
 default:
 {
 if (hparams.n_layer < 32) {
-model.type = e_model::MODEL_7B;
+model.type = e_model3::MODEL_7B_3;
 }
 } break;
 }
@@ -1096,15 +1097,15 @@ static void llama_v3_model_load_internal(
 // LLaMAv2
 // TODO: temporary until GGUF
 //patch for llama2 gqa
-if (model.type == e_model::MODEL_65B && (hparams.n_mult >= 4096 && hparams.n_mult != 5504)) {
+if (model.type == e_model3::MODEL_65B_3 && (hparams.n_mult >= 4096 && hparams.n_mult != 5504)) {
 fprintf(stderr, "%s: Applying KCPP Patch for 70B model, setting GQA to 8\n", __func__);
 n_gqa = 8;
 }
 LLAMA_V3_ASSERT(hparams.n_head % n_gqa == 0);
 hparams.n_head_kv = hparams.n_head / n_gqa;
-if (model.type == e_model::MODEL_65B && n_gqa == 8) {
+if (model.type == e_model3::MODEL_65B_3 && n_gqa == 8) {
 LLAMA_V3_LOG_WARN("%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
-model.type = e_model::MODEL_70B;
+model.type = e_model3::MODEL_70B_3;
 hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model
 }

@@ -1180,7 +1181,7 @@ static void llama_v3_model_load_internal(

 model.ctx = ggml_init(params);
 if (!model.ctx) {
-throw std::runtime_error(format("ggml_init() failed"));
+throw std::runtime_error(format_old("ggml_init() failed"));
 }
 }

@@ -1289,9 +1290,9 @@ static void llama_v3_model_load_internal(

 #ifndef LLAMA_V3_USE_ALLOCATOR
 mem_required +=
-blasbatchmul*MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
-blasbatchmul*MEM_REQ_SCRATCH1().at(model.type) +
-blasbatchmul*MEM_REQ_EVAL().at(model.type);
+blasbatchmul*MEM_REQ_SCRATCH0_3(hparams.n_ctx).at(model.type) +
+blasbatchmul*MEM_REQ_SCRATCH1_3().at(model.type) +
+blasbatchmul*MEM_REQ_EVAL_3().at(model.type);
 #endif

 // this is the memory required by one llama_v3_state
@@ -1308,8 +1309,8 @@ static void llama_v3_model_load_internal(
 LLAMA_V3_LOG_INFO("%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
 ggml_cuda_set_scratch_size(0); // disable scratch
 } else {
-const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE().at(model.type);
-const size_t vram_scratch_per_context = VRAM_REQ_SCRATCH_PER_CONTEXT().at(model.type);
+const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE_3().at(model.type);
+const size_t vram_scratch_per_context = VRAM_REQ_SCRATCH_PER_CONTEXT_3().at(model.type);
 vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context);
 ggml_cuda_set_scratch_size(vram_scratch);
 if (n_gpu_layers > 0) {
@@ -1872,10 +1873,10 @@ static bool llama_v3_eval_internal(
 ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
 }
 } else {
-ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
+llv3_graph_compute_helper(lctx.work_buffer, gf, n_threads);
 }
 #else
-ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
+llv3_graph_compute_helper(lctx.work_buffer, gf, n_threads);
 #endif

 #if GGML_USE_MPI
@@ -1939,7 +1940,7 @@ static bool llama_v3_eval_internal(
 // tokenizer
 //

-static size_t utf8_len(char src) {
+static size_t utf8_len3(char src) {
 const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
 uint8_t highbits = static_cast<uint8_t>(src) >> 4;
 return lookup[highbits];
@@ -1980,7 +1981,7 @@ struct llama_v3_tokenizer {
 size_t offs = 0;
 while (offs < text.size()) {
 llama_v3_sp_symbol sym;
-size_t char_len = std::min(text.size() - offs, utf8_len(text[offs]));
+size_t char_len = std::min(text.size() - offs, utf8_len3(text[offs]));
 sym.text = text.c_str() + offs;
 sym.n = char_len;
 offs += char_len;
@@ -2076,6 +2077,24 @@ private:
 llama_v3_sp_bigram::queue work_queue_;
 };

+std::vector<llama_token> llama_v3_tokenize(
+struct llama_v3_context * ctx,
+const std::string & text,
+bool add_bos) {
+// upper limit for the number of tokens
+int n_tokens = text.length() + add_bos;
+std::vector<llama_token> result(n_tokens);
+n_tokens = llama_v3_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
+if (n_tokens < 0) {
+result.resize(-n_tokens);
+int check = llama_v3_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
+GGML_ASSERT(check == -n_tokens);
+} else {
+result.resize(n_tokens);
+}
+return result;
+}
+
 static std::vector<llama_v3_vocab::id> llama_v3_tokenize(const llama_v3_vocab & vocab, const std::string & text, bool bos) {
 llama_v3_tokenizer tokenizer(vocab);
 std::vector<llama_v3_vocab::id> output;
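The llama_v3_tokenize wrapper added above follows the usual two-pass pattern: size the output buffer for the worst case (one token per byte plus the optional BOS), and if the C-style tokenizer reports a negative count, resize to exactly that many tokens and tokenize again. A hedged usage sketch, assuming a llama_v3_context obtained elsewhere (for example via llama_v3_init_from_file as in the loader changes earlier in this commit):

// Illustrative only; error handling and context setup are omitted.
std::vector<llama_token> toks = llama_v3_tokenize(ctx, "Hello, world", /*add_bos=*/true);
printf("prompt tokenized into %zu tokens\n", toks.size());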
@@ -3010,10 +3029,10 @@ static void llama_v3_convert_tensor_internal(const llama_v3_load_tensor & tensor
 if (ggml_is_quantized(tensor.type)) {
 qtype = ggml_internal_get_type_traits(tensor.type);
 if (qtype.to_float == NULL) {
-throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
+throw std::runtime_error(format_old("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
 }
 } else if (tensor.type != GGML_TYPE_F16) {
-throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor.type)));
+throw std::runtime_error(format_old("cannot dequantize/convert tensor type %s", ggml_type_name(tensor.type)));
 }

 if (nthread < 2) {
@@ -3084,7 +3103,7 @@ static void llama_v3_model_quantize_internal(const std::string & fname_inp, cons
 case LLAMA_V3_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
 case LLAMA_V3_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
 #endif
-default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
+default: throw std::runtime_error(format_old("invalid output file type %d\n", ftype));
 }

 if (nthread <= 0) {
@@ -3209,7 +3228,7 @@ static void llama_v3_model_quantize_internal(const std::string & fname_inp, cons
 if (tensor.type == GGML_TYPE_F32) {
 f32_data = (float *) tensor.data;
 } else if (ggml_is_quantized(tensor.type) && !params->allow_requantize) {
-throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor.type)));
+throw std::runtime_error(format_old("requantizing from type %s is disabled", ggml_type_name(tensor.type)));
 } else {
 llama_v3_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
 f32_data = (float *) f32_conv_buf.addr;
@@ -3348,7 +3367,7 @@ struct llama_v3_context * llama_v3_new_context_with_model(
 params.seed = time(NULL);
 }

-size_t blasbatchmul = get_blas_batch_mul(params.n_batch);
+size_t blasbatchmul = get_blas_batch_mul3(params.n_batch);

 unsigned cur_percentage = 0;
 if (params.progress_callback == NULL) {
@@ -3430,9 +3449,9 @@ struct llama_v3_context * llama_v3_new_context_with_model(

 // debug - for comparison with scratch buffer
 //size_t prev_req =
-// MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type) +
-// MEM_REQ_SCRATCH1().at(ctx->model.type) +
-// MEM_REQ_EVAL().at(ctx->model.type);
+// MEM_REQ_SCRATCH0_3(hparams.n_ctx).at(ctx->model.type) +
+// MEM_REQ_SCRATCH1_3().at(ctx->model.type) +
+// MEM_REQ_EVAL_3().at(ctx->model.type);
 //LLAMA_V3_LOG_INFO("%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0);

 // recreate allocator with exact memory requirements
@@ -3447,12 +3466,12 @@ struct llama_v3_context * llama_v3_new_context_with_model(
 #endif
 }
 #else
-ctx->buf_compute.resize(blasbatchmul*MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
+ctx->buf_compute.resize(blasbatchmul*MEM_REQ_EVAL_3().at(ctx->model.type) + ggml_graph_overhead());
 #endif

 #ifdef LLAMA_V3_USE_SCRATCH
-ctx->buf_scratch[0].resize(blasbatchmul*MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
-ctx->buf_scratch[1].resize(blasbatchmul*MEM_REQ_SCRATCH1().at(ctx->model.type));
+ctx->buf_scratch[0].resize(blasbatchmul*MEM_REQ_SCRATCH0_3(hparams.n_ctx).at(ctx->model.type));
+ctx->buf_scratch[1].resize(blasbatchmul*MEM_REQ_SCRATCH1_3().at(ctx->model.type));
 #endif
 }

@@ -3711,7 +3730,7 @@ int llama_v3_apply_lora_from_file_internal(const struct llama_v3_model & model,
 #ifdef GGML_USE_CUBLAS
 if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) {
 if (dest_t->type != GGML_TYPE_F16) {
-throw std::runtime_error(format(
+throw std::runtime_error(format_old(
 "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models", __func__));
 }
 offload_func = ggml_cuda_assign_buffers;
@@ -3791,7 +3810,7 @@ int llama_v3_apply_lora_from_file_internal(const struct llama_v3_model & model,

 struct ggml_cgraph gf = ggml_build_forward(r);

-ggml_graph_compute_helper(work_buffer, &gf, n_threads);
+llv3_graph_compute_helper(work_buffer, &gf, n_threads);

 // we won't need these tensors again, reset the context to save memory
 ggml_free(lora_ctx);
@@ -3977,7 +3996,7 @@ void llama_v3_copy_state_data_internal(struct llama_v3_context * ctx, llama_v3_d

 ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
 ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
-ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
+llv3_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);

 ggml_free(cpy_ctx);

@@ -4087,7 +4106,7 @@ size_t llama_v3_set_state_data(struct llama_v3_context * ctx, uint8_t * src) {

 ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
 ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
-ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
+llv3_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);

 ggml_free(cpy_ctx);
 }
@@ -4419,8 +4438,8 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_v3_intern


 void llama_v3_log_set(llama_v3_log_callback log_callback, void * user_data) {
-g_state.log_callback = log_callback ? log_callback : llama_v3_log_callback_default;
-g_state.log_callback_user_data = user_data;
+llv3_g_state.log_callback = log_callback ? log_callback : llama_v3_log_callback_default;
+llv3_g_state.log_callback_user_data = user_data;
 }

 #if defined(_MSC_VER) && !defined(vsnprintf)
@@ -4433,12 +4452,12 @@ static void llama_v3_log_internal_v(llama_v3_log_level level, const char * forma
 char buffer[128];
 int len = vsnprintf(buffer, 128, format, args);
 if (len < 128) {
-g_state.log_callback(level, buffer, g_state.log_callback_user_data);
+llv3_g_state.log_callback(level, buffer, llv3_g_state.log_callback_user_data);
 } else {
 char* buffer2 = new char[len+1];
 vsnprintf(buffer2, len+1, format, args_copy);
 buffer2[len] = 0;
-g_state.log_callback(level, buffer2, g_state.log_callback_user_data);
+llv3_g_state.log_callback(level, buffer2, llv3_g_state.log_callback_user_data);
 delete[] buffer2;
 }
 va_end(args_copy);