llama : refactor llama_model_load_internal()

Georgi Gerganov 2023-08-22 19:30:02 +03:00
parent 8bd7f06b58
commit 3057d6a687

llama.cpp (360 changed lines)

@@ -208,7 +208,7 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES_BA
     },
 };
 
-static llm_arch llama_arch_from_string(const std::string & name) {
+static llm_arch llm_arch_from_string(const std::string & name) {
     for (const auto & kv : LLM_ARCH_NAMES) { // NOLINT
         if (kv.second == name) {
             return kv.first;
@@ -836,6 +836,9 @@ struct llama_model {
     e_model     type  = MODEL_UNKNOWN;
     llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
 
+    std::string name = "n/a";
+    std::string arch = "n/a";
+
     llama_hparams hparams;
     llama_vocab   vocab;
@@ -1358,38 +1361,34 @@ static const char * llama_model_type_name(e_model type) {
     }
 }
 
-static void llama_model_load_internal(
+static void llm_load_vocab(
         llama_model_loader & ml,
-        llama_model & model,
-        llama_vocab & vocab,
-        int n_ctx,
-        int n_batch,
-        int n_gpu_layers,
-        int main_gpu,
-        const float * tensor_split,
-        const bool mul_mat_q,
-        float rope_freq_base,
-        float rope_freq_scale,
-        bool low_vram,
-        ggml_type memory_type,
-        bool use_mmap,
-        bool use_mlock,
-        bool vocab_only,
-        llama_progress_callback progress_callback,
-        void * progress_callback_user_data) {
-    model.t_start_us = ggml_time_us();
-
-    model.n_gpu_layers = n_gpu_layers;
-
-    auto & hparams = model.hparams;
-
-    std::string general_name = "n/a";
-    std::string general_arch = "n/a";
-
-    // read hparams
+        llama_model & model) {
+    auto & vocab = model.vocab;
+
+    struct gguf_context * ctx = ml.ctx_gguf;
+
+    const int token_idx = gguf_find_key(ctx, "tokenizer.ggml.tokens");
+    if (token_idx == -1) {
+        throw std::runtime_error("cannot find tokenizer vocab in model file\n");
+    }
+
+    const int score_idx = gguf_find_key(ctx, "tokenizer.ggml.scores");
+    if (score_idx == -1) {
+        throw std::runtime_error("cannot find tokenizer scores in model file\n");
+    }
+
+    const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
+
+    const int toktype_idx = gguf_find_key(ctx, "tokenizer.ggml.token_type");
+    if (toktype_idx == -1) {
+        throw std::runtime_error("cannot find token type list in GGUF file\n");
+    }
+
+    const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
+
+    // determine vocab type
     {
-        struct gguf_context * ctx = ml.ctx_gguf;
-
         std::string tokenizer_name;
         GGUF_GET_KEY(ctx, tokenizer_name, gguf_get_val_str, GGUF_TYPE_STRING, true, "tokenizer.ggml.model");
@@ -1402,155 +1401,161 @@ static void llama_model_load_internal(
             LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
             vocab.type = LLAMA_VOCAB_TYPE_SPM;
         }
 
-        // get hparams kv
-        GGUF_GET_KEY(ctx, hparams.n_vocab, gguf_get_arr_n, GGUF_TYPE_ARRAY, true, "tokenizer.ggml.tokens");
-        GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.context_length");
-        GGUF_GET_KEY(ctx, hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.embedding_length");
-        GGUF_GET_KEY(ctx, hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.feed_forward_length");
-        GGUF_GET_KEY(ctx, hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.attention.head_count");
-        GGUF_GET_KEY(ctx, hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.block_count");
-        GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.rope.dimension_count");
-        GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, "llama.attention.layer_norm_rms_epsilon");
-
-        // n_head_kv is optional, default to n_head
-        hparams.n_head_kv = hparams.n_head;
-        GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "llama.attention.head_count_kv");
-
-        // TODO: manually setting rope scale should override this
-        // rope_freq_scale (inverse of the kv) is optional
-        float ropescale = 1.0f;
-        GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, "llama.rope.scale_linear");
-        if (ropescale != 1.0f) {
-            rope_freq_scale = 1.0f/ropescale;
-        }
-
-        // get general kv
-        GGUF_GET_KEY(ctx, general_name, gguf_get_val_str, GGUF_TYPE_STRING, false, "general.name");
-        GGUF_GET_KEY(ctx, general_arch, gguf_get_val_str, GGUF_TYPE_STRING, false, "general.architecture");
-
-        // special tokens
-        GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.bos_token_id");
-        GGUF_GET_KEY(ctx, vocab.special_eos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.eos_token_id");
-        GGUF_GET_KEY(ctx, vocab.special_unk_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.unknown_token_id");
-        GGUF_GET_KEY(ctx, vocab.special_sep_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.separator_token_id");
-        GGUF_GET_KEY(ctx, vocab.special_pad_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.padding_token_id");
-
-        switch (hparams.n_layer) {
-            case 26: model.type = e_model::MODEL_3B; break;
-            case 32: model.type = e_model::MODEL_7B; break;
-            case 40: model.type = e_model::MODEL_13B; break;
-            case 60: model.type = e_model::MODEL_30B; break;
-            case 80: model.type = e_model::MODEL_65B; break;
-            default:
-                {
-                    if (hparams.n_layer < 32) {
-                        model.type = e_model::MODEL_7B;
-                    }
-                } break;
-        }
-
-        model.ftype = ml.ftype;
-
-        hparams.n_ctx = n_ctx;
-
-        // LLaMAv2
-        // TODO: probably not needed
-        {
-            const auto n_gqa = hparams.n_gqa();
-
-            if (model.type == e_model::MODEL_65B && n_gqa == 8) {
-                LLAMA_LOG_WARN("%s: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
-                model.type = e_model::MODEL_70B;
-            }
-        }
-
-        hparams.rope_freq_base = rope_freq_base;
-        hparams.rope_freq_scale = rope_freq_scale;
     }
 
-    // read vocab
+    const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
+
+    vocab.id_to_token.resize(n_vocab);
+
+    for (uint32_t i = 0; i < n_vocab; i++) {
+        std::string word = gguf_get_arr_str(ctx, token_idx, i);
+
+        vocab.token_to_id[word] = i;
+
+        auto & token_data = vocab.id_to_token[i];
+        token_data.text = std::move(word);
+        token_data.score = scores[i];
+        token_data.type = (llama_token_type) toktypes[i];
+
+        // determine the newline token: 0x0A == 10 == '\n'
+        if (token_data.text == "<0x0A>") {
+            vocab.linefeed_id = i;
+        }
+    }
+
+    // special tokens
+    GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.bos_token_id");
+    GGUF_GET_KEY(ctx, vocab.special_eos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.eos_token_id");
+    GGUF_GET_KEY(ctx, vocab.special_unk_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.unknown_token_id");
+    GGUF_GET_KEY(ctx, vocab.special_sep_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.separator_token_id");
+    GGUF_GET_KEY(ctx, vocab.special_pad_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.padding_token_id");
+}
+
+static void llm_load_hparams(
+        llama_model_loader & ml,
+        llama_model & model,
+        int n_ctx,
+        float rope_freq_base,
+        float rope_freq_scale) {
+    auto & hparams = model.hparams;
+
+    struct gguf_context * ctx = ml.ctx_gguf;
+
+    // get hparams kv
+    GGUF_GET_KEY(ctx, hparams.n_vocab, gguf_get_arr_n, GGUF_TYPE_ARRAY, true, "tokenizer.ggml.tokens");
+    GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.context_length");
+    GGUF_GET_KEY(ctx, hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.embedding_length");
+    GGUF_GET_KEY(ctx, hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.feed_forward_length");
+    GGUF_GET_KEY(ctx, hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.attention.head_count");
+    GGUF_GET_KEY(ctx, hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.block_count");
+    GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.rope.dimension_count");
+    GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, "llama.attention.layer_norm_rms_epsilon");
+
+    // n_head_kv is optional, default to n_head
+    hparams.n_head_kv = hparams.n_head;
+    GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "llama.attention.head_count_kv");
+
+    // TODO: manually setting rope scale should override this
+    // rope_freq_scale (inverse of the kv) is optional
+    float ropescale = 1.0f;
+    GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, "llama.rope.scale_linear");
+    if (ropescale != 1.0f) {
+        rope_freq_scale = 1.0f/ropescale;
+    }
+
+    // get general kv
+    GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, "general.name");
+    GGUF_GET_KEY(ctx, model.arch, gguf_get_val_str, GGUF_TYPE_STRING, false, "general.architecture");
+
+    switch (hparams.n_layer) {
+        case 26: model.type = e_model::MODEL_3B; break;
+        case 32: model.type = e_model::MODEL_7B; break;
+        case 40: model.type = e_model::MODEL_13B; break;
+        case 60: model.type = e_model::MODEL_30B; break;
+        case 80: model.type = e_model::MODEL_65B; break;
+        default:
+            {
+                if (hparams.n_layer < 32) {
+                    model.type = e_model::MODEL_7B;
+                }
+            } break;
+    }
+
+    model.ftype = ml.ftype;
+
+    hparams.n_ctx = n_ctx;
+
+    // LLaMAv2
+    // TODO: probably not needed
     {
-        struct gguf_context * ctx = ml.ctx_gguf;
-
-        vocab.id_to_token.resize(hparams.n_vocab);
-
-        const int token_idx = gguf_find_key(ctx, "tokenizer.ggml.tokens");
-        if (token_idx == -1) {
-            throw std::runtime_error("cannot find tokenizer vocab in model file\n");
-        }
-
-        const int score_idx = gguf_find_key(ctx, "tokenizer.ggml.scores");
-        if (score_idx == -1) {
-            throw std::runtime_error("cannot find tokenizer scores in model file\n");
-        }
-
-        const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
-
-        const int toktype_idx = gguf_find_key(ctx, "tokenizer.ggml.token_type");
-        if (toktype_idx == -1) {
-            throw std::runtime_error("cannot find token type list in GGUF file\n");
-        }
-
-        const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
-
-        for (uint32_t i = 0; i < hparams.n_vocab; i++) {
-            std::string word = gguf_get_arr_str(ctx, token_idx, i);
-
-            vocab.token_to_id[word] = i;
-
-            auto & token_data = vocab.id_to_token[i];
-            token_data.text = std::move(word);
-            token_data.score = scores[i];
-            token_data.type = (llama_token_type) toktypes[i];
-
-            // determine the newline token: 0x0A == 10 == '\n'
-            if (token_data.text == "<0x0A>") {
-                vocab.linefeed_id = i;
-            }
-        }
+        const auto n_gqa = hparams.n_gqa();
+
+        if (model.type == e_model::MODEL_65B && n_gqa == 8) {
+            LLAMA_LOG_WARN("%s: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
+            model.type = e_model::MODEL_70B;
+        }
     }
 
-    {
-        // hparams
-        LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
-        LLAMA_LOG_INFO("%s: arch = %s\n", __func__, general_arch.c_str());
-        LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix
-        LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
-        LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
-        LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, hparams.n_ctx);
-        LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
-        LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head);
-        LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
-        LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
-        LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
-        LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
-        LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
-        LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
-        LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
-        LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
-        LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
-        LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype));
-        LLAMA_LOG_INFO("%s: model size = %.2f B\n", __func__, ml.n_elements*1e-9);
-
-        // general kv
-        LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, general_name.c_str());
-
-        // special tokens
-        if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
-        if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
-        if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
-        if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
-        if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
-        if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
-    }
-
-    if (vocab_only) {
-        LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
-        return;
-    }
-
-    auto & ctx = model.ctx;
+    hparams.rope_freq_base = rope_freq_base;
+    hparams.rope_freq_scale = rope_freq_scale;
+}
+
+static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
+    const auto & hparams = model.hparams;
+    const auto & vocab = model.vocab;
+
+    // hparams
+    LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
+    LLAMA_LOG_INFO("%s: arch = %s\n", __func__, model.arch.c_str());
+    LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix
+    LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
+    LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
+    LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, hparams.n_ctx);
+    LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
+    LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head);
+    LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
+    LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
+    LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
+    LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
+    LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
+    LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
+    LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
+    LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
+    LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
+    LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype));
+    LLAMA_LOG_INFO("%s: model size = %.2f B\n", __func__, ml.n_elements*1e-9);
+
+    // general kv
+    LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
+
+    // special tokens
+    if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
+    if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
+    if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
+    if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
+    if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
+    if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
+}
+
+static void llama_model_load_internal(
+        llama_model_loader & ml,
+        llama_model & model,
+        int n_batch,
+        int n_gpu_layers,
+        int main_gpu,
+        const float * tensor_split,
+        const bool mul_mat_q,
+        bool low_vram,
+        ggml_type memory_type,
+        bool use_mlock,
+        llama_progress_callback progress_callback,
+        void * progress_callback_user_data) {
+    model.t_start_us = ggml_time_us();
+
+    auto & ctx = model.ctx;
+    auto & hparams = model.hparams;
+
+    model.n_gpu_layers = n_gpu_layers;
 
     size_t ctx_size;
     size_t mmapped_size;
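
Note: the tokenizer data that llm_load_vocab() consumes above is ordinary GGUF key/value and array data, so it can also be inspected outside of llama.cpp. The following minimal sketch is not part of this commit; it assumes the public gguf_* API that ships with ggml, and the model path is a placeholder. It dumps the first few vocabulary entries using the same keys the loader reads:

// sketch: dump the first tokenizer entries of a GGUF model file
// assumes the gguf API from ggml; the path below is a placeholder
#include "ggml.h"

#include <cstdio>

int main() {
    struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ NULL };

    struct gguf_context * ctx = gguf_init_from_file("models/7B/ggml-model-f16.gguf", params);
    if (ctx == NULL) {
        fprintf(stderr, "failed to open GGUF file\n");
        return 1;
    }

    // same keys that llm_load_vocab() looks up
    const int token_idx = gguf_find_key(ctx, "tokenizer.ggml.tokens");
    const int score_idx = gguf_find_key(ctx, "tokenizer.ggml.scores");
    if (token_idx == -1 || score_idx == -1) {
        fprintf(stderr, "tokenizer data not found\n");
        gguf_free(ctx);
        return 1;
    }

    const float * scores  = (const float *) gguf_get_arr_data(ctx, score_idx);
    const int     n_vocab = gguf_get_arr_n(ctx, token_idx);

    // print the first few (token, score) pairs
    for (int i = 0; i < n_vocab && i < 10; i++) {
        printf("%5d: '%s' (score %.2f)\n", i, gguf_get_arr_str(ctx, token_idx, i), scores[i]);
    }

    gguf_free(ctx);

    return 0;
}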
@@ -1760,7 +1765,6 @@ static void llama_model_load_internal(
 static bool llama_model_load(
         const std::string & fname,
         llama_model & model,
-        llama_vocab & vocab,
         int n_ctx,
         int n_batch,
         int n_gpu_layers,
@@ -1782,14 +1786,28 @@ static bool llama_model_load(
         std::string arch_name;
         GGUF_GET_KEY(ml->ctx_gguf, arch_name, gguf_get_val_str, GGUF_TYPE_STRING, true, "general.architecture");
 
-        const llm_arch arch = llama_arch_from_string(arch_name);
+        const llm_arch arch = llm_arch_from_string(arch_name);
         if (arch == LLM_ARCH_UNKNOWN) {
             throw std::runtime_error("unknown model architecture: '" + arch_name + "'");
         }
 
-        llama_model_load_internal(*ml, model, vocab, n_ctx, n_batch, n_gpu_layers,
-                main_gpu, tensor_split, mul_mat_q, rope_freq_base, rope_freq_scale, low_vram, memory_type,
-                use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
+        llm_load_hparams(*ml, model, n_ctx, rope_freq_base, rope_freq_scale);
+        llm_load_vocab(*ml, model);
+
+        llm_load_print_meta(*ml, model);
+
+        if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
+            throw std::runtime_error("vocab size mismatch");
+        }
+
+        if (vocab_only) {
+            LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
+            return true;
+        }
+
+        llama_model_load_internal(*ml, model, n_batch, n_gpu_layers,
+                main_gpu, tensor_split, mul_mat_q, low_vram, memory_type,
+                use_mlock, progress_callback, progress_callback_user_data);
 
         return true;
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
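
The public API is untouched by this refactor: callers still go through llama_load_model_from_file(), which now lands in the split-up loading path above. A minimal caller sketch follows (an illustration, not part of the diff; the model path is a placeholder). With the reorganized flow, a vocab_only load returns before any tensor data is read:

// sketch: load only the vocabulary/metadata of a model through the public API
#include "llama.h"

#include <cstdio>

int main(int argc, char ** argv) {
    const char * path = argc > 1 ? argv[1] : "models/7B/ggml-model-f16.gguf"; // placeholder

    llama_backend_init(false /*numa*/);

    llama_context_params params = llama_context_default_params();
    params.vocab_only = true; // llama_model_load() now returns right after llm_load_vocab()/llm_load_print_meta()

    llama_model * model = llama_load_model_from_file(path, params);
    if (model == NULL) {
        fprintf(stderr, "failed to load model from '%s'\n", path);
        llama_backend_free();
        return 1;
    }

    llama_free_model(model);
    llama_backend_free();

    return 0;
}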
@@ -4191,7 +4209,7 @@ struct llama_model * llama_load_model_from_file(
 
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
-    if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
+    if (!llama_model_load(path_model, *model, params.n_ctx, params.n_batch, params.n_gpu_layers,
                 params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,
                 params.low_vram, memory_type, params.use_mmap, params.use_mlock, params.vocab_only,
                 params.progress_callback, params.progress_callback_user_data)) {