llama : refactor llama_model_load_internal()
parent 8bd7f06b58
commit 3057d6a687
1 changed file with 189 additions and 171 deletions
llama.cpp | 360 changed lines (189 additions, 171 deletions)
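In short, the commit splits the monolithic llama_model_load_internal() into per-stage helpers: llm_load_hparams() fills model.hparams from the llama.* GGUF keys, llm_load_vocab() fills model.vocab from the tokenizer.ggml.* keys, llm_load_print_meta() logs the metadata, and llama_model_load_internal() keeps only the tensor loading and offload work. The sketch below condenses the new call order in llama_model_load() as it appears in the diff; the loader construction and the full parameter lists are elided and the function name is illustrative, so treat it as a sketch rather than a verbatim excerpt.

// Condensed sketch of the new load path (illustrative; `ml` is assumed to be an
// already-constructed llama_model_loader, and the parameter list is abbreviated).
static bool llama_model_load_sketch(
        llama_model_loader & ml,
        llama_model & model,
        int n_ctx, float rope_freq_base, float rope_freq_scale,
        bool vocab_only /* , n_batch, n_gpu_layers, ... */) {
    try {
        llm_load_hparams(ml, model, n_ctx, rope_freq_base, rope_freq_scale); // llama.* keys -> model.hparams
        llm_load_vocab(ml, model);                                           // tokenizer.ggml.* keys -> model.vocab

        llm_load_print_meta(ml, model);                                      // log hparams, general kv, special tokens

        if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
            throw std::runtime_error("vocab size mismatch");
        }

        if (vocab_only) {
            LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
            return true;
        }

        // tensor creation and GPU offload remain in llama_model_load_internal()
        llama_model_load_internal(ml, model /* , n_batch, n_gpu_layers, ... */);
        return true;
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
        return false;
    }
}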
@@ -208,7 +208,7 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES_BA
     },
 };
 
-static llm_arch llama_arch_from_string(const std::string & name) {
+static llm_arch llm_arch_from_string(const std::string & name) {
     for (const auto & kv : LLM_ARCH_NAMES) { // NOLINT
         if (kv.second == name) {
             return kv.first;
@@ -836,6 +836,9 @@ struct llama_model {
     e_model type = MODEL_UNKNOWN;
     llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
 
+    std::string name = "n/a";
+    std::string arch = "n/a";
+
     llama_hparams hparams;
     llama_vocab vocab;
 
@@ -1358,38 +1361,34 @@ static const char * llama_model_type_name(e_model type) {
     }
 }
 
-static void llama_model_load_internal(
+static void llm_load_vocab(
         llama_model_loader & ml,
-        llama_model & model,
-        llama_vocab & vocab,
-        int n_ctx,
-        int n_batch,
-        int n_gpu_layers,
-        int main_gpu,
-        const float * tensor_split,
-        const bool mul_mat_q,
-        float rope_freq_base,
-        float rope_freq_scale,
-        bool low_vram,
-        ggml_type memory_type,
-        bool use_mmap,
-        bool use_mlock,
-        bool vocab_only,
-        llama_progress_callback progress_callback,
-        void * progress_callback_user_data) {
-    model.t_start_us = ggml_time_us();
-
-    model.n_gpu_layers = n_gpu_layers;
-
-    auto & hparams = model.hparams;
-
-    std::string general_name = "n/a";
-    std::string general_arch = "n/a";
-
-    // read hparams
+        llama_model & model) {
+    auto & vocab = model.vocab;
+
+    struct gguf_context * ctx = ml.ctx_gguf;
+
+    const int token_idx = gguf_find_key(ctx, "tokenizer.ggml.tokens");
+    if (token_idx == -1) {
+        throw std::runtime_error("cannot find tokenizer vocab in model file\n");
+    }
+
+    const int score_idx = gguf_find_key(ctx, "tokenizer.ggml.scores");
+    if (score_idx == -1) {
+        throw std::runtime_error("cannot find tokenizer scores in model file\n");
+    }
+
+    const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
+
+    const int toktype_idx = gguf_find_key(ctx, "tokenizer.ggml.token_type");
+    if (toktype_idx == -1) {
+        throw std::runtime_error("cannot find token type list in GGUF file\n");
+    }
+
+    const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
+
+    // determine vocab type
     {
-        struct gguf_context * ctx = ml.ctx_gguf;
-
         std::string tokenizer_name;
         GGUF_GET_KEY(ctx, tokenizer_name, gguf_get_val_str, GGUF_TYPE_STRING, true, "tokenizer.ggml.model");
@@ -1402,155 +1401,161 @@ static void llama_model_load_internal(
             LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
             vocab.type = LLAMA_VOCAB_TYPE_SPM;
         }
 
-        // get hparams kv
-        GGUF_GET_KEY(ctx, hparams.n_vocab, gguf_get_arr_n, GGUF_TYPE_ARRAY, true, "tokenizer.ggml.tokens");
-        GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.context_length");
-        GGUF_GET_KEY(ctx, hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.embedding_length");
-        GGUF_GET_KEY(ctx, hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.feed_forward_length");
-        GGUF_GET_KEY(ctx, hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.attention.head_count");
-        GGUF_GET_KEY(ctx, hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.block_count");
-        GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.rope.dimension_count");
-        GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, "llama.attention.layer_norm_rms_epsilon");
-
-        // n_head_kv is optional, default to n_head
-        hparams.n_head_kv = hparams.n_head;
-        GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "llama.attention.head_count_kv");
-
-        // TODO: manually setting rope scale should override this
-        // rope_freq_scale (inverse of the kv) is optional
-        float ropescale = 1.0f;
-        GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, "llama.rope.scale_linear");
-        if (ropescale != 1.0f) {
-            rope_freq_scale = 1.0f/ropescale;
-        }
-
-        // get general kv
-        GGUF_GET_KEY(ctx, general_name, gguf_get_val_str, GGUF_TYPE_STRING, false, "general.name");
-        GGUF_GET_KEY(ctx, general_arch, gguf_get_val_str, GGUF_TYPE_STRING, false, "general.architecture");
-
-        // special tokens
-        GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.bos_token_id");
-        GGUF_GET_KEY(ctx, vocab.special_eos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.eos_token_id");
-        GGUF_GET_KEY(ctx, vocab.special_unk_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.unknown_token_id");
-        GGUF_GET_KEY(ctx, vocab.special_sep_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.separator_token_id");
-        GGUF_GET_KEY(ctx, vocab.special_pad_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.padding_token_id");
-
-        switch (hparams.n_layer) {
-            case 26: model.type = e_model::MODEL_3B; break;
-            case 32: model.type = e_model::MODEL_7B; break;
-            case 40: model.type = e_model::MODEL_13B; break;
-            case 60: model.type = e_model::MODEL_30B; break;
-            case 80: model.type = e_model::MODEL_65B; break;
-            default:
-                {
-                    if (hparams.n_layer < 32) {
-                        model.type = e_model::MODEL_7B;
-                    }
-                } break;
-        }
-
-        model.ftype = ml.ftype;
-
-        hparams.n_ctx = n_ctx;
-
-        // LLaMAv2
-        // TODO: probably not needed
-        {
-            const auto n_gqa = hparams.n_gqa();
-
-            if (model.type == e_model::MODEL_65B && n_gqa == 8) {
-                LLAMA_LOG_WARN("%s: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
-                model.type = e_model::MODEL_70B;
-            }
-        }
-
-        hparams.rope_freq_base = rope_freq_base;
-        hparams.rope_freq_scale = rope_freq_scale;
     }
 
-    // read vocab
+    const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
+
+    vocab.id_to_token.resize(n_vocab);
+
+    for (uint32_t i = 0; i < n_vocab; i++) {
+        std::string word = gguf_get_arr_str(ctx, token_idx, i);
+
+        vocab.token_to_id[word] = i;
+
+        auto & token_data = vocab.id_to_token[i];
+        token_data.text = std::move(word);
+        token_data.score = scores[i];
+        token_data.type = (llama_token_type) toktypes[i];
+
+        // determine the newline token: 0x0A == 10 == '\n'
+        if (token_data.text == "<0x0A>") {
+            vocab.linefeed_id = i;
+        }
+    }
+
+    // special tokens
+    GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.bos_token_id");
+    GGUF_GET_KEY(ctx, vocab.special_eos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.eos_token_id");
+    GGUF_GET_KEY(ctx, vocab.special_unk_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.unknown_token_id");
+    GGUF_GET_KEY(ctx, vocab.special_sep_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.separator_token_id");
+    GGUF_GET_KEY(ctx, vocab.special_pad_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.padding_token_id");
+}
+
+static void llm_load_hparams(
+        llama_model_loader & ml,
+        llama_model & model,
+        int n_ctx,
+        float rope_freq_base,
+        float rope_freq_scale) {
+    auto & hparams = model.hparams;
+
+    struct gguf_context * ctx = ml.ctx_gguf;
+
+    // get hparams kv
+    GGUF_GET_KEY(ctx, hparams.n_vocab, gguf_get_arr_n, GGUF_TYPE_ARRAY, true, "tokenizer.ggml.tokens");
+    GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.context_length");
+    GGUF_GET_KEY(ctx, hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.embedding_length");
+    GGUF_GET_KEY(ctx, hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.feed_forward_length");
+    GGUF_GET_KEY(ctx, hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.attention.head_count");
+    GGUF_GET_KEY(ctx, hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.block_count");
+    GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.rope.dimension_count");
+    GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, "llama.attention.layer_norm_rms_epsilon");
+
+    // n_head_kv is optional, default to n_head
+    hparams.n_head_kv = hparams.n_head;
+    GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "llama.attention.head_count_kv");
+
+    // TODO: manually setting rope scale should override this
+    // rope_freq_scale (inverse of the kv) is optional
+    float ropescale = 1.0f;
+    GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, "llama.rope.scale_linear");
+    if (ropescale != 1.0f) {
+        rope_freq_scale = 1.0f/ropescale;
+    }
+
+    // get general kv
+    GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, "general.name");
+    GGUF_GET_KEY(ctx, model.arch, gguf_get_val_str, GGUF_TYPE_STRING, false, "general.architecture");
+
+    switch (hparams.n_layer) {
+        case 26: model.type = e_model::MODEL_3B; break;
+        case 32: model.type = e_model::MODEL_7B; break;
+        case 40: model.type = e_model::MODEL_13B; break;
+        case 60: model.type = e_model::MODEL_30B; break;
+        case 80: model.type = e_model::MODEL_65B; break;
+        default:
+            {
+                if (hparams.n_layer < 32) {
+                    model.type = e_model::MODEL_7B;
+                }
+            } break;
+    }
+
+    model.ftype = ml.ftype;
+
+    hparams.n_ctx = n_ctx;
+
+    // LLaMAv2
+    // TODO: probably not needed
     {
-        struct gguf_context * ctx = ml.ctx_gguf;
+        const auto n_gqa = hparams.n_gqa();
 
-        vocab.id_to_token.resize(hparams.n_vocab);
-
-        const int token_idx = gguf_find_key(ctx, "tokenizer.ggml.tokens");
-        if (token_idx == -1) {
-            throw std::runtime_error("cannot find tokenizer vocab in model file\n");
-        }
-
-        const int score_idx = gguf_find_key(ctx, "tokenizer.ggml.scores");
-        if (score_idx == -1) {
-            throw std::runtime_error("cannot find tokenizer scores in model file\n");
-        }
-
-        const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
-
-        const int toktype_idx = gguf_find_key(ctx, "tokenizer.ggml.token_type");
-        if (toktype_idx == -1) {
-            throw std::runtime_error("cannot find token type list in GGUF file\n");
-        }
-
-        const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
-
-        for (uint32_t i = 0; i < hparams.n_vocab; i++) {
-            std::string word = gguf_get_arr_str(ctx, token_idx, i);
-
-            vocab.token_to_id[word] = i;
-
-            auto & token_data = vocab.id_to_token[i];
-            token_data.text = std::move(word);
-            token_data.score = scores[i];
-            token_data.type = (llama_token_type) toktypes[i];
-
-            // determine the newline token: 0x0A == 10 == '\n'
-            if (token_data.text == "<0x0A>") {
-                vocab.linefeed_id = i;
-            }
+        if (model.type == e_model::MODEL_65B && n_gqa == 8) {
+            LLAMA_LOG_WARN("%s: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
+            model.type = e_model::MODEL_70B;
         }
     }
 
-    {
-        // hparams
-        LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
-        LLAMA_LOG_INFO("%s: arch = %s\n", __func__, general_arch.c_str());
-        LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix
-        LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
-        LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
-        LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, hparams.n_ctx);
-        LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
-        LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head);
-        LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
-        LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
-        LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
-        LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
-        LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
-        LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
-        LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
-        LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
-        LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
-        LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype));
-        LLAMA_LOG_INFO("%s: model size = %.2f B\n", __func__, ml.n_elements*1e-9);
-
-        // general kv
-        LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, general_name.c_str());
-
-        // special tokens
-        if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
-        if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
-        if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
-        if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
-        if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
-        if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
-    }
-
-    if (vocab_only) {
-        LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
-        return;
-    }
-
-    auto & ctx = model.ctx;
+    hparams.rope_freq_base = rope_freq_base;
+    hparams.rope_freq_scale = rope_freq_scale;
+}
+
+static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
+    const auto & hparams = model.hparams;
+    const auto & vocab = model.vocab;
+
+    // hparams
+    LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
+    LLAMA_LOG_INFO("%s: arch = %s\n", __func__, model.arch.c_str());
+    LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix
+    LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
+    LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
+    LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, hparams.n_ctx);
+    LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
+    LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head);
+    LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
+    LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
+    LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
+    LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
+    LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
+    LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
+    LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
+    LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
+    LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
+    LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype));
+    LLAMA_LOG_INFO("%s: model size = %.2f B\n", __func__, ml.n_elements*1e-9);
+
+    // general kv
+    LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
+
+    // special tokens
+    if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
+    if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
+    if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
+    if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
+    if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
+    if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
+}
+
+static void llama_model_load_internal(
+        llama_model_loader & ml,
+        llama_model & model,
+        int n_batch,
+        int n_gpu_layers,
+        int main_gpu,
+        const float * tensor_split,
+        const bool mul_mat_q,
+        bool low_vram,
+        ggml_type memory_type,
+        bool use_mlock,
+        llama_progress_callback progress_callback,
+        void * progress_callback_user_data) {
+    model.t_start_us = ggml_time_us();
+
+    auto & ctx = model.ctx;
+    auto & hparams = model.hparams;
+
+    model.n_gpu_layers = n_gpu_layers;
 
     size_t ctx_size;
     size_t mmapped_size;
@@ -1760,7 +1765,6 @@ static void llama_model_load_internal(
 static bool llama_model_load(
         const std::string & fname,
         llama_model & model,
-        llama_vocab & vocab,
         int n_ctx,
         int n_batch,
         int n_gpu_layers,
@@ -1782,14 +1786,28 @@ static bool llama_model_load(
         std::string arch_name;
         GGUF_GET_KEY(ml->ctx_gguf, arch_name, gguf_get_val_str, GGUF_TYPE_STRING, true, "general.architecture");
 
-        const llm_arch arch = llama_arch_from_string(arch_name);
+        const llm_arch arch = llm_arch_from_string(arch_name);
         if (arch == LLM_ARCH_UNKNOWN) {
             throw std::runtime_error("unknown model architecture: '" + arch_name + "'");
         }
 
-        llama_model_load_internal(*ml, model, vocab, n_ctx, n_batch, n_gpu_layers,
-                main_gpu, tensor_split, mul_mat_q, rope_freq_base, rope_freq_scale, low_vram, memory_type,
-                use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
+        llm_load_hparams(*ml, model, n_ctx, rope_freq_base, rope_freq_scale);
+        llm_load_vocab(*ml, model);
+
+        llm_load_print_meta(*ml, model);
+
+        if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
+            throw std::runtime_error("vocab size mismatch");
+        }
+
+        if (vocab_only) {
+            LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
+            return true;
+        }
+
+        llama_model_load_internal(*ml, model, n_batch, n_gpu_layers,
+                main_gpu, tensor_split, mul_mat_q, low_vram, memory_type,
+                use_mlock, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
@@ -4191,7 +4209,7 @@ struct llama_model * llama_load_model_from_file(
 
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
-    if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
+    if (!llama_model_load(path_model, *model, params.n_ctx, params.n_batch, params.n_gpu_layers,
                 params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,
                 params.low_vram, memory_type, params.use_mmap, params.use_mlock, params.vocab_only,
                 params.progress_callback, params.progress_callback_user_data)) {