llama : refactor llama_model_load_internal()
parent 8bd7f06b58
commit 3057d6a687

1 changed file changed with 189 additions and 171 deletions

llama.cpp
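The hunks below split the monolithic llama_model_load_internal() into per-stage helpers (llm_load_hparams, llm_load_vocab, llm_load_print_meta) and leave llama_model_load_internal() responsible only for the tensor data. For orientation, this is the resulting call order inside llama_model_load(), condensed and paraphrased from the last hunks of this diff (parameter lists abbreviated, not a verbatim copy):

    // inside llama_model_load(), after resolving the architecture string:
    llm_load_hparams   (*ml, model, n_ctx, rope_freq_base, rope_freq_scale); // hyperparameters from GGUF kv pairs
    llm_load_vocab     (*ml, model);                                         // tokenizer data into model.vocab
    llm_load_print_meta(*ml, model);                                         // log what was read

    if (vocab_only) {
        return true;   // tokenizer-only load: stop before any tensor work
    }

    llama_model_load_internal(*ml, model, ...);  // tensors only, with a reduced parameter list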
@@ -208,7 +208,7 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES_BA
     },
 };
 
-static llm_arch llama_arch_from_string(const std::string & name) {
+static llm_arch llm_arch_from_string(const std::string & name) {
     for (const auto & kv : LLM_ARCH_NAMES) { // NOLINT
         if (kv.second == name) {
             return kv.first;
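The renamed llm_arch_from_string() keeps the existing reverse-lookup pattern: architectures are registered once in the LLM_ARCH_NAMES map, and the string read from the GGUF key general.architecture is resolved by scanning that map. A self-contained sketch of the same pattern, with hypothetical enum values and names standing in for the real tables in llama.cpp:

    #include <map>
    #include <string>

    enum my_arch { MY_ARCH_LLAMA, MY_ARCH_FALCON, MY_ARCH_UNKNOWN };

    static const std::map<my_arch, std::string> MY_ARCH_NAMES = {
        { MY_ARCH_LLAMA,  "llama"  },
        { MY_ARCH_FALCON, "falcon" },
    };

    // reverse lookup: name -> enum; unknown names map to MY_ARCH_UNKNOWN,
    // which the caller turns into a runtime error (see the llama_model_load hunk further down)
    static my_arch my_arch_from_string(const std::string & name) {
        for (const auto & kv : MY_ARCH_NAMES) {
            if (kv.second == name) {
                return kv.first;
            }
        }
        return MY_ARCH_UNKNOWN;
    }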
@@ -836,6 +836,9 @@ struct llama_model {
     e_model type = MODEL_UNKNOWN;
     llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
 
+    std::string name = "n/a";
+    std::string arch = "n/a";
+
     llama_hparams hparams;
     llama_vocab vocab;
 
@@ -1358,38 +1361,34 @@ static const char * llama_model_type_name(e_model type) {
     }
 }
 
-static void llama_model_load_internal(
+static void llm_load_vocab(
         llama_model_loader & ml,
-        llama_model & model,
-        llama_vocab & vocab,
-        int n_ctx,
-        int n_batch,
-        int n_gpu_layers,
-        int main_gpu,
-        const float * tensor_split,
-        const bool mul_mat_q,
-        float rope_freq_base,
-        float rope_freq_scale,
-        bool low_vram,
-        ggml_type memory_type,
-        bool use_mmap,
-        bool use_mlock,
-        bool vocab_only,
-        llama_progress_callback progress_callback,
-        void * progress_callback_user_data) {
-    model.t_start_us = ggml_time_us();
-
-    model.n_gpu_layers = n_gpu_layers;
-
-    auto & hparams = model.hparams;
-
-    std::string general_name = "n/a";
-    std::string general_arch = "n/a";
-
-    // read hparams
-    {
+        llama_model & model) {
+    auto & vocab = model.vocab;
+
     struct gguf_context * ctx = ml.ctx_gguf;
 
+    const int token_idx = gguf_find_key(ctx, "tokenizer.ggml.tokens");
+    if (token_idx == -1) {
+        throw std::runtime_error("cannot find tokenizer vocab in model file\n");
+    }
+
+    const int score_idx = gguf_find_key(ctx, "tokenizer.ggml.scores");
+    if (score_idx == -1) {
+        throw std::runtime_error("cannot find tokenizer scores in model file\n");
+    }
+
+    const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
+
+    const int toktype_idx = gguf_find_key(ctx, "tokenizer.ggml.token_type");
+    if (toktype_idx == -1) {
+        throw std::runtime_error("cannot find token type list in GGUF file\n");
+    }
+
+    const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
+
+    // determine vocab type
+    {
         std::string tokenizer_name;
         GGUF_GET_KEY(ctx, tokenizer_name, gguf_get_val_str, GGUF_TYPE_STRING, true, "tokenizer.ggml.model");
 
@@ -1402,6 +1401,45 @@ static void llama_model_load_internal(
             LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
             vocab.type = LLAMA_VOCAB_TYPE_SPM;
         }
+    }
+
+    const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
+
+    vocab.id_to_token.resize(n_vocab);
+
+    for (uint32_t i = 0; i < n_vocab; i++) {
+        std::string word = gguf_get_arr_str(ctx, token_idx, i);
+
+        vocab.token_to_id[word] = i;
+
+        auto & token_data = vocab.id_to_token[i];
+        token_data.text = std::move(word);
+        token_data.score = scores[i];
+        token_data.type = (llama_token_type) toktypes[i];
+
+        // determine the newline token: 0x0A == 10 == '\n'
+        if (token_data.text == "<0x0A>") {
+            vocab.linefeed_id = i;
+        }
+    }
+
+    // special tokens
+    GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.bos_token_id");
+    GGUF_GET_KEY(ctx, vocab.special_eos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.eos_token_id");
+    GGUF_GET_KEY(ctx, vocab.special_unk_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.unknown_token_id");
+    GGUF_GET_KEY(ctx, vocab.special_sep_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.separator_token_id");
+    GGUF_GET_KEY(ctx, vocab.special_pad_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.padding_token_id");
+}
+
+static void llm_load_hparams(
+        llama_model_loader & ml,
+        llama_model & model,
+        int n_ctx,
+        float rope_freq_base,
+        float rope_freq_scale) {
+    auto & hparams = model.hparams;
+
+    struct gguf_context * ctx = ml.ctx_gguf;
 
     // get hparams kv
     GGUF_GET_KEY(ctx, hparams.n_vocab, gguf_get_arr_n, GGUF_TYPE_ARRAY, true, "tokenizer.ggml.tokens");
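llm_load_vocab() reads three parallel GGUF arrays (token text, scores, token types) and rebuilds both directions of the vocab mapping, while llm_load_hparams() keeps reading scalar kv pairs through GGUF_GET_KEY. As a usage illustration, here is a small standalone program that dumps the first few vocab entries with the same key names and gguf_* calls; it assumes the gguf API that ggml.h exposes at this point (gguf_init_from_file, gguf_find_key, gguf_get_arr_n, gguf_get_arr_str, gguf_get_arr_data, gguf_free) and does only minimal error handling:

    #include <cstdint>
    #include <cstdio>
    #include "ggml.h"

    int main(int argc, char ** argv) {
        if (argc < 2) {
            fprintf(stderr, "usage: %s model.gguf\n", argv[0]);
            return 1;
        }

        // read only the metadata, do not allocate tensor data
        struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ NULL };
        struct gguf_context * ctx = gguf_init_from_file(argv[1], params);
        if (ctx == NULL) {
            fprintf(stderr, "failed to read %s\n", argv[1]);
            return 1;
        }

        // the same keys that llm_load_vocab() looks up
        const int token_idx = gguf_find_key(ctx, "tokenizer.ggml.tokens");
        const int score_idx = gguf_find_key(ctx, "tokenizer.ggml.scores");
        if (token_idx == -1 || score_idx == -1) {
            fprintf(stderr, "no tokenizer data in file\n");
            gguf_free(ctx);
            return 1;
        }

        const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
        const float * scores   = (const float *) gguf_get_arr_data(ctx, score_idx);

        for (uint32_t i = 0; i < n_vocab && i < 10; i++) {
            printf("%5u  %-24s  %8.3f\n", i, gguf_get_arr_str(ctx, token_idx, i), scores[i]);
        }

        gguf_free(ctx);
        return 0;
    }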
@@ -1426,15 +1464,8 @@ static void llama_model_load_internal(
     }
 
     // get general kv
-    GGUF_GET_KEY(ctx, general_name, gguf_get_val_str, GGUF_TYPE_STRING, false, "general.name");
-    GGUF_GET_KEY(ctx, general_arch, gguf_get_val_str, GGUF_TYPE_STRING, false, "general.architecture");
+    GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, "general.name");
+    GGUF_GET_KEY(ctx, model.arch, gguf_get_val_str, GGUF_TYPE_STRING, false, "general.architecture");
 
-    // special tokens
-    GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.bos_token_id");
-    GGUF_GET_KEY(ctx, vocab.special_eos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.eos_token_id");
-    GGUF_GET_KEY(ctx, vocab.special_unk_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.unknown_token_id");
-    GGUF_GET_KEY(ctx, vocab.special_sep_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.separator_token_id");
-    GGUF_GET_KEY(ctx, vocab.special_pad_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.padding_token_id");
-
     switch (hparams.n_layer) {
         case 26: model.type = e_model::MODEL_3B; break;
@@ -1469,52 +1500,13 @@ static void llama_model_load_internal(
         hparams.rope_freq_scale = rope_freq_scale;
     }
 
-    // read vocab
-    {
-        struct gguf_context * ctx = ml.ctx_gguf;
-
-        vocab.id_to_token.resize(hparams.n_vocab);
-
-        const int token_idx = gguf_find_key(ctx, "tokenizer.ggml.tokens");
-        if (token_idx == -1) {
-            throw std::runtime_error("cannot find tokenizer vocab in model file\n");
-        }
-
-        const int score_idx = gguf_find_key(ctx, "tokenizer.ggml.scores");
-        if (score_idx == -1) {
-            throw std::runtime_error("cannot find tokenizer scores in model file\n");
-        }
-
-        const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
-
-        const int toktype_idx = gguf_find_key(ctx, "tokenizer.ggml.token_type");
-        if (toktype_idx == -1) {
-            throw std::runtime_error("cannot find token type list in GGUF file\n");
-        }
-
-        const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
-
-        for (uint32_t i = 0; i < hparams.n_vocab; i++) {
-            std::string word = gguf_get_arr_str(ctx, token_idx, i);
-
-            vocab.token_to_id[word] = i;
-
-            auto & token_data = vocab.id_to_token[i];
-            token_data.text = std::move(word);
-            token_data.score = scores[i];
-            token_data.type = (llama_token_type) toktypes[i];
-
-            // determine the newline token: 0x0A == 10 == '\n'
-            if (token_data.text == "<0x0A>") {
-                vocab.linefeed_id = i;
-            }
-        }
-    }
-
-    {
+static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
+    const auto & hparams = model.hparams;
+    const auto & vocab   = model.vocab;
+
     // hparams
     LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
-    LLAMA_LOG_INFO("%s: arch = %s\n", __func__, general_arch.c_str());
+    LLAMA_LOG_INFO("%s: arch = %s\n", __func__, model.arch.c_str());
     LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix
     LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
     LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
@@ -1534,7 +1526,7 @@ static void llama_model_load_internal(
     LLAMA_LOG_INFO("%s: model size = %.2f B\n", __func__, ml.n_elements*1e-9);
 
     // general kv
-    LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, general_name.c_str());
+    LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
 
     // special tokens
     if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
@@ -1545,12 +1537,25 @@ static void llama_model_load_internal(
     if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
 }
 
-    if (vocab_only) {
-        LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
-        return;
-    }
-
+static void llama_model_load_internal(
+        llama_model_loader & ml,
+        llama_model & model,
+        int n_batch,
+        int n_gpu_layers,
+        int main_gpu,
+        const float * tensor_split,
+        const bool mul_mat_q,
+        bool low_vram,
+        ggml_type memory_type,
+        bool use_mlock,
+        llama_progress_callback progress_callback,
+        void * progress_callback_user_data) {
+    model.t_start_us = ggml_time_us();
+
     auto & ctx = model.ctx;
+    auto & hparams = model.hparams;
+
+    model.n_gpu_layers = n_gpu_layers;
 
     size_t ctx_size;
     size_t mmapped_size;
@@ -1760,7 +1765,6 @@ static void llama_model_load_internal(
 static bool llama_model_load(
         const std::string & fname,
         llama_model & model,
-        llama_vocab & vocab,
         int n_ctx,
         int n_batch,
         int n_gpu_layers,
|
@ -1782,14 +1786,28 @@ static bool llama_model_load(
|
||||||
std::string arch_name;
|
std::string arch_name;
|
||||||
GGUF_GET_KEY(ml->ctx_gguf, arch_name, gguf_get_val_str, GGUF_TYPE_STRING, true, "general.architecture");
|
GGUF_GET_KEY(ml->ctx_gguf, arch_name, gguf_get_val_str, GGUF_TYPE_STRING, true, "general.architecture");
|
||||||
|
|
||||||
const llm_arch arch = llama_arch_from_string(arch_name);
|
const llm_arch arch = llm_arch_from_string(arch_name);
|
||||||
if (arch == LLM_ARCH_UNKNOWN) {
|
if (arch == LLM_ARCH_UNKNOWN) {
|
||||||
throw std::runtime_error("unknown model architecture: '" + arch_name + "'");
|
throw std::runtime_error("unknown model architecture: '" + arch_name + "'");
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_model_load_internal(*ml, model, vocab, n_ctx, n_batch, n_gpu_layers,
|
llm_load_hparams(*ml, model, n_ctx, rope_freq_base, rope_freq_scale);
|
||||||
main_gpu, tensor_split, mul_mat_q, rope_freq_base, rope_freq_scale, low_vram, memory_type,
|
llm_load_vocab(*ml, model);
|
||||||
use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
|
|
||||||
|
llm_load_print_meta(*ml, model);
|
||||||
|
|
||||||
|
if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
|
||||||
|
throw std::runtime_error("vocab size mismatch");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (vocab_only) {
|
||||||
|
LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_model_load_internal(*ml, model, n_batch, n_gpu_layers,
|
||||||
|
main_gpu, tensor_split, mul_mat_q, low_vram, memory_type,
|
||||||
|
use_mlock, progress_callback, progress_callback_user_data);
|
||||||
return true;
|
return true;
|
||||||
} catch (const std::exception & err) {
|
} catch (const std::exception & err) {
|
||||||
LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
|
LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
|
||||||
|
@@ -4191,7 +4209,7 @@ struct llama_model * llama_load_model_from_file(
 
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
-    if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
+    if (!llama_model_load(path_model, *model, params.n_ctx, params.n_batch, params.n_gpu_layers,
                 params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,
                 params.low_vram, memory_type, params.use_mmap, params.use_mlock, params.vocab_only,
                 params.progress_callback, params.progress_callback_user_data)) {
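From the caller's side nothing changes in this commit: llama_load_model_from_file() still takes llama_context_params and forwards the same fields, it only stops passing model->vocab separately. A minimal caller sketch, assuming the llama.h API of this period (llama_context_default_params, llama_load_model_from_file, llama_free_model); with vocab_only = true the load now returns right after the new llm_load_vocab/llm_load_print_meta stages, before any tensors are touched:

    #include <cstdio>
    #include "llama.h"

    int main(int argc, char ** argv) {
        if (argc < 2) {
            fprintf(stderr, "usage: %s model.gguf\n", argv[0]);
            return 1;
        }

        struct llama_context_params params = llama_context_default_params();
        params.vocab_only = true;   // stop after the metadata/vocab stages

        struct llama_model * model = llama_load_model_from_file(argv[1], params);
        if (model == NULL) {
            fprintf(stderr, "failed to load %s\n", argv[1]);
            return 1;
        }

        printf("tokenizer loaded from %s\n", argv[1]);

        llama_free_model(model);
        return 0;
    }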