diff --git a/llama.cpp b/llama.cpp index 618d1773d..ef711fa11 100644 --- a/llama.cpp +++ b/llama.cpp @@ -208,7 +208,7 @@ static std::map> LLM_TENSOR_NAMES_BA }, }; -static llm_arch llama_arch_from_string(const std::string & name) { +static llm_arch llm_arch_from_string(const std::string & name) { for (const auto & kv : LLM_ARCH_NAMES) { // NOLINT if (kv.second == name) { return kv.first; @@ -836,6 +836,9 @@ struct llama_model { e_model type = MODEL_UNKNOWN; llama_ftype ftype = LLAMA_FTYPE_ALL_F32; + std::string name = "n/a"; + std::string arch = "n/a"; + llama_hparams hparams; llama_vocab vocab; @@ -1358,38 +1361,34 @@ static const char * llama_model_type_name(e_model type) { } } -static void llama_model_load_internal( +static void llm_load_vocab( llama_model_loader & ml, - llama_model & model, - llama_vocab & vocab, - int n_ctx, - int n_batch, - int n_gpu_layers, - int main_gpu, - const float * tensor_split, - const bool mul_mat_q, - float rope_freq_base, - float rope_freq_scale, - bool low_vram, - ggml_type memory_type, - bool use_mmap, - bool use_mlock, - bool vocab_only, - llama_progress_callback progress_callback, - void * progress_callback_user_data) { - model.t_start_us = ggml_time_us(); + llama_model & model) { + auto & vocab = model.vocab; - model.n_gpu_layers = n_gpu_layers; + struct gguf_context * ctx = ml.ctx_gguf; - auto & hparams = model.hparams; + const int token_idx = gguf_find_key(ctx, "tokenizer.ggml.tokens"); + if (token_idx == -1) { + throw std::runtime_error("cannot find tokenizer vocab in model file\n"); + } - std::string general_name = "n/a"; - std::string general_arch = "n/a"; + const int score_idx = gguf_find_key(ctx, "tokenizer.ggml.scores"); + if (score_idx == -1) { + throw std::runtime_error("cannot find tokenizer scores in model file\n"); + } - // read hparams + const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx); + + const int toktype_idx = gguf_find_key(ctx, "tokenizer.ggml.token_type"); + if (toktype_idx == -1) { + throw std::runtime_error("cannot find token type list in GGUF file\n"); + } + + const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx); + + // determine vocab type { - struct gguf_context * ctx = ml.ctx_gguf; - std::string tokenizer_name; GGUF_GET_KEY(ctx, tokenizer_name, gguf_get_val_str, GGUF_TYPE_STRING, true, "tokenizer.ggml.model"); @@ -1402,155 +1401,161 @@ static void llama_model_load_internal( LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__); vocab.type = LLAMA_VOCAB_TYPE_SPM; } - - // get hparams kv - GGUF_GET_KEY(ctx, hparams.n_vocab, gguf_get_arr_n, GGUF_TYPE_ARRAY, true, "tokenizer.ggml.tokens"); - GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.context_length"); - GGUF_GET_KEY(ctx, hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.embedding_length"); - GGUF_GET_KEY(ctx, hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.feed_forward_length"); - GGUF_GET_KEY(ctx, hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.attention.head_count"); - GGUF_GET_KEY(ctx, hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.block_count"); - GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.rope.dimension_count"); - GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, "llama.attention.layer_norm_rms_epsilon"); - - // n_head_kv is optional, default to n_head - hparams.n_head_kv = hparams.n_head; - GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "llama.attention.head_count_kv"); - - // TODO: manually setting rope scale should override this - // rope_freq_scale (inverse of the kv) is optional - float ropescale = 1.0f; - GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, "llama.rope.scale_linear"); - if (ropescale != 1.0f) { - rope_freq_scale = 1.0f/ropescale; - } - - // get general kv - GGUF_GET_KEY(ctx, general_name, gguf_get_val_str, GGUF_TYPE_STRING, false, "general.name"); - GGUF_GET_KEY(ctx, general_arch, gguf_get_val_str, GGUF_TYPE_STRING, false, "general.architecture"); - - // special tokens - GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.bos_token_id"); - GGUF_GET_KEY(ctx, vocab.special_eos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.eos_token_id"); - GGUF_GET_KEY(ctx, vocab.special_unk_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.unknown_token_id"); - GGUF_GET_KEY(ctx, vocab.special_sep_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.separator_token_id"); - GGUF_GET_KEY(ctx, vocab.special_pad_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.padding_token_id"); - - switch (hparams.n_layer) { - case 26: model.type = e_model::MODEL_3B; break; - case 32: model.type = e_model::MODEL_7B; break; - case 40: model.type = e_model::MODEL_13B; break; - case 60: model.type = e_model::MODEL_30B; break; - case 80: model.type = e_model::MODEL_65B; break; - default: - { - if (hparams.n_layer < 32) { - model.type = e_model::MODEL_7B; - } - } break; - } - - model.ftype = ml.ftype; - - hparams.n_ctx = n_ctx; - - // LLaMAv2 - // TODO: probably not needed - { - const auto n_gqa = hparams.n_gqa(); - - if (model.type == e_model::MODEL_65B && n_gqa == 8) { - LLAMA_LOG_WARN("%s: assuming 70B model based on GQA == %d\n", __func__, n_gqa); - model.type = e_model::MODEL_70B; - } - } - - hparams.rope_freq_base = rope_freq_base; - hparams.rope_freq_scale = rope_freq_scale; } - // read vocab + const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx); + + vocab.id_to_token.resize(n_vocab); + + for (uint32_t i = 0; i < n_vocab; i++) { + std::string word = gguf_get_arr_str(ctx, token_idx, i); + + vocab.token_to_id[word] = i; + + auto & token_data = vocab.id_to_token[i]; + token_data.text = std::move(word); + token_data.score = scores[i]; + token_data.type = (llama_token_type) toktypes[i]; + + // determine the newline token: 0x0A == 10 == '\n' + if (token_data.text == "<0x0A>") { + vocab.linefeed_id = i; + } + } + + // special tokens + GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.bos_token_id"); + GGUF_GET_KEY(ctx, vocab.special_eos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.eos_token_id"); + GGUF_GET_KEY(ctx, vocab.special_unk_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.unknown_token_id"); + GGUF_GET_KEY(ctx, vocab.special_sep_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.separator_token_id"); + GGUF_GET_KEY(ctx, vocab.special_pad_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.padding_token_id"); +} + +static void llm_load_hparams( + llama_model_loader & ml, + llama_model & model, + int n_ctx, + float rope_freq_base, + float rope_freq_scale) { + auto & hparams = model.hparams; + + struct gguf_context * ctx = ml.ctx_gguf; + + // get hparams kv + GGUF_GET_KEY(ctx, hparams.n_vocab, gguf_get_arr_n, GGUF_TYPE_ARRAY, true, "tokenizer.ggml.tokens"); + GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.context_length"); + GGUF_GET_KEY(ctx, hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.embedding_length"); + GGUF_GET_KEY(ctx, hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.feed_forward_length"); + GGUF_GET_KEY(ctx, hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.attention.head_count"); + GGUF_GET_KEY(ctx, hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.block_count"); + GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.rope.dimension_count"); + GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, "llama.attention.layer_norm_rms_epsilon"); + + // n_head_kv is optional, default to n_head + hparams.n_head_kv = hparams.n_head; + GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "llama.attention.head_count_kv"); + + // TODO: manually setting rope scale should override this + // rope_freq_scale (inverse of the kv) is optional + float ropescale = 1.0f; + GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, "llama.rope.scale_linear"); + if (ropescale != 1.0f) { + rope_freq_scale = 1.0f/ropescale; + } + + // get general kv + GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, "general.name"); + GGUF_GET_KEY(ctx, model.arch, gguf_get_val_str, GGUF_TYPE_STRING, false, "general.architecture"); + + switch (hparams.n_layer) { + case 26: model.type = e_model::MODEL_3B; break; + case 32: model.type = e_model::MODEL_7B; break; + case 40: model.type = e_model::MODEL_13B; break; + case 60: model.type = e_model::MODEL_30B; break; + case 80: model.type = e_model::MODEL_65B; break; + default: + { + if (hparams.n_layer < 32) { + model.type = e_model::MODEL_7B; + } + } break; + } + + model.ftype = ml.ftype; + + hparams.n_ctx = n_ctx; + + // LLaMAv2 + // TODO: probably not needed { - struct gguf_context * ctx = ml.ctx_gguf; + const auto n_gqa = hparams.n_gqa(); - vocab.id_to_token.resize(hparams.n_vocab); - - const int token_idx = gguf_find_key(ctx, "tokenizer.ggml.tokens"); - if (token_idx == -1) { - throw std::runtime_error("cannot find tokenizer vocab in model file\n"); - } - - const int score_idx = gguf_find_key(ctx, "tokenizer.ggml.scores"); - if (score_idx == -1) { - throw std::runtime_error("cannot find tokenizer scores in model file\n"); - } - - const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx); - - const int toktype_idx = gguf_find_key(ctx, "tokenizer.ggml.token_type"); - if (toktype_idx == -1) { - throw std::runtime_error("cannot find token type list in GGUF file\n"); - } - - const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx); - - for (uint32_t i = 0; i < hparams.n_vocab; i++) { - std::string word = gguf_get_arr_str(ctx, token_idx, i); - - vocab.token_to_id[word] = i; - - auto & token_data = vocab.id_to_token[i]; - token_data.text = std::move(word); - token_data.score = scores[i]; - token_data.type = (llama_token_type) toktypes[i]; - - // determine the newline token: 0x0A == 10 == '\n' - if (token_data.text == "<0x0A>") { - vocab.linefeed_id = i; - } + if (model.type == e_model::MODEL_65B && n_gqa == 8) { + LLAMA_LOG_WARN("%s: assuming 70B model based on GQA == %d\n", __func__, n_gqa); + model.type = e_model::MODEL_70B; } } - { - // hparams - LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver)); - LLAMA_LOG_INFO("%s: arch = %s\n", __func__, general_arch.c_str()); - LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix - LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab); - LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train); - LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, hparams.n_ctx); - LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd); - LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head); - LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv); - LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer); - LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim - LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa()); - LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_rms_eps); - LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff); - LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base); - LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale); - LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type)); - LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype)); - LLAMA_LOG_INFO("%s: model size = %.2f B\n", __func__, ml.n_elements*1e-9); + hparams.rope_freq_base = rope_freq_base; + hparams.rope_freq_scale = rope_freq_scale; +} - // general kv - LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, general_name.c_str()); +static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { + const auto & hparams = model.hparams; + const auto & vocab = model.vocab; - // special tokens - if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); } - if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); } - if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); } - if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); } - if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); } - if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); } - } + // hparams + LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver)); + LLAMA_LOG_INFO("%s: arch = %s\n", __func__, model.arch.c_str()); + LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix + LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab); + LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train); + LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, hparams.n_ctx); + LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd); + LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head); + LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv); + LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer); + LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim + LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa()); + LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_rms_eps); + LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff); + LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base); + LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale); + LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type)); + LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype)); + LLAMA_LOG_INFO("%s: model size = %.2f B\n", __func__, ml.n_elements*1e-9); - if (vocab_only) { - LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__); - return; - } + // general kv + LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str()); - auto & ctx = model.ctx; + // special tokens + if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); } + if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); } + if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); } + if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); } + if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); } + if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); } +} + +static void llama_model_load_internal( + llama_model_loader & ml, + llama_model & model, + int n_batch, + int n_gpu_layers, + int main_gpu, + const float * tensor_split, + const bool mul_mat_q, + bool low_vram, + ggml_type memory_type, + bool use_mlock, + llama_progress_callback progress_callback, + void * progress_callback_user_data) { + model.t_start_us = ggml_time_us(); + + auto & ctx = model.ctx; + auto & hparams = model.hparams; + + model.n_gpu_layers = n_gpu_layers; size_t ctx_size; size_t mmapped_size; @@ -1760,7 +1765,6 @@ static void llama_model_load_internal( static bool llama_model_load( const std::string & fname, llama_model & model, - llama_vocab & vocab, int n_ctx, int n_batch, int n_gpu_layers, @@ -1782,14 +1786,28 @@ static bool llama_model_load( std::string arch_name; GGUF_GET_KEY(ml->ctx_gguf, arch_name, gguf_get_val_str, GGUF_TYPE_STRING, true, "general.architecture"); - const llm_arch arch = llama_arch_from_string(arch_name); + const llm_arch arch = llm_arch_from_string(arch_name); if (arch == LLM_ARCH_UNKNOWN) { throw std::runtime_error("unknown model architecture: '" + arch_name + "'"); } - llama_model_load_internal(*ml, model, vocab, n_ctx, n_batch, n_gpu_layers, - main_gpu, tensor_split, mul_mat_q, rope_freq_base, rope_freq_scale, low_vram, memory_type, - use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data); + llm_load_hparams(*ml, model, n_ctx, rope_freq_base, rope_freq_scale); + llm_load_vocab(*ml, model); + + llm_load_print_meta(*ml, model); + + if (model.hparams.n_vocab != model.vocab.id_to_token.size()) { + throw std::runtime_error("vocab size mismatch"); + } + + if (vocab_only) { + LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__); + return true; + } + + llama_model_load_internal(*ml, model, n_batch, n_gpu_layers, + main_gpu, tensor_split, mul_mat_q, low_vram, memory_type, + use_mlock, progress_callback, progress_callback_user_data); return true; } catch (const std::exception & err) { LLAMA_LOG_ERROR("error loading model: %s\n", err.what()); @@ -4191,7 +4209,7 @@ struct llama_model * llama_load_model_from_file( ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; - if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers, + if (!llama_model_load(path_model, *model, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale, params.low_vram, memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {