llama : refactor llama_model_load_internal()
parent 8bd7f06b58
commit 3057d6a687

1 changed file changed with 189 additions and 171 deletions

llama.cpp
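The hunks below split the monolithic llama_model_load_internal() into per-stage helpers (llm_load_hparams, llm_load_vocab, llm_load_print_meta) and leave llama_model_load_internal() responsible only for the tensor data. For orientation, this is the resulting call order inside llama_model_load(), condensed and paraphrased from the last hunks of this diff (parameter lists abbreviated, not a verbatim copy):

    // inside llama_model_load(), after resolving the architecture string:
    llm_load_hparams   (*ml, model, n_ctx, rope_freq_base, rope_freq_scale); // hyperparameters from GGUF kv pairs
    llm_load_vocab     (*ml, model);                                         // tokenizer data into model.vocab
    llm_load_print_meta(*ml, model);                                         // log what was read

    if (vocab_only) {
        return true;   // tokenizer-only load: stop before any tensor work
    }

    llama_model_load_internal(*ml, model, ...);  // tensors only, with a reduced parameter list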
@@ -208,7 +208,7 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES_BA
     },
 };
 
-static llm_arch llama_arch_from_string(const std::string & name) {
+static llm_arch llm_arch_from_string(const std::string & name) {
     for (const auto & kv : LLM_ARCH_NAMES) { // NOLINT
         if (kv.second == name) {
             return kv.first;
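The renamed llm_arch_from_string() keeps the existing reverse-lookup pattern: architectures are registered once in the LLM_ARCH_NAMES map, and the string read from the GGUF key general.architecture is resolved by scanning that map. A self-contained sketch of the same pattern, with hypothetical enum values and names standing in for the real tables in llama.cpp:

    #include <map>
    #include <string>

    enum my_arch { MY_ARCH_LLAMA, MY_ARCH_FALCON, MY_ARCH_UNKNOWN };

    static const std::map<my_arch, std::string> MY_ARCH_NAMES = {
        { MY_ARCH_LLAMA,  "llama"  },
        { MY_ARCH_FALCON, "falcon" },
    };

    // reverse lookup: name -> enum; unknown names map to MY_ARCH_UNKNOWN,
    // which the caller turns into a runtime error (see the llama_model_load hunk further down)
    static my_arch my_arch_from_string(const std::string & name) {
        for (const auto & kv : MY_ARCH_NAMES) {
            if (kv.second == name) {
                return kv.first;
            }
        }
        return MY_ARCH_UNKNOWN;
    }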
@@ -836,6 +836,9 @@ struct llama_model {
     e_model type = MODEL_UNKNOWN;
     llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
 
+    std::string name = "n/a";
+    std::string arch = "n/a";
+
     llama_hparams hparams;
     llama_vocab vocab;
 
@@ -1358,38 +1361,34 @@ static const char * llama_model_type_name(e_model type) {
     }
 }
 
-static void llama_model_load_internal(
+static void llm_load_vocab(
         llama_model_loader & ml,
-        llama_model & model,
-        llama_vocab & vocab,
-        int n_ctx,
-        int n_batch,
-        int n_gpu_layers,
-        int main_gpu,
-        const float * tensor_split,
-        const bool mul_mat_q,
-        float rope_freq_base,
-        float rope_freq_scale,
-        bool low_vram,
-        ggml_type memory_type,
-        bool use_mmap,
-        bool use_mlock,
-        bool vocab_only,
-        llama_progress_callback progress_callback,
-        void * progress_callback_user_data) {
-    model.t_start_us = ggml_time_us();
-
-    model.n_gpu_layers = n_gpu_layers;
-
-    auto & hparams = model.hparams;
-
-    std::string general_name = "n/a";
-    std::string general_arch = "n/a";
-
-    // read hparams
-    {
+        llama_model & model) {
+    auto & vocab = model.vocab;
+
     struct gguf_context * ctx = ml.ctx_gguf;
 
+    const int token_idx = gguf_find_key(ctx, "tokenizer.ggml.tokens");
+    if (token_idx == -1) {
+        throw std::runtime_error("cannot find tokenizer vocab in model file\n");
+    }
+
+    const int score_idx = gguf_find_key(ctx, "tokenizer.ggml.scores");
+    if (score_idx == -1) {
+        throw std::runtime_error("cannot find tokenizer scores in model file\n");
+    }
+
+    const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
+
+    const int toktype_idx = gguf_find_key(ctx, "tokenizer.ggml.token_type");
+    if (toktype_idx == -1) {
+        throw std::runtime_error("cannot find token type list in GGUF file\n");
+    }
+
+    const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
+
+    // determine vocab type
+    {
         std::string tokenizer_name;
         GGUF_GET_KEY(ctx, tokenizer_name, gguf_get_val_str, GGUF_TYPE_STRING, true, "tokenizer.ggml.model");
 
@@ -1402,6 +1401,45 @@ static void llama_model_load_internal(
             LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
             vocab.type = LLAMA_VOCAB_TYPE_SPM;
         }
+    }
+
+    const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
+
+    vocab.id_to_token.resize(n_vocab);
+
+    for (uint32_t i = 0; i < n_vocab; i++) {
+        std::string word = gguf_get_arr_str(ctx, token_idx, i);
+
+        vocab.token_to_id[word] = i;
+
+        auto & token_data = vocab.id_to_token[i];
+        token_data.text = std::move(word);
+        token_data.score = scores[i];
+        token_data.type = (llama_token_type) toktypes[i];
+
+        // determine the newline token: 0x0A == 10 == '\n'
+        if (token_data.text == "<0x0A>") {
+            vocab.linefeed_id = i;
+        }
+    }
+
+    // special tokens
+    GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.bos_token_id");
+    GGUF_GET_KEY(ctx, vocab.special_eos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.eos_token_id");
+    GGUF_GET_KEY(ctx, vocab.special_unk_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.unknown_token_id");
+    GGUF_GET_KEY(ctx, vocab.special_sep_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.separator_token_id");
+    GGUF_GET_KEY(ctx, vocab.special_pad_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.padding_token_id");
+}
+
+static void llm_load_hparams(
+        llama_model_loader & ml,
+        llama_model & model,
+        int n_ctx,
+        float rope_freq_base,
+        float rope_freq_scale) {
+    auto & hparams = model.hparams;
+
+    struct gguf_context * ctx = ml.ctx_gguf;
 
     // get hparams kv
     GGUF_GET_KEY(ctx, hparams.n_vocab, gguf_get_arr_n, GGUF_TYPE_ARRAY, true, "tokenizer.ggml.tokens");
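llm_load_vocab() reads three parallel GGUF arrays (token text, scores, token types) and rebuilds both directions of the vocab mapping, while llm_load_hparams() keeps reading scalar kv pairs through GGUF_GET_KEY. As a usage illustration, here is a small standalone program that dumps the first few vocab entries with the same key names and gguf_* calls; it assumes the gguf API that ggml.h exposes at this point (gguf_init_from_file, gguf_find_key, gguf_get_arr_n, gguf_get_arr_str, gguf_get_arr_data, gguf_free) and does only minimal error handling:

    #include <cstdint>
    #include <cstdio>
    #include "ggml.h"

    int main(int argc, char ** argv) {
        if (argc < 2) {
            fprintf(stderr, "usage: %s model.gguf\n", argv[0]);
            return 1;
        }

        // read only the metadata, do not allocate tensor data
        struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ NULL };
        struct gguf_context * ctx = gguf_init_from_file(argv[1], params);
        if (ctx == NULL) {
            fprintf(stderr, "failed to read %s\n", argv[1]);
            return 1;
        }

        // the same keys that llm_load_vocab() looks up
        const int token_idx = gguf_find_key(ctx, "tokenizer.ggml.tokens");
        const int score_idx = gguf_find_key(ctx, "tokenizer.ggml.scores");
        if (token_idx == -1 || score_idx == -1) {
            fprintf(stderr, "no tokenizer data in file\n");
            gguf_free(ctx);
            return 1;
        }

        const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
        const float * scores   = (const float *) gguf_get_arr_data(ctx, score_idx);

        for (uint32_t i = 0; i < n_vocab && i < 10; i++) {
            printf("%5u  %-24s  %8.3f\n", i, gguf_get_arr_str(ctx, token_idx, i), scores[i]);
        }

        gguf_free(ctx);
        return 0;
    }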
@@ -1426,15 +1464,8 @@ static void llama_model_load_internal(
     }
 
     // get general kv
-    GGUF_GET_KEY(ctx, general_name, gguf_get_val_str, GGUF_TYPE_STRING, false, "general.name");
-    GGUF_GET_KEY(ctx, general_arch, gguf_get_val_str, GGUF_TYPE_STRING, false, "general.architecture");
+    GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, "general.name");
+    GGUF_GET_KEY(ctx, model.arch, gguf_get_val_str, GGUF_TYPE_STRING, false, "general.architecture");
 
-    // special tokens
-    GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.bos_token_id");
-    GGUF_GET_KEY(ctx, vocab.special_eos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.eos_token_id");
-    GGUF_GET_KEY(ctx, vocab.special_unk_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.unknown_token_id");
-    GGUF_GET_KEY(ctx, vocab.special_sep_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.separator_token_id");
-    GGUF_GET_KEY(ctx, vocab.special_pad_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.padding_token_id");
-
     switch (hparams.n_layer) {
         case 26: model.type = e_model::MODEL_3B; break;
@@ -1469,52 +1500,13 @@ static void llama_model_load_internal(
         hparams.rope_freq_scale = rope_freq_scale;
     }
 
-    // read vocab
-    {
-        struct gguf_context * ctx = ml.ctx_gguf;
-
-        vocab.id_to_token.resize(hparams.n_vocab);
-
-        const int token_idx = gguf_find_key(ctx, "tokenizer.ggml.tokens");
-        if (token_idx == -1) {
-            throw std::runtime_error("cannot find tokenizer vocab in model file\n");
-        }
-
-        const int score_idx = gguf_find_key(ctx, "tokenizer.ggml.scores");
-        if (score_idx == -1) {
-            throw std::runtime_error("cannot find tokenizer scores in model file\n");
-        }
-
-        const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
-
-        const int toktype_idx = gguf_find_key(ctx, "tokenizer.ggml.token_type");
-        if (toktype_idx == -1) {
-            throw std::runtime_error("cannot find token type list in GGUF file\n");
-        }
-
-        const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
-
-        for (uint32_t i = 0; i < hparams.n_vocab; i++) {
-            std::string word = gguf_get_arr_str(ctx, token_idx, i);
-
-            vocab.token_to_id[word] = i;
-
-            auto & token_data = vocab.id_to_token[i];
-            token_data.text = std::move(word);
-            token_data.score = scores[i];
-            token_data.type = (llama_token_type) toktypes[i];
-
-            // determine the newline token: 0x0A == 10 == '\n'
-            if (token_data.text == "<0x0A>") {
-                vocab.linefeed_id = i;
-            }
-        }
-    }
-
-    {
+static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
+    const auto & hparams = model.hparams;
+    const auto & vocab   = model.vocab;
+
     // hparams
     LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
-    LLAMA_LOG_INFO("%s: arch = %s\n", __func__, general_arch.c_str());
+    LLAMA_LOG_INFO("%s: arch = %s\n", __func__, model.arch.c_str());
     LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix
     LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
     LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
@@ -1534,7 +1526,7 @@ static void llama_model_load_internal(
     LLAMA_LOG_INFO("%s: model size = %.2f B\n", __func__, ml.n_elements*1e-9);
 
     // general kv
-    LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, general_name.c_str());
+    LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
 
     // special tokens
     if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
@@ -1545,12 +1537,25 @@ static void llama_model_load_internal(
     if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
 }
 
-    if (vocab_only) {
-        LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
-        return;
-    }
-
+static void llama_model_load_internal(
+        llama_model_loader & ml,
+        llama_model & model,
+        int n_batch,
+        int n_gpu_layers,
+        int main_gpu,
+        const float * tensor_split,
+        const bool mul_mat_q,
+        bool low_vram,
+        ggml_type memory_type,
+        bool use_mlock,
+        llama_progress_callback progress_callback,
+        void * progress_callback_user_data) {
+    model.t_start_us = ggml_time_us();
+
     auto & ctx = model.ctx;
+    auto & hparams = model.hparams;
+
+    model.n_gpu_layers = n_gpu_layers;
 
     size_t ctx_size;
     size_t mmapped_size;
@@ -1760,7 +1765,6 @@ static void llama_model_load_internal(
 static bool llama_model_load(
         const std::string & fname,
         llama_model & model,
-        llama_vocab & vocab,
         int n_ctx,
         int n_batch,
         int n_gpu_layers,
|
@ -1782,14 +1786,28 @@ static bool llama_model_load(
|
||||||
std::string arch_name;
|
std::string arch_name;
|
||||||
GGUF_GET_KEY(ml->ctx_gguf, arch_name, gguf_get_val_str, GGUF_TYPE_STRING, true, "general.architecture");
|
GGUF_GET_KEY(ml->ctx_gguf, arch_name, gguf_get_val_str, GGUF_TYPE_STRING, true, "general.architecture");
|
||||||
|
|
||||||
const llm_arch arch = llama_arch_from_string(arch_name);
|
const llm_arch arch = llm_arch_from_string(arch_name);
|
||||||
if (arch == LLM_ARCH_UNKNOWN) {
|
if (arch == LLM_ARCH_UNKNOWN) {
|
||||||
throw std::runtime_error("unknown model architecture: '" + arch_name + "'");
|
throw std::runtime_error("unknown model architecture: '" + arch_name + "'");
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_model_load_internal(*ml, model, vocab, n_ctx, n_batch, n_gpu_layers,
|
llm_load_hparams(*ml, model, n_ctx, rope_freq_base, rope_freq_scale);
|
||||||
main_gpu, tensor_split, mul_mat_q, rope_freq_base, rope_freq_scale, low_vram, memory_type,
|
llm_load_vocab(*ml, model);
|
||||||
use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
|
|
||||||
|
llm_load_print_meta(*ml, model);
|
||||||
|
|
||||||
|
if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
|
||||||
|
throw std::runtime_error("vocab size mismatch");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (vocab_only) {
|
||||||
|
LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_model_load_internal(*ml, model, n_batch, n_gpu_layers,
|
||||||
|
main_gpu, tensor_split, mul_mat_q, low_vram, memory_type,
|
||||||
|
use_mlock, progress_callback, progress_callback_user_data);
|
||||||
return true;
|
return true;
|
||||||
} catch (const std::exception & err) {
|
} catch (const std::exception & err) {
|
||||||
LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
|
LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
|
||||||
|
@@ -4191,7 +4209,7 @@ struct llama_model * llama_load_model_from_file(
 
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
-    if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
+    if (!llama_model_load(path_model, *model, params.n_ctx, params.n_batch, params.n_gpu_layers,
                 params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,
                 params.low_vram, memory_type, params.use_mmap, params.use_mlock, params.vocab_only,
                 params.progress_callback, params.progress_callback_user_data)) {
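From the caller's side nothing changes in this commit: llama_load_model_from_file() still takes llama_context_params and forwards the same fields, it only stops passing model->vocab separately. A minimal caller sketch, assuming the llama.h API of this period (llama_context_default_params, llama_load_model_from_file, llama_free_model); with vocab_only = true the load now returns right after the new llm_load_vocab/llm_load_print_meta stages, before any tensors are touched:

    #include <cstdio>
    #include "llama.h"

    int main(int argc, char ** argv) {
        if (argc < 2) {
            fprintf(stderr, "usage: %s model.gguf\n", argv[0]);
            return 1;
        }

        struct llama_context_params params = llama_context_default_params();
        params.vocab_only = true;   // stop after the metadata/vocab stages

        struct llama_model * model = llama_load_model_from_file(argv[1], params);
        if (model == NULL) {
            fprintf(stderr, "failed to load %s\n", argv[1]);
            return 1;
        }

        printf("tokenizer loaded from %s\n", argv[1]);

        llama_free_model(model);
        return 0;
    }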