loader: refactor tensor weights storage

2024-10-18 15:31:10 +08:00 · 2024-10-18 15:31:10 +08:00 · f742e3c0d5
commit f742e3c0d5
parent 61408e7fad
1 changed files with 53 additions and 58 deletions
--- a/src/llama.cpp
+++ b/src/llama.cpp
@ -4271,17 +4271,17 @@ struct llama_model_loader {

        ggml_tensor * tensor;

-        llama_tensor_weight(const llama_file * file, uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
-            const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
+        llama_tensor_weight(const llama_file * file, uint16_t idx, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
+            const int tensor_idx = gguf_find_tensor(gguf_ctx,  ggml_get_name(tensor));
            offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);

            if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size) {
-                throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", name));
+                throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", ggml_get_name(tensor)));
            }
        }
    };
-    std::vector<llama_tensor_weight> weights;

+    std::unordered_map<std::string, struct llama_tensor_weight> weights_map;
    std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;

    struct gguf_context * meta = NULL;
@ -4323,7 +4323,14 @@ struct llama_model_loader {
        // For subsidiary files, `meta` tensor data offset must not be used,
        // so we build a unified tensors index for weights.
        for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
-            weights.emplace_back(files.back().get(), 0, cur->name, meta, cur);
+            std::string tensor_name = std::string(cur->name);
+            // make sure there is no duplicated tensor names
+            if (weights_map.find(tensor_name) != weights_map.end()) {
+                throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
+            }
+            n_elements += ggml_nelements(cur);
+            n_bytes    += ggml_nbytes(cur);
+            weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, meta, cur));
        }
        uint16_t n_split = 0;
        get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
@ -4363,7 +4370,14 @@ struct llama_model_loader {

                // Save tensors data offset info of the shard.
                for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
-                    weights.emplace_back(files.back().get(), idx, cur->name, ctx_gguf, cur);
+                    std::string tensor_name = std::string(cur->name);
+                    // make sure there is no duplicated tensor names
+                    if (weights_map.find(tensor_name) != weights_map.end()) {
+                        throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
+                    }
+                    n_elements += ggml_nelements(cur);
+                    n_bytes    += ggml_nbytes(cur);
+                    weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf, cur));
                }

                gguf_free(ctx_gguf);
@ -4373,7 +4387,7 @@ struct llama_model_loader {

            // sanity check
            {
-                const int n_tensors_loaded = (int) weights.size();
+                const int n_tensors_loaded = (int) weights_map.size();
                if (n_tensors != n_tensors_loaded) {
                    throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
                }
@ -4383,23 +4397,10 @@ struct llama_model_loader {
        }

        n_kv      = gguf_get_n_kv(meta);
-        n_tensors = weights.size();
+        n_tensors = weights_map.size();

        fver = (enum llama_fver) gguf_get_version(meta);

-        std::set<std::string> tensor_names;
-        for (auto & w : weights) {
-            n_elements += ggml_nelements(w.tensor);
-            n_bytes    += ggml_nbytes(w.tensor);
-            // make sure there is no duplicated tensor names
-            const std::string name(w.tensor->name);
-            auto found = tensor_names.find(name);
-            if (found != tensor_names.end()) {
-                throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", w.tensor->name));
-            }
-            tensor_names.insert(name);
-        }
-
        LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
                __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));

@ -4411,8 +4412,10 @@ struct llama_model_loader {
            uint32_t n_type_max = 0;
            enum ggml_type type_max = GGML_TYPE_F32;

-            for (int i = 0; i < n_tensors; i++) {
-                const ggml_tensor * tensor = weights.at(i).tensor;
+            for (auto it = weights_map.begin(); it != weights_map.end(); it++) {
+                const llama_tensor_weight & w = it->second;
+                const ggml_tensor * tensor = w.tensor;
+
                enum ggml_type type = tensor->type;

                n_type[type]++;
@ -4423,8 +4426,8 @@ struct llama_model_loader {
                }

                if (trace > 0) {
-                    const uint16_t sid = weights.at(i).idx;
-                    LLAMA_LOG_INFO("%s: - tensor %4d, split %2d: %32s %-8s [ %s ]\n", __func__, i, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str());
+                    const uint16_t sid = w.idx;
+                    LLAMA_LOG_INFO("%s: - tensor split %2d: %32s %-8s [ %s ]\n", __func__, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str());
                }
            }

@ -4688,21 +4691,15 @@ struct llama_model_loader {
        return llm_kv.arch;
    }

-    const char * get_tensor_name(int i) const {
-        return weights.at(i).tensor->name;
-    }
-
    const llama_tensor_weight * get_weight(const char * name) const {
-        for (const auto & weight : weights) {
-            if (strcmp(name, weight.tensor->name) == 0) {
-                return &weight;
-            }
-        }
-        return nullptr;
-    }
+        std::string tensor_name(name);

-    const llama_tensor_weight * get_weight(int i) const {
-        return get_weight(get_tensor_name(i));
+        auto pos = weights_map.find(tensor_name);
+        if (pos != weights_map.end()) {
+            return &pos->second;
+        }
+
+        return nullptr;
    }

    const llama_tensor_weight & require_weight(const char * name) const {
@ -4729,10 +4726,6 @@ struct llama_model_loader {
        return tensor;
    }

-    struct ggml_tensor * get_tensor_meta(int i) const {
-        return get_tensor_meta(get_tensor_name(i));
-    }
-
    const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const {
        const struct ggml_tensor * cur = get_tensor_meta(name.c_str());

@ -4839,8 +4832,8 @@ struct llama_model_loader {
        }

        // compute the total size of all tensors for progress reporting
-        for (auto & w : weights) {
-            size_data += ggml_nbytes(w.tensor);
+        for (auto it = weights_map.begin(); it != weights_map.end(); it++) {
+            size_data += ggml_nbytes(it->second.tensor);
        }
    }

@ -18595,10 +18588,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        }
    }

-    for (int i = 0; i < ml.n_tensors; ++i) {
-        const struct ggml_tensor * meta = ml.get_tensor_meta(i);
+    for (auto it = ml.weights_map.begin(); it != ml.weights_map.end(); ++it) {
+        const struct ggml_tensor * tensor = it->second.tensor;

-        const std::string name = ggml_get_name(meta);
+        const std::string name = ggml_get_name(tensor);

        // TODO: avoid hardcoded tensor names - use the TN_* constants
        if (name.find("attn_v.weight")   != std::string::npos ||
@ -18636,20 +18629,22 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
    std::vector<no_init<float>> f32_conv_buf;

    uint16_t n_split = 1;
+    const auto & weights_map = ml.weights_map;
+
    // Assume split index is continuous
    if (params->keep_split) {
-        for (int i = 0; i < ml.n_tensors; ++i) {
-            n_split = std::max(uint16_t(ml.get_weight(i)->idx+1), n_split);
+        for (auto it = weights_map.begin(); it != weights_map.end(); ++it) {
+            n_split = std::max(uint16_t(it->second.idx+1), n_split);
        }
+
    }
    std::vector<gguf_context*> ctx_outs(n_split, NULL);
    ctx_outs[0] = ctx_out;

    // populate the original tensors so we get an initial meta data
-    for (int i = 0; i < ml.n_tensors; ++i) {
-        auto weight = ml.get_weight(i);
-        uint16_t i_split = params->keep_split ? weight->idx : 0;
-        struct ggml_tensor * tensor = weight->tensor;
+    for (auto it = weights_map.begin(); it != weights_map.end(); ++it) {
+        uint16_t i_split = params->keep_split ? it->second.idx : 0;
+        struct ggml_tensor * tensor = it->second.tensor;
        if (ctx_outs[i_split] == NULL) {
            ctx_outs[i_split] = gguf_init_empty();
        }
@ -18696,12 +18691,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

    const auto tn = LLM_TN(model.arch);
    new_ofstream(0);
-    for (int i = 0; i < ml.n_tensors; ++i) {
-        auto weight = ml.get_weight(i);
-        struct ggml_tensor * tensor = weight->tensor;
-        if (weight->idx != cur_split && params->keep_split) {
+    for (auto it = weights_map.begin(); it != weights_map.end(); ++it) {
+        auto weight = it->second;
+        struct ggml_tensor * tensor = weight.tensor;
+        if (weight.idx != cur_split && params->keep_split) {
            close_ofstream();
-            new_ofstream(weight->idx);
+            new_ofstream(weight.idx);
        }

        const std::string name = ggml_get_name(tensor);