From 1892ae7eb1844f6704c0dd2ec0a4fe9508b77eb1 Mon Sep 17 00:00:00 2001
From: Pierrick HYMBERT
Date: Thu, 21 Mar 2024 19:11:37 +0100
Subject: [PATCH] llama_model_loader: PR feedback:
 - use only one gguf_context, for metadata only
 - store all ggml_context handles in a vector, as is done for the files and
   mappings
 - store all weights in a vector, each with its source tensor
 - rename ctx_gguf to meta
 - rename ctx_meta to contexts

---
 examples/gguf-split/gguf-split.cpp |  18 +-
 llama.cpp                          | 259 +++++++++++++----------------
 2 files changed, 125 insertions(+), 152 deletions(-)

diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp
index e45151ab1..3f582506d 100644
--- a/examples/gguf-split/gguf-split.cpp
+++ b/examples/gguf-split/gguf-split.cpp
@@ -26,9 +26,9 @@ enum split_operation : uint8_t {
     SPLIT_OP_MERGE,
 };
 
-static const char * const LLM_KV_GENERAL_SPLIT_I_SPLIT = "split.no";
-static const char * const LLM_KV_GENERAL_SPLIT_N_SPLIT = "split.count";
-static const char * const LLM_KV_GENERAL_SPLIT_N_TENSORS = "split.tensors.count";
+static const char * const LLM_KV_SPLIT_NO            = "split.no";
+static const char * const LLM_KV_SPLIT_COUNT         = "split.count";
+static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
 
 struct split_params {
     split_operation operation = SPLIT_OP_SPLIT;
@@ -177,9 +177,9 @@ struct split_strategy {
             if (i_split == 0) {
                 gguf_set_kv(ctx_out, ctx_gguf);
             }
-            gguf_set_val_u16(ctx_out, LLM_KV_GENERAL_SPLIT_I_SPLIT, i_split);
-            gguf_set_val_u16(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, n_split);
-            gguf_set_val_i32(ctx_out, LLM_KV_GENERAL_SPLIT_N_TENSORS,n_tensors);
+            gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_NO, i_split);
+            gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_COUNT, n_split);
+            gguf_set_val_i32(ctx_out, LLM_KV_SPLIT_TENSORS_COUNT, n_tensors);
 
             // populate the original tensors, so we get an initial metadata
             for (int i = i_split * params.n_split_tensors; i < n_tensors && i < (i_split + 1) * params.n_split_tensors; ++i) {
@@ -328,12 +328,12 @@ static void gguf_merge(const split_params & split_params) {
         ctx_metas.push_back(ctx_meta);
 
         if (i_split == 0) {
-            auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_GENERAL_SPLIT_N_SPLIT);
+            auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
             if (key_n_split < 0) {
                 fprintf(stderr,
                         "\n%s: input file does not contain %s metadata\n",
                         __func__,
-                        LLM_KV_GENERAL_SPLIT_N_SPLIT);
+                        LLM_KV_SPLIT_COUNT);
                 gguf_free(ctx_gguf);
                 ggml_free(ctx_meta);
                 gguf_free(ctx_out);
@@ -368,7 +368,7 @@ static void gguf_merge(const split_params & split_params) {
             }
 
             // Do not trigger merge if we try to merge again the output
-            gguf_set_val_u16(ctx_gguf, LLM_KV_GENERAL_SPLIT_N_SPLIT, 0);
+            gguf_set_val_u16(ctx_gguf, LLM_KV_SPLIT_COUNT, 0);
 
             // Set metadata from the first split
             gguf_set_kv(ctx_out, ctx_gguf);
diff --git a/llama.cpp b/llama.cpp
index 168ef4ee5..ecfc905f3 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2819,22 +2819,24 @@ struct llama_model_loader {
 
     std::vector<std::unique_ptr<llama_mmap>> mappings;
 
-    // Holds information on a tensor data source location.
-    struct llama_tensor_offset {
-        uint16_t idx; // source file index
-        size_t offs; // tensor data offset in the original file
+    // Holds information on a model weight
+    struct llama_tensor_weights {
+        uint16_t idx; // source file index
+        size_t offs; // tensor data offset in the original file
 
-        llama_tensor_offset(uint16_t idx, const char * name, struct gguf_context * gguf_ctx) : idx(idx) {
+        ggml_tensor * tensor;
+
+        llama_tensor_weights(uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
             const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
             offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
         }
     };
 
-    std::unordered_map<std::string, llama_tensor_offset> tensors_offs; // unified tensor data offset across files
+    std::vector<llama_tensor_weights> weights;
 
     std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;
 
-    struct gguf_context * ctx_gguf = NULL;
-    struct ggml_context * ctx_meta = NULL;
+    struct gguf_context * meta = NULL;
+    std::vector<ggml_context *> contexts;
 
     std::string arch_name;
     LLM_KV      llm_kv    = LLM_KV(LLM_ARCH_UNKNOWN);
 
@@ -2845,128 +2847,91 @@ struct llama_model_loader {
             trace = atoi(getenv("LLAMA_TRACE"));
         }
 
-        struct gguf_init_params params = {
-            /*.no_alloc = */ true,
-            /*.ctx      = */ &ctx_meta,
-        };
-
         if (param_overrides_p != nullptr) {
             for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) {
                 kv_overrides.insert({std::string(p->key), *p});
             }
         }
 
-        ctx_gguf = gguf_init_from_file(fname.c_str(), params);
-        if (!ctx_gguf) {
+        struct ggml_context * ctx = NULL;
+        struct gguf_init_params params = {
+            /*.no_alloc = */ true,
+            /*.ctx      = */ &ctx,
+        };
+
+        meta = gguf_init_from_file(fname.c_str(), params);
+        if (!meta) {
             throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
         }
 
-        files.emplace_back(new llama_file(fname.c_str(), "rb"));
-
         get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
         llm_kv = LLM_KV(llm_arch_from_string(arch_name));
 
         // Save tensors data offset of the main file.
-        // For subsidiary files, gguf_ctx tensor data offset must not be used,
-        // we build a unified tensors offset index.
-        for (ggml_tensor * tensor = ggml_get_first_tensor(ctx_meta); tensor; tensor = ggml_get_next_tensor(ctx_meta, tensor)) {
-            tensors_offs.emplace(tensor->name, llama_tensor_offset(0, tensor->name, ctx_gguf));
+        // For subsidiary files, the `meta` tensor data offsets must not be used,
+        // so we build a unified tensor index from the weights.
+        for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
+            weights.emplace_back(llama_tensor_weights(0, cur->name, meta, cur));
         }
+        files.emplace_back(new llama_file(fname.c_str(), "rb"));
+        contexts.emplace_back(ctx);
 
         uint16_t n_split = 0;
         get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
 
-        // Build virtual GGUF/GGML contexts to represent all tensors across files
+        // Load additional GGML contexts
         if (n_split > 1) {
             uint16_t idx = 0;
             get_key(llm_kv(LLM_KV_SPLIT_NO), idx);
             if (idx != 0) {
                 throw std::runtime_error(format("illegal split file: %d, model must be loaded with the first split", idx));
             }
-            get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);
 
             char split_prefix[PATH_MAX] = {0};
             if (!llama_split_prefix(split_prefix, fname.c_str(), fname.size(), idx, n_split)) {
                 throw std::runtime_error(format("invalid split file: %s", fname.c_str()));
             }
 
-            size_t mem_size = n_tensors*ggml_tensor_overhead();
-            struct ggml_init_params pdata = {
-                /*.mem_size   = */ mem_size,
-                /*.mem_buffer = */ NULL,
-                /*.no_alloc   = */ true,
-            };
-
-            auto * new_ctx_meta = ggml_init(pdata);
-
             if (trace > 0) {
                 LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
             }
 
-            for (; idx < n_split; idx++) {
-                char split_path[PATH_MAX] = {0};
-                struct ggml_context * split_ctx_meta = NULL;
-                struct gguf_context * split_ctx_gguf = NULL;
-                if (idx == 0) {
-                    split_ctx_gguf = ctx_gguf;
-                    split_ctx_meta = ctx_meta;
-                    strcpy(split_path, fname.c_str());
-                } else {
-                    llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
-                    struct gguf_init_params split_params = {
-                        /*.no_alloc = */ true,
-                        /*.ctx      = */ &split_ctx_meta,
-                    };
-                    split_ctx_gguf = gguf_init_from_file(split_path, split_params);
-                    if (!split_ctx_gguf) {
-                        throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, fname.c_str()));
-                    }
+            char split_path[PATH_MAX] = {0};
+            for (idx = 1; idx < n_split; idx++) {
+                llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
+
+                struct gguf_init_params split_params = {
+                    /*.no_alloc = */ true,
+                    /*.ctx      = */ &ctx,
+                };
+                struct gguf_context * ctx_gguf = gguf_init_from_file(split_path, split_params);
+                if (!ctx_gguf) {
+                    throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path));
                 }
 
-                bool ok = true;
-                for (ggml_tensor * tensor = ggml_get_first_tensor(split_ctx_meta); tensor; tensor = ggml_get_next_tensor(split_ctx_meta, tensor)) {
-                    struct ggml_tensor * copy = ggml_new_tensor(new_ctx_meta, tensor->type, ggml_n_dims(tensor), tensor->ne);
-                    ok = ok && copy != NULL;
-
-                    if (!ok) {
-                        break;
-                    }
-
-                    ggml_set_name(copy, tensor->name);
-
-                    // Add the tensor to the main gguf context if not already present
-                    if (idx > 0) {
-                        gguf_add_tensor(ctx_gguf, copy);
-                        tensors_offs.emplace(tensor->name, llama_tensor_offset(idx, tensor->name, split_ctx_gguf));
-                    }
+                // Save the tensor data offset info of the shard.
+                for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
+                    weights.emplace_back(llama_tensor_weights(idx, cur->name, ctx_gguf, cur));
                 }
+                files.emplace_back(new llama_file(split_path, "rb"));
+                contexts.emplace_back(ctx);
 
-                if (!ok) {
-                    throw std::runtime_error(format("%s: failed to read the tensor metadata\n", __func__));
-                }
-
-                if (idx > 0) {
-                    files.emplace_back(new llama_file(split_path, "rb"));
-                    gguf_free(split_ctx_gguf);
-                    ggml_free(split_ctx_meta);
-                }
+                gguf_free(ctx_gguf);
             }
 
-            ggml_free(ctx_meta);
-            ctx_meta = new_ctx_meta;
+            get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);
+            GGML_ASSERT(n_tensors == (int) weights.size());
 
             LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split);
         }
 
-        n_kv      = gguf_get_n_kv(ctx_gguf);
-        n_tensors = gguf_get_n_tensors(ctx_gguf);
+        n_kv      = gguf_get_n_kv(meta);
+        n_tensors = weights.size();
 
-        fver = (enum llama_fver ) gguf_get_version(ctx_gguf);
+        fver = (enum llama_fver ) gguf_get_version(meta);
 
-        for (int i = 0; i < n_tensors; i++) {
-            const char * name = gguf_get_tensor_name(ctx_gguf, i);
-            struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name);
-            n_elements += ggml_nelements(t);
-            n_bytes    += ggml_nbytes(t);
+        for (auto & w : weights) {
+            n_elements += ggml_nelements(w.tensor);
+            n_bytes    += ggml_nbytes(w.tensor);
         }
 
         LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
@@ -2981,7 +2946,8 @@ struct llama_model_loader {
         enum ggml_type type_max = GGML_TYPE_F32;
 
         for (int i = 0; i < n_tensors; i++) {
-            enum ggml_type type = gguf_get_tensor_type(ctx_gguf, i);
+            const ggml_tensor * tensor = weights.at(i).tensor;
+            enum ggml_type type = tensor->type;
 
             n_type[type]++;
 
@@ -2991,8 +2957,7 @@ struct llama_model_loader {
             }
 
             if (trace > 0) {
-                struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
-                LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(meta), ggml_type_name(type), llama_format_tensor_shape(meta).c_str());
+                LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str());
             }
         }
 
@@ -3028,22 +2993,22 @@ struct llama_model_loader {
         ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);
 
         {
-            const int kid = gguf_find_key(ctx_gguf, "general.file_type");
+            const int kid = gguf_find_key(meta, "general.file_type");
             if (kid >= 0) {
-                ftype = (llama_ftype) gguf_get_val_u32(ctx_gguf, kid);
+                ftype = (llama_ftype) gguf_get_val_u32(meta, kid);
             }
         }
 
         LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
 
         for (int i = 0; i < n_kv; i++) {
-            const char * name           = gguf_get_key(ctx_gguf, i);
-            const enum gguf_type type   = gguf_get_kv_type(ctx_gguf, i);
+            const char * name           = gguf_get_key(meta, i);
+            const enum gguf_type type   = gguf_get_kv_type(meta, i);
             const std::string type_name =
                 type == GGUF_TYPE_ARRAY
-                ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx_gguf, i)), gguf_get_arr_n(ctx_gguf, i))
+                ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta, i)), gguf_get_arr_n(meta, i))
format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta, i)), gguf_get_arr_n(meta, i)) : gguf_type_name(type); - std::string value = gguf_kv_to_str(ctx_gguf, i); + std::string value = gguf_kv_to_str(meta, i); const size_t MAX_VALUE_LEN = 40; if (value.size() > MAX_VALUE_LEN) { value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str()); @@ -3072,18 +3037,18 @@ struct llama_model_loader { } ~llama_model_loader() { - if (ctx_gguf) { - gguf_free(ctx_gguf); + if (meta) { + gguf_free(meta); } - if (ctx_meta) { - ggml_free(ctx_meta); + for (auto & ctx : contexts) { + ggml_free(ctx); } } template typename std::enable_if::value, bool>::type get_arr_n(const std::string & key, T & result, const bool required = true) { - const int kid = gguf_find_key(ctx_gguf, key.c_str()); + const int kid = gguf_find_key(meta, key.c_str()); if (kid < 0) { if (required) { @@ -3093,7 +3058,7 @@ struct llama_model_loader { } struct GGUFMeta::ArrayInfo arr_info = - GGUFMeta::GKV::get_kv(ctx_gguf, kid); + GGUFMeta::GKV::get_kv(meta, kid); result = arr_info.length; @@ -3113,7 +3078,7 @@ struct llama_model_loader { const struct llama_model_kv_override * override = it != kv_overrides.end() ? &it->second : nullptr; - const bool found = GGUFMeta::GKV::set(ctx_gguf, key, result, override); + const bool found = GGUFMeta::GKV::set(meta, key, result, override); if (required && !found) { throw std::runtime_error(format("key not found in model: %s", key.c_str())); @@ -3136,20 +3101,29 @@ struct llama_model_loader { } const char * get_tensor_name(int i) const { - return gguf_get_tensor_name(ctx_gguf, i); + return weights.at(i).tensor->name; + } + + const llama_tensor_weights & get_weights(const char * name) const { + for (const auto & weight : weights) { + if (strcmp(name, weight.tensor->name) == 0) { + return weight; + } + } + throw std::runtime_error(format("tensor %s not found", name)); } struct ggml_tensor * get_tensor_meta(const char * name) const { - return ggml_get_tensor(ctx_meta, name); + return get_weights(name).tensor; } struct ggml_tensor * get_tensor_meta(int i) const { return get_tensor_meta(get_tensor_name(i)); } - struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta) { - struct ggml_tensor * tensor = ggml_dup_tensor(ctx, meta); - ggml_set_name(tensor, ggml_get_name(meta)); + struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur) { + struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur); + ggml_set_name(tensor, ggml_get_name(cur)); n_created++; @@ -3157,7 +3131,7 @@ struct llama_model_loader { } struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector & ne, bool required = true) { - struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str()); + const struct ggml_tensor * cur = get_tensor_meta(name.c_str()); if (cur == NULL) { if (!required) { @@ -3207,9 +3181,8 @@ struct llama_model_loader { } // compute the total size of all tensors for progress reporting - for (int i = 0; i < n_tensors; i++) { - struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i)); - size_data += ggml_nbytes(cur); + for (auto & w : weights) { + size_data += ggml_nbytes(w.tensor); } } @@ -3221,28 +3194,28 @@ struct llama_model_loader { *last = 0; *addr = mapping->addr; for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) { - const auto & tensor_off = tensors_offs.at(ggml_get_name(tensor)); - 
-            *first = std::min(*first, tensor_off.offs);
-            *last  = std::max(*last,  tensor_off.offs + ggml_nbytes(tensor));
+            const auto & w = get_weights(ggml_get_name(tensor));
+            *first = std::min(*first, w.offs);
+            *last  = std::max(*last,  w.offs + ggml_nbytes(tensor));
         }
     }
 
     // for backwards compatibility, does not support ggml-backend
     void load_data_for(struct ggml_tensor * cur) const {
-        const auto & t_offs = tensors_offs.at(ggml_get_name(cur));
+        const auto & w = get_weights(ggml_get_name(cur));
 
-        if (use_mmap && t_offs.idx < mappings.size()) {
-            const auto & mapping = mappings.at(t_offs.idx);
+        if (use_mmap && w.idx < mappings.size()) {
+            const auto & mapping = mappings.at(w.idx);
             if (cur->data == nullptr) {
-                cur->data = (uint8_t *)mapping->addr + t_offs.offs;
+                cur->data = (uint8_t *)mapping->addr + w.offs;
             } else {
-                memcpy(cur->data, (uint8_t *)mapping->addr + t_offs.offs, ggml_nbytes(cur));
+                memcpy(cur->data, (uint8_t *)mapping->addr + w.offs, ggml_nbytes(cur));
            }
         } else {
             GGML_ASSERT(cur->data != nullptr);
-            GGML_ASSERT(t_offs.idx < files.size());
-            const auto & file = files.at(t_offs.idx);
-            file->seek(t_offs.offs, SEEK_SET);
+            GGML_ASSERT(w.idx < files.size());
+            const auto & file = files.at(w.idx);
+            file->seek(w.offs, SEEK_SET);
             file->read_raw(cur->data, ggml_nbytes(cur));
         }
     }
@@ -3263,39 +3236,39 @@ struct llama_model_loader {
             }
         }
 
-        const auto & t_offs = tensors_offs.at(ggml_get_name(cur));
+        const auto & w = get_weights(ggml_get_name(cur));
 
         size_t n_size = ggml_nbytes(cur);
 
-        if (use_mmap && t_offs.idx < mappings.size()) {
-            const auto & mapping = mappings.at(t_offs.idx);
+        if (use_mmap && w.idx < mappings.size()) {
+            const auto & mapping = mappings.at(w.idx);
             ggml_backend_buffer_t buf_mmap = nullptr;
             if (bufs_mmap.size() > 1) {
-                buf_mmap = bufs_mmap[t_offs.idx];
+                buf_mmap = bufs_mmap[w.idx];
             } else if (!bufs_mmap.empty()) {
                 buf_mmap = bufs_mmap.front();
             }
             if (buf_mmap && cur->data == nullptr) {
-                ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *)mapping->addr + t_offs.offs);
+                ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *)mapping->addr + w.offs);
                 if (lmlocks) {
-                    const auto & lmlock = lmlocks->at(t_offs.idx);
-                    lmlock->grow_to(t_offs.offs + ggml_nbytes(cur));
+                    const auto & lmlock = lmlocks->at(w.idx);
+                    lmlock->grow_to(w.offs + ggml_nbytes(cur));
                }
-                auto & mmap_used = mmaps_used[t_offs.idx];
-                mmap_used.first  = std::min(mmap_used.first,  t_offs.offs);
-                mmap_used.second = std::max(mmap_used.second, t_offs.offs + n_size);
+                auto & mmap_used = mmaps_used[w.idx];
+                mmap_used.first  = std::min(mmap_used.first,  w.offs);
+                mmap_used.second = std::max(mmap_used.second, w.offs + n_size);
             } else {
-                ggml_backend_tensor_set(cur, (uint8_t *)mapping->addr + t_offs.offs, 0, n_size);
+                ggml_backend_tensor_set(cur, (uint8_t *)mapping->addr + w.offs, 0, n_size);
             }
         } else {
-            GGML_ASSERT(t_offs.idx < files.size());
-            const auto & file = files.at(t_offs.idx);
+            GGML_ASSERT(w.idx < files.size());
+            const auto & file = files.at(w.idx);
             if (ggml_backend_buffer_is_host(cur->buffer)) {
-                file->seek(t_offs.offs, SEEK_SET);
+                file->seek(w.offs, SEEK_SET);
                 file->read_raw(cur->data, ggml_nbytes(cur));
             } else {
                 read_buf.resize(ggml_nbytes(cur));
-                file->seek(t_offs.offs, SEEK_SET);
+                file->seek(w.offs, SEEK_SET);
                 file->read_raw(read_buf.data(), ggml_nbytes(cur));
                 ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
             }
@@ -3447,7 +3420,7 @@ static void llm_load_hparams(
         llama_model_loader & ml,
         llama_model & model) {
     auto & hparams = model.hparams;
-    const gguf_context * ctx = ml.ctx_gguf;
+    const gguf_context * ctx = ml.meta;
 
     // get metadata as string
     for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
@@ -3837,7 +3810,7 @@ static void llm_load_vocab(
         llama_model & model) {
     auto & vocab = model.vocab;
 
-    struct gguf_context * ctx = ml.ctx_gguf;
+    struct gguf_context * ctx = ml.meta;
 
     const auto kv = LLM_KV(model.arch);
 
@@ -4447,7 +4420,7 @@ static bool llm_load_tensors(
                         layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
                         layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd});
 
-                        if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) {
+                        if (ml.get_tensor_meta(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str())) {
                            layer.attn_norm_2   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd});
                            layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd});
                         }
@@ -12480,12 +12453,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     struct gguf_context * ctx_out = gguf_init_empty();
 
     // copy the KV pairs from the input file
-    gguf_set_kv     (ctx_out, ml.ctx_gguf);
+    gguf_set_kv     (ctx_out, ml.meta);
     gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
     gguf_set_val_u32(ctx_out, "general.file_type", ftype);
 
     for (int i = 0; i < ml.n_tensors; ++i) {
-        struct ggml_tensor * meta = ml.get_tensor_meta(i);
+        const struct ggml_tensor * meta = ml.get_tensor_meta(i);
 
         const std::string name = ggml_get_name(meta);
 
@@ -12525,7 +12498,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     // populate the original tensors so we get an initial meta data
     for (int i = 0; i < ml.n_tensors; ++i) {
-        struct ggml_tensor * meta = ml.get_tensor_meta(i);
+        const struct ggml_tensor * meta = ml.get_tensor_meta(i);
         gguf_add_tensor(ctx_out, meta);
     }
 
@@ -12851,7 +12824,7 @@ static int llama_apply_lora_from_file_internal(
         ggml_tensor * base_t;
         if (ml) {
-            if (gguf_find_tensor(ml->ctx_gguf, base_name.c_str()) < 0) {
+            if (!ml->get_tensor_meta(base_name.c_str())) {
                 LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
                 return 1;
            }
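
Note for readers following the refactor above: the patch replaces the
per-file tensor offset map (`tensors_offs`) with one flat `weights` vector.
Each entry records which split file a tensor lives in, the byte offset of
its data inside that file, and a pointer to its ggml metadata, so every
later stage (size accounting, mmap range computation, data loading) works
from a single index no matter how many GGUF shards the model uses. The
standalone C++ sketch below illustrates that layout only; the names
`tensor_stub`, `weight_entry`, and `weight_index` are invented stand-ins
for `ggml_tensor`, `llama_tensor_weights`, and the loader's `weights`
member, and the offsets are hard-coded instead of being computed from
gguf_get_data_offset() + gguf_get_tensor_offset() as the real constructor
does.

    // weights_index_sketch.cpp -- illustrative only, not llama.cpp code
    #include <cstdint>
    #include <cstdio>
    #include <stdexcept>
    #include <string>
    #include <vector>

    // Hypothetical stand-in for the ggml_tensor metadata the loader walks.
    struct tensor_stub {
        std::string name;
        size_t      nbytes;
    };

    // Plays the role of llama_tensor_weights: one entry per tensor,
    // remembering its source file and where its data starts there.
    struct weight_entry {
        uint16_t            idx;    // source file index (0 = first split)
        size_t              offs;   // tensor data offset inside that file
        const tensor_stub * tensor; // borrowed metadata pointer
    };

    struct weight_index {
        std::vector<weight_entry> entries;

        // Called once per tensor of each split, in file order.
        void add(uint16_t file_idx, size_t offs, const tensor_stub & t) {
            entries.push_back({file_idx, offs, &t});
        }

        // Linear lookup by name, like llama_model_loader::get_weights().
        const weight_entry & get(const std::string & name) const {
            for (const auto & w : entries) {
                if (w.tensor->name == name) {
                    return w;
                }
            }
            throw std::runtime_error("tensor " + name + " not found");
        }
    };

    int main() {
        // Two shards: tensor "a" lives in file 0, tensor "b" in file 1.
        tensor_stub a = {"a", 1024};
        tensor_stub b = {"b", 2048};

        weight_index index;
        index.add(0, 4096, a);
        index.add(1,  512, b);

        // Loading "b" means: seek to offset 512 in files[1], read 2048 bytes.
        const weight_entry & w = index.get("b");
        std::printf("tensor %s: file %u, offset %zu, %zu bytes\n",
                    w.tensor->name.c_str(), (unsigned) w.idx, w.offs,
                    w.tensor->nbytes);
        return 0;
    }

The linear lookup is a deliberate simplification: it runs only at model
load time, and appending entries split by split keeps the vector in file
order, mirroring the order in which the loader scans the shards.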