From 1892ae7eb1844f6704c0dd2ec0a4fe9508b77eb1 Mon Sep 17 00:00:00 2001
From: Pierrick HYMBERT
Date: Thu, 21 Mar 2024 19:11:37 +0100
Subject: [PATCH] llama_model_loader: PR feedback:
 - use only one gguf_context, for metadata only
 - store all ggml_context handles in a vector, as is done for the files and
   mappings
 - store all weights in a vector, each with its source tensor
 - rename ctx_gguf to meta
 - rename ctx_meta to contexts

---
 examples/gguf-split/gguf-split.cpp |  18 +-
 llama.cpp                          | 259 +++++++++++++----------------
 2 files changed, 125 insertions(+), 152 deletions(-)

diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp
index e45151ab1..3f582506d 100644
--- a/examples/gguf-split/gguf-split.cpp
+++ b/examples/gguf-split/gguf-split.cpp
@@ -26,9 +26,9 @@ enum split_operation : uint8_t {
     SPLIT_OP_MERGE,
 };
 
-static const char * const LLM_KV_GENERAL_SPLIT_I_SPLIT = "split.no";
-static const char * const LLM_KV_GENERAL_SPLIT_N_SPLIT = "split.count";
-static const char * const LLM_KV_GENERAL_SPLIT_N_TENSORS = "split.tensors.count";
+static const char * const LLM_KV_SPLIT_NO            = "split.no";
+static const char * const LLM_KV_SPLIT_COUNT         = "split.count";
+static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
 
 struct split_params {
     split_operation operation = SPLIT_OP_SPLIT;
@@ -177,9 +177,9 @@ struct split_strategy {
             if (i_split == 0) {
                 gguf_set_kv(ctx_out, ctx_gguf);
             }
-            gguf_set_val_u16(ctx_out, LLM_KV_GENERAL_SPLIT_I_SPLIT, i_split);
-            gguf_set_val_u16(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, n_split);
-            gguf_set_val_i32(ctx_out, LLM_KV_GENERAL_SPLIT_N_TENSORS,n_tensors);
+            gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_NO, i_split);
+            gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_COUNT, n_split);
+            gguf_set_val_i32(ctx_out, LLM_KV_SPLIT_TENSORS_COUNT, n_tensors);
 
             // populate the original tensors, so we get an initial metadata
             for (int i = i_split * params.n_split_tensors; i < n_tensors && i < (i_split + 1) * params.n_split_tensors; ++i) {
@@ -328,12 +328,12 @@ static void gguf_merge(const split_params & split_params) {
         ctx_metas.push_back(ctx_meta);
 
         if (i_split == 0) {
-            auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_GENERAL_SPLIT_N_SPLIT);
+            auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
             if (key_n_split < 0) {
                 fprintf(stderr,
                         "\n%s: input file does not contain %s metadata\n",
                         __func__,
-                        LLM_KV_GENERAL_SPLIT_N_SPLIT);
+                        LLM_KV_SPLIT_COUNT);
                 gguf_free(ctx_gguf);
                 ggml_free(ctx_meta);
                 gguf_free(ctx_out);
@@ -368,7 +368,7 @@ static void gguf_merge(const split_params & split_params) {
             }
 
             // Do not trigger merge if we try to merge again the output
-            gguf_set_val_u16(ctx_gguf, LLM_KV_GENERAL_SPLIT_N_SPLIT, 0);
+            gguf_set_val_u16(ctx_gguf, LLM_KV_SPLIT_COUNT, 0);
 
             // Set metadata from the first split
             gguf_set_kv(ctx_out, ctx_gguf);
diff --git a/llama.cpp b/llama.cpp
index 168ef4ee5..ecfc905f3 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2819,22 +2819,24 @@ struct llama_model_loader {
 
     std::vector<std::unique_ptr<llama_mmap>> mappings;
 
-    // Holds information on a tensor data source location.
-    struct llama_tensor_offset {
-        uint16_t idx; // source file index
-        size_t offs; // tensor data offset in the original file
+    // Holds information on a model weight
+    struct llama_tensor_weights {
+        uint16_t idx; // source file index
+        size_t offs; // tensor data offset in the original file
 
-        llama_tensor_offset(uint16_t idx, const char * name, struct gguf_context * gguf_ctx) : idx(idx) {
+        ggml_tensor * tensor;
+
+        llama_tensor_weights(uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
             const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
             offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
         }
     };
 
-    std::unordered_map<std::string, llama_tensor_offset> tensors_offs; // unified tensor data offset across files
+    std::vector<llama_tensor_weights> weights;
 
     std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;
 
-    struct gguf_context * ctx_gguf = NULL;
-    struct ggml_context * ctx_meta = NULL;
+    struct gguf_context * meta = NULL;
+    std::vector<ggml_context *> contexts;
 
     std::string arch_name;
     LLM_KV      llm_kv    = LLM_KV(LLM_ARCH_UNKNOWN);
 
@@ -2845,128 +2847,91 @@ struct llama_model_loader {
             trace = atoi(getenv("LLAMA_TRACE"));
         }
 
-        struct gguf_init_params params = {
-            /*.no_alloc = */ true,
-            /*.ctx      = */ &ctx_meta,
-        };
-
         if (param_overrides_p != nullptr) {
             for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) {
                 kv_overrides.insert({std::string(p->key), *p});
             }
         }
 
-        ctx_gguf = gguf_init_from_file(fname.c_str(), params);
-        if (!ctx_gguf) {
+        struct ggml_context * ctx = NULL;
+        struct gguf_init_params params = {
+            /*.no_alloc = */ true,
+            /*.ctx      = */ &ctx,
+        };
+
+        meta = gguf_init_from_file(fname.c_str(), params);
+        if (!meta) {
             throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
         }
 
-        files.emplace_back(new llama_file(fname.c_str(), "rb"));
-
         get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
         llm_kv = LLM_KV(llm_arch_from_string(arch_name));
 
         // Save tensors data offset of the main file.
-        // For subsidiary files, gguf_ctx tensor data offset must not be used,
-        // we build a unified tensors offset index.
-        for (ggml_tensor * tensor = ggml_get_first_tensor(ctx_meta); tensor; tensor = ggml_get_next_tensor(ctx_meta, tensor)) {
-            tensors_offs.emplace(tensor->name, llama_tensor_offset(0, tensor->name, ctx_gguf));
+        // For subsidiary files, the `meta` tensor data offsets must not be used,
+        // so we build a unified tensor index from the weights.
+        for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
+            weights.emplace_back(llama_tensor_weights(0, cur->name, meta, cur));
         }
+        files.emplace_back(new llama_file(fname.c_str(), "rb"));
+        contexts.emplace_back(ctx);
 
         uint16_t n_split = 0;
         get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
 
-        // Build virtual GGUF/GGML contexts to represent all tensors across files
+        // Load additional GGML contexts
         if (n_split > 1) {
             uint16_t idx = 0;
             get_key(llm_kv(LLM_KV_SPLIT_NO), idx);
             if (idx != 0) {
                 throw std::runtime_error(format("illegal split file: %d, model must be loaded with the first split", idx));
             }
-            get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);
 
             char split_prefix[PATH_MAX] = {0};
             if (!llama_split_prefix(split_prefix, fname.c_str(), fname.size(), idx, n_split)) {
                 throw std::runtime_error(format("invalid split file: %s", fname.c_str()));
             }
 
-            size_t mem_size = n_tensors*ggml_tensor_overhead();
-            struct ggml_init_params pdata = {
-                /*.mem_size   = */ mem_size,
-                /*.mem_buffer = */ NULL,
-                /*.no_alloc   = */ true,
-            };
-
-            auto * new_ctx_meta = ggml_init(pdata);
-
             if (trace > 0) {
                 LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
             }
 
-            for (; idx < n_split; idx++) {
-                char split_path[PATH_MAX] = {0};
-                struct ggml_context * split_ctx_meta = NULL;
-                struct gguf_context * split_ctx_gguf = NULL;
-                if (idx == 0) {
-                    split_ctx_gguf = ctx_gguf;
-                    split_ctx_meta = ctx_meta;
-                    strcpy(split_path, fname.c_str());
-                } else {
-                    llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
-                    struct gguf_init_params split_params = {
-                        /*.no_alloc = */ true,
-                        /*.ctx      = */ &split_ctx_meta,
-                    };
-                    split_ctx_gguf = gguf_init_from_file(split_path, split_params);
-                    if (!split_ctx_gguf) {
-                        throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, fname.c_str()));
-                    }
+            char split_path[PATH_MAX] = {0};
+            for (idx = 1; idx < n_split; idx++) {
+                llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
+
+                struct gguf_init_params split_params = {
+                    /*.no_alloc = */ true,
+                    /*.ctx      = */ &ctx,
+                };
+                struct gguf_context * ctx_gguf = gguf_init_from_file(split_path, split_params);
+                if (!ctx_gguf) {
+                    throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path));
                 }
 
-                bool ok = true;
-                for (ggml_tensor * tensor = ggml_get_first_tensor(split_ctx_meta); tensor; tensor = ggml_get_next_tensor(split_ctx_meta, tensor)) {
-                    struct ggml_tensor * copy = ggml_new_tensor(new_ctx_meta, tensor->type, ggml_n_dims(tensor), tensor->ne);
-                    ok = ok && copy != NULL;
-
-                    if (!ok) {
-                        break;
-                    }
-
-                    ggml_set_name(copy, tensor->name);
-
-                    // Add the tensor to the main gguf context if not already present
-                    if (idx > 0) {
-                        gguf_add_tensor(ctx_gguf, copy);
-                        tensors_offs.emplace(tensor->name, llama_tensor_offset(idx, tensor->name, split_ctx_gguf));
-                    }
+                // Save the tensor data offset info of the shard.
+                for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
+                    weights.emplace_back(llama_tensor_weights(idx, cur->name, ctx_gguf, cur));
                 }
+                files.emplace_back(new llama_file(split_path, "rb"));
+                contexts.emplace_back(ctx);
 
-                if (!ok) {
-                    throw std::runtime_error(format("%s: failed to read the tensor metadata\n", __func__));
-                }
-
-                if (idx > 0) {
-                    files.emplace_back(new llama_file(split_path, "rb"));
-                    gguf_free(split_ctx_gguf);
-                    ggml_free(split_ctx_meta);
-                }
+                gguf_free(ctx_gguf);
             }
 
-            ggml_free(ctx_meta);
-            ctx_meta = new_ctx_meta;
+            get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);
+            GGML_ASSERT(n_tensors == (int) weights.size());
 
             LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split);
         }
 
-        n_kv      = gguf_get_n_kv(ctx_gguf);
-        n_tensors = gguf_get_n_tensors(ctx_gguf);
+        n_kv      = gguf_get_n_kv(meta);
+        n_tensors = weights.size();
 
-        fver = (enum llama_fver ) gguf_get_version(ctx_gguf);
+        fver = (enum llama_fver ) gguf_get_version(meta);
 
-        for (int i = 0; i < n_tensors; i++) {
-            const char * name = gguf_get_tensor_name(ctx_gguf, i);
-            struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name);
-            n_elements += ggml_nelements(t);
-            n_bytes    += ggml_nbytes(t);
+        for (auto & w : weights) {
+            n_elements += ggml_nelements(w.tensor);
+            n_bytes    += ggml_nbytes(w.tensor);
         }
 
         LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
@@ -2981,7 +2946,8 @@ struct llama_model_loader {
         enum ggml_type type_max = GGML_TYPE_F32;
 
         for (int i = 0; i < n_tensors; i++) {
-            enum ggml_type type = gguf_get_tensor_type(ctx_gguf, i);
+            const ggml_tensor * tensor = weights.at(i).tensor;
+            enum ggml_type type = tensor->type;
 
             n_type[type]++;
 
@@ -2991,8 +2957,7 @@ struct llama_model_loader {
             }
 
             if (trace > 0) {
-                struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
-                LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(meta), ggml_type_name(type), llama_format_tensor_shape(meta).c_str());
+                LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str());
             }
         }
 
@@ -3028,22 +2993,22 @@ struct llama_model_loader {
         ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);
 
         {
-            const int kid = gguf_find_key(ctx_gguf, "general.file_type");
+            const int kid = gguf_find_key(meta, "general.file_type");
             if (kid >= 0) {
-                ftype = (llama_ftype) gguf_get_val_u32(ctx_gguf, kid);
+                ftype = (llama_ftype) gguf_get_val_u32(meta, kid);
             }
         }
 
         LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
 
         for (int i = 0; i < n_kv; i++) {
-            const char * name           = gguf_get_key(ctx_gguf, i);
-            const enum gguf_type type   = gguf_get_kv_type(ctx_gguf, i);
+            const char * name           = gguf_get_key(meta, i);
+            const enum gguf_type type   = gguf_get_kv_type(meta, i);
             const std::string type_name =
                 type == GGUF_TYPE_ARRAY
-                ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx_gguf, i)), gguf_get_arr_n(ctx_gguf, i))
+                ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta, i)), gguf_get_arr_n(meta, i))
format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta, i)), gguf_get_arr_n(meta, i)) : gguf_type_name(type); - std::string value = gguf_kv_to_str(ctx_gguf, i); + std::string value = gguf_kv_to_str(meta, i); const size_t MAX_VALUE_LEN = 40; if (value.size() > MAX_VALUE_LEN) { value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str()); @@ -3072,18 +3037,18 @@ struct llama_model_loader { } ~llama_model_loader() { - if (ctx_gguf) { - gguf_free(ctx_gguf); + if (meta) { + gguf_free(meta); } - if (ctx_meta) { - ggml_free(ctx_meta); + for (auto & ctx : contexts) { + ggml_free(ctx); } } template typename std::enable_if::value, bool>::type get_arr_n(const std::string & key, T & result, const bool required = true) { - const int kid = gguf_find_key(ctx_gguf, key.c_str()); + const int kid = gguf_find_key(meta, key.c_str()); if (kid < 0) { if (required) { @@ -3093,7 +3058,7 @@ struct llama_model_loader { } struct GGUFMeta::ArrayInfo arr_info = - GGUFMeta::GKV::get_kv(ctx_gguf, kid); + GGUFMeta::GKV::get_kv(meta, kid); result = arr_info.length; @@ -3113,7 +3078,7 @@ struct llama_model_loader { const struct llama_model_kv_override * override = it != kv_overrides.end() ? &it->second : nullptr; - const bool found = GGUFMeta::GKV::set(ctx_gguf, key, result, override); + const bool found = GGUFMeta::GKV::set(meta, key, result, override); if (required && !found) { throw std::runtime_error(format("key not found in model: %s", key.c_str())); @@ -3136,20 +3101,29 @@ struct llama_model_loader { } const char * get_tensor_name(int i) const { - return gguf_get_tensor_name(ctx_gguf, i); + return weights.at(i).tensor->name; + } + + const llama_tensor_weights & get_weights(const char * name) const { + for (const auto & weight : weights) { + if (strcmp(name, weight.tensor->name) == 0) { + return weight; + } + } + throw std::runtime_error(format("tensor %s not found", name)); } struct ggml_tensor * get_tensor_meta(const char * name) const { - return ggml_get_tensor(ctx_meta, name); + return get_weights(name).tensor; } struct ggml_tensor * get_tensor_meta(int i) const { return get_tensor_meta(get_tensor_name(i)); } - struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta) { - struct ggml_tensor * tensor = ggml_dup_tensor(ctx, meta); - ggml_set_name(tensor, ggml_get_name(meta)); + struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur) { + struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur); + ggml_set_name(tensor, ggml_get_name(cur)); n_created++; @@ -3157,7 +3131,7 @@ struct llama_model_loader { } struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector & ne, bool required = true) { - struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str()); + const struct ggml_tensor * cur = get_tensor_meta(name.c_str()); if (cur == NULL) { if (!required) { @@ -3207,9 +3181,8 @@ struct llama_model_loader { } // compute the total size of all tensors for progress reporting - for (int i = 0; i < n_tensors; i++) { - struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i)); - size_data += ggml_nbytes(cur); + for (auto & w : weights) { + size_data += ggml_nbytes(w.tensor); } } @@ -3221,28 +3194,28 @@ struct llama_model_loader { *last = 0; *addr = mapping->addr; for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) { - const auto & tensor_off = tensors_offs.at(ggml_get_name(tensor)); - 
-            *first = std::min(*first, tensor_off.offs);
-            *last  = std::max(*last,  tensor_off.offs + ggml_nbytes(tensor));
+            const auto & w = get_weights(ggml_get_name(tensor));
+            *first = std::min(*first, w.offs);
+            *last  = std::max(*last,  w.offs + ggml_nbytes(tensor));
         }
     }
 
     // for backwards compatibility, does not support ggml-backend
     void load_data_for(struct ggml_tensor * cur) const {
-        const auto & t_offs = tensors_offs.at(ggml_get_name(cur));
+        const auto & w = get_weights(ggml_get_name(cur));
 
-        if (use_mmap && t_offs.idx < mappings.size()) {
-            const auto & mapping = mappings.at(t_offs.idx);
+        if (use_mmap && w.idx < mappings.size()) {
+            const auto & mapping = mappings.at(w.idx);
             if (cur->data == nullptr) {
-                cur->data = (uint8_t *)mapping->addr + t_offs.offs;
+                cur->data = (uint8_t *)mapping->addr + w.offs;
             } else {
-                memcpy(cur->data, (uint8_t *)mapping->addr + t_offs.offs, ggml_nbytes(cur));
+                memcpy(cur->data, (uint8_t *)mapping->addr + w.offs, ggml_nbytes(cur));
            }
         } else {
             GGML_ASSERT(cur->data != nullptr);
-            GGML_ASSERT(t_offs.idx < files.size());
-            const auto & file = files.at(t_offs.idx);
-            file->seek(t_offs.offs, SEEK_SET);
+            GGML_ASSERT(w.idx < files.size());
+            const auto & file = files.at(w.idx);
+            file->seek(w.offs, SEEK_SET);
             file->read_raw(cur->data, ggml_nbytes(cur));
         }
     }
@@ -3263,39 +3236,39 @@ struct llama_model_loader {
             }
         }
 
-        const auto & t_offs = tensors_offs.at(ggml_get_name(cur));
+        const auto & w = get_weights(ggml_get_name(cur));
 
         size_t n_size = ggml_nbytes(cur);
 
-        if (use_mmap && t_offs.idx < mappings.size()) {
-            const auto & mapping = mappings.at(t_offs.idx);
+        if (use_mmap && w.idx < mappings.size()) {
+            const auto & mapping = mappings.at(w.idx);
             ggml_backend_buffer_t buf_mmap = nullptr;
             if (bufs_mmap.size() > 1) {
-                buf_mmap = bufs_mmap[t_offs.idx];
+                buf_mmap = bufs_mmap[w.idx];
             } else if (!bufs_mmap.empty()) {
                 buf_mmap = bufs_mmap.front();
             }
             if (buf_mmap && cur->data == nullptr) {
-                ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *)mapping->addr + t_offs.offs);
+                ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *)mapping->addr + w.offs);
                 if (lmlocks) {
-                    const auto & lmlock = lmlocks->at(t_offs.idx);
-                    lmlock->grow_to(t_offs.offs + ggml_nbytes(cur));
+                    const auto & lmlock = lmlocks->at(w.idx);
+                    lmlock->grow_to(w.offs + ggml_nbytes(cur));
                }
-                auto & mmap_used = mmaps_used[t_offs.idx];
-                mmap_used.first  = std::min(mmap_used.first,  t_offs.offs);
-                mmap_used.second = std::max(mmap_used.second, t_offs.offs + n_size);
+                auto & mmap_used = mmaps_used[w.idx];
+                mmap_used.first  = std::min(mmap_used.first,  w.offs);
+                mmap_used.second = std::max(mmap_used.second, w.offs + n_size);
             } else {
-                ggml_backend_tensor_set(cur, (uint8_t *)mapping->addr + t_offs.offs, 0, n_size);
+                ggml_backend_tensor_set(cur, (uint8_t *)mapping->addr + w.offs, 0, n_size);
             }
         } else {
-            GGML_ASSERT(t_offs.idx < files.size());
-            const auto & file = files.at(t_offs.idx);
+            GGML_ASSERT(w.idx < files.size());
+            const auto & file = files.at(w.idx);
             if (ggml_backend_buffer_is_host(cur->buffer)) {
-                file->seek(t_offs.offs, SEEK_SET);
+                file->seek(w.offs, SEEK_SET);
                 file->read_raw(cur->data, ggml_nbytes(cur));
             } else {
                 read_buf.resize(ggml_nbytes(cur));
-                file->seek(t_offs.offs, SEEK_SET);
+                file->seek(w.offs, SEEK_SET);
                 file->read_raw(read_buf.data(), ggml_nbytes(cur));
                 ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
             }
@@ -3447,7 +3420,7 @@ static void llm_load_hparams(
         llama_model_loader & ml,
         llama_model & model) {
     auto & hparams = model.hparams;
-    const gguf_context * ctx = ml.ctx_gguf;
+    const gguf_context * ctx = ml.meta;
 
     // get metadata as string
     for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
@@ -3837,7 +3810,7 @@ static void llm_load_vocab(
         llama_model & model) {
     auto & vocab = model.vocab;
 
-    struct gguf_context * ctx = ml.ctx_gguf;
+    struct gguf_context * ctx = ml.meta;
 
     const auto kv = LLM_KV(model.arch);
 
@@ -4447,7 +4420,7 @@ static bool llm_load_tensors(
                         layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
                         layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd});
 
-                        if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) {
+                        if (ml.get_tensor_meta(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str())) {
                            layer.attn_norm_2   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd});
                            layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd});
                         }
@@ -12480,12 +12453,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     struct gguf_context * ctx_out = gguf_init_empty();
 
     // copy the KV pairs from the input file
-    gguf_set_kv     (ctx_out, ml.ctx_gguf);
+    gguf_set_kv     (ctx_out, ml.meta);
     gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
     gguf_set_val_u32(ctx_out, "general.file_type", ftype);
 
     for (int i = 0; i < ml.n_tensors; ++i) {
-        struct ggml_tensor * meta = ml.get_tensor_meta(i);
+        const struct ggml_tensor * meta = ml.get_tensor_meta(i);
 
         const std::string name = ggml_get_name(meta);
 
@@ -12525,7 +12498,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     // populate the original tensors so we get an initial meta data
     for (int i = 0; i < ml.n_tensors; ++i) {
-        struct ggml_tensor * meta = ml.get_tensor_meta(i);
+        const struct ggml_tensor * meta = ml.get_tensor_meta(i);
         gguf_add_tensor(ctx_out, meta);
     }
 
@@ -12851,7 +12824,7 @@ static int llama_apply_lora_from_file_internal(
         ggml_tensor * base_t;
         if (ml) {
-            if (gguf_find_tensor(ml->ctx_gguf, base_name.c_str()) < 0) {
+            if (!ml->get_tensor_meta(base_name.c_str())) {
                 LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
                 return 1;
            }
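
Note for readers following the refactor above: the patch replaces the
per-file tensor offset map (`tensors_offs`) with one flat `weights` vector.
Each entry records which split file a tensor lives in, the byte offset of
its data inside that file, and a pointer to its ggml metadata, so every
later stage (size accounting, mmap range computation, data loading) works
from a single index no matter how many GGUF shards the model uses. The
standalone C++ sketch below illustrates that layout only; the names
`tensor_stub`, `weight_entry`, and `weight_index` are invented stand-ins
for `ggml_tensor`, `llama_tensor_weights`, and the loader's `weights`
member, and the offsets are hard-coded instead of being computed from
gguf_get_data_offset() + gguf_get_tensor_offset() as the real constructor
does.

    // weights_index_sketch.cpp -- illustrative only, not llama.cpp code
    #include <cstdint>
    #include <cstdio>
    #include <stdexcept>
    #include <string>
    #include <vector>

    // Hypothetical stand-in for the ggml_tensor metadata the loader walks.
    struct tensor_stub {
        std::string name;
        size_t      nbytes;
    };

    // Plays the role of llama_tensor_weights: one entry per tensor,
    // remembering its source file and where its data starts there.
    struct weight_entry {
        uint16_t            idx;    // source file index (0 = first split)
        size_t              offs;   // tensor data offset inside that file
        const tensor_stub * tensor; // borrowed metadata pointer
    };

    struct weight_index {
        std::vector<weight_entry> entries;

        // Called once per tensor of each split, in file order.
        void add(uint16_t file_idx, size_t offs, const tensor_stub & t) {
            entries.push_back({file_idx, offs, &t});
        }

        // Linear lookup by name, like llama_model_loader::get_weights().
        const weight_entry & get(const std::string & name) const {
            for (const auto & w : entries) {
                if (w.tensor->name == name) {
                    return w;
                }
            }
            throw std::runtime_error("tensor " + name + " not found");
        }
    };

    int main() {
        // Two shards: tensor "a" lives in file 0, tensor "b" in file 1.
        tensor_stub a = {"a", 1024};
        tensor_stub b = {"b", 2048};

        weight_index index;
        index.add(0, 4096, a);
        index.add(1,  512, b);

        // Loading "b" means: seek to offset 512 in files[1], read 2048 bytes.
        const weight_entry & w = index.get("b");
        std::printf("tensor %s: file %u, offset %zu, %zu bytes\n",
                    w.tensor->name.c_str(), (unsigned) w.idx, w.offs,
                    w.tensor->nbytes);
        return 0;
    }

The linear lookup is a deliberate simplification: it runs only at model
load time, and appending entries split by split keeps the vector in file
order, mirroring the order in which the loader scans the shards.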