llama_model_loader: PR feedback:

- use a single gguf_context, for metadata only
- store all ggml_context in a vector, as with the files and mappings
- store all weights in a vector along with their source tensor
- rename ctx_gguf to meta
- rename ctx_meta to contexts
Pierrick HYMBERT 2024-03-21 19:11:37 +01:00
parent 60a87ae051
commit 1892ae7eb1
2 changed files with 125 additions and 152 deletions
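
In effect, the change collapses the loader's per-file state into parallel, file-indexed collections. A minimal sketch of the resulting layout, condensed from the llama.cpp hunks below (surrounding members elided):

struct llama_model_loader {
    // single gguf_context holding the merged KV metadata (was ctx_gguf)
    struct gguf_context * meta = NULL;

    // one entry per input file, kept in step with files and mappings (was ctx_meta)
    std::vector<ggml_context *>              contexts;
    std::vector<std::unique_ptr<llama_file>> files;
    std::vector<std::unique_ptr<llama_mmap>> mappings;

    // every tensor across all files, with its source file index and absolute offset
    std::vector<llama_tensor_weights> weights;
    // ...
};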


@@ -26,9 +26,9 @@ enum split_operation : uint8_t {
SPLIT_OP_MERGE,
};
static const char * const LLM_KV_GENERAL_SPLIT_I_SPLIT = "split.no";
static const char * const LLM_KV_GENERAL_SPLIT_N_SPLIT = "split.count";
static const char * const LLM_KV_GENERAL_SPLIT_N_TENSORS = "split.tensors.count";
static const char * const LLM_KV_SPLIT_NO = "split.no";
static const char * const LLM_KV_SPLIT_COUNT = "split.count";
static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
struct split_params {
split_operation operation = SPLIT_OP_SPLIT;
@@ -177,9 +177,9 @@ struct split_strategy {
if (i_split == 0) {
gguf_set_kv(ctx_out, ctx_gguf);
}
gguf_set_val_u16(ctx_out, LLM_KV_GENERAL_SPLIT_I_SPLIT, i_split);
gguf_set_val_u16(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, n_split);
gguf_set_val_i32(ctx_out, LLM_KV_GENERAL_SPLIT_N_TENSORS,n_tensors);
gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_NO, i_split);
gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_COUNT, n_split);
gguf_set_val_i32(ctx_out, LLM_KV_SPLIT_TENSORS_COUNT, n_tensors);
// populate the original tensors, so we get an initial metadata
for (int i = i_split * params.n_split_tensors; i < n_tensors && i < (i_split + 1) * params.n_split_tensors; ++i) {
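
For reference, a consumer reads these keys back with the matching getters; a minimal sketch, assuming ctx is a gguf_context loaded from one shard:

const int key_no    = gguf_find_key(ctx, LLM_KV_SPLIT_NO);
const int key_count = gguf_find_key(ctx, LLM_KV_SPLIT_COUNT);
if (key_no >= 0 && key_count >= 0) {
    const uint16_t i_split = gguf_get_val_u16(ctx, key_no);    // index of this shard
    const uint16_t n_split = gguf_get_val_u16(ctx, key_count); // total number of shards
}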
@@ -328,12 +328,12 @@ static void gguf_merge(const split_params & split_params) {
ctx_metas.push_back(ctx_meta);
if (i_split == 0) {
auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_GENERAL_SPLIT_N_SPLIT);
auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
if (key_n_split < 0) {
fprintf(stderr,
"\n%s: input file does not contain %s metadata\n",
__func__,
LLM_KV_GENERAL_SPLIT_N_SPLIT);
LLM_KV_SPLIT_COUNT);
gguf_free(ctx_gguf);
ggml_free(ctx_meta);
gguf_free(ctx_out);
@@ -368,7 +368,7 @@ static void gguf_merge(const split_params & split_params) {
}
// Do not trigger merge if we try to merge again the output
gguf_set_val_u16(ctx_gguf, LLM_KV_GENERAL_SPLIT_N_SPLIT, 0);
gguf_set_val_u16(ctx_gguf, LLM_KV_SPLIT_COUNT, 0);
// Set metadata from the first split
gguf_set_kv(ctx_out, ctx_gguf);
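
Writing split.count = 0 into the merged output is what makes the guard work: a later merge pass still finds the key, but sees a zero shard count. A sketch of such a check, assuming ctx_gguf is the candidate input file:

const int key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
if (key_n_split >= 0 && gguf_get_val_u16(ctx_gguf, key_n_split) == 0) {
    // this file is already a merge result; do not merge it again
}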

llama.cpp

@@ -2819,22 +2819,24 @@ struct llama_model_loader {
std::vector<std::unique_ptr<llama_mmap>> mappings;
// Holds information on a tensor data source location.
struct llama_tensor_offset {
// Holds information on a model weights
struct llama_tensor_weights {
uint16_t idx; // source file index
size_t offs; // tensor data offset in the original file
llama_tensor_offset(uint16_t idx, const char * name, struct gguf_context * gguf_ctx) : idx(idx) {
ggml_tensor * tensor;
llama_tensor_weights(uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
}
};
std::unordered_map<std::string, struct llama_tensor_offset> tensors_offs; // unified tensor data offset across files
std::vector<llama_tensor_weights> weights;
std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;
struct gguf_context * ctx_gguf = NULL;
struct ggml_context * ctx_meta = NULL;
struct gguf_context * meta = NULL;
std::vector<ggml_context *> contexts;
std::string arch_name;
LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
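
The invariant behind llama_tensor_weights is that offs is absolute within the tensor's own file: the file's shared data-section offset plus the per-tensor offset, both read from that file's gguf_context. The constructor body, unrolled with intermediate names (hypothetical locals, for illustration):

const int    tensor_idx  = gguf_find_tensor(gguf_ctx, name);
const size_t data_base   = gguf_get_data_offset(gguf_ctx);               // start of the file's data section
const size_t tensor_offs = gguf_get_tensor_offset(gguf_ctx, tensor_idx); // offset within the data section
offs = data_base + tensor_offs;                                          // absolute offset in the source file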
@@ -2845,128 +2847,91 @@ struct llama_model_loader {
trace = atoi(getenv("LLAMA_TRACE"));
}
struct gguf_init_params params = {
/*.no_alloc = */ true,
/*.ctx = */ &ctx_meta,
};
if (param_overrides_p != nullptr) {
for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) {
kv_overrides.insert({std::string(p->key), *p});
}
}
ctx_gguf = gguf_init_from_file(fname.c_str(), params);
if (!ctx_gguf) {
struct ggml_context * ctx = NULL;
struct gguf_init_params params = {
/*.no_alloc = */ true,
/*.ctx = */ &ctx,
};
meta = gguf_init_from_file(fname.c_str(), params);
if (!meta) {
throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
}
files.emplace_back(new llama_file(fname.c_str(), "rb"));
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
// Save tensors data offset of the main file.
// For subsidiary files, gguf_ctx tensor data offset must not be used,
// we build a unified tensors offset index.
for (ggml_tensor * tensor = ggml_get_first_tensor(ctx_meta); tensor; tensor = ggml_get_next_tensor(ctx_meta, tensor)) {
tensors_offs.emplace(tensor->name, llama_tensor_offset(0, tensor->name, ctx_gguf));
// For subsidiary files, `meta` tensor data offset must not be used,
// so we build a unified tensors index for weights.
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
weights.emplace_back(llama_tensor_weights(0, cur->name, meta, cur));
}
files.emplace_back(new llama_file(fname.c_str(), "rb"));
contexts.emplace_back(ctx);
uint16_t n_split = 0;
get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
// Build virtual GGUF/GGML contexts to represent all tensors across files
// Load additional GGML contexts
if (n_split > 1) {
uint16_t idx = 0;
get_key(llm_kv(LLM_KV_SPLIT_NO), idx);
if (idx != 0) {
throw std::runtime_error(format("illegal split file: %d, model must be loaded with the first split", idx));
}
get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);
char split_prefix[PATH_MAX] = {0};
if (!llama_split_prefix(split_prefix, fname.c_str(), fname.size(), idx, n_split)) {
throw std::runtime_error(format("invalid split file: %s", fname.c_str()));
}
size_t mem_size = n_tensors*ggml_tensor_overhead();
struct ggml_init_params pdata = {
/*.mem_size = */ mem_size,
/*.mem_buffer = */ NULL,
/*.no_alloc = */ true,
};
auto * new_ctx_meta = ggml_init(pdata);
if (trace > 0) {
LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
}
for (; idx < n_split; idx++) {
char split_path[PATH_MAX] = {0};
struct ggml_context * split_ctx_meta = NULL;
struct gguf_context * split_ctx_gguf = NULL;
if (idx == 0) {
split_ctx_gguf = ctx_gguf;
split_ctx_meta = ctx_meta;
strcpy(split_path, fname.c_str());
} else {
for (idx = 1; idx < n_split; idx++) {
llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
struct gguf_init_params split_params = {
/*.no_alloc = */ true,
/*.ctx = */ &split_ctx_meta,
/*.ctx = */ &ctx,
};
split_ctx_gguf = gguf_init_from_file(split_path, split_params);
if (!split_ctx_gguf) {
throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, fname.c_str()));
}
struct gguf_context * ctx_gguf = gguf_init_from_file(split_path, split_params);
if (!ctx_gguf) {
throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path));
}
bool ok = true;
for (ggml_tensor * tensor = ggml_get_first_tensor(split_ctx_meta); tensor; tensor = ggml_get_next_tensor(split_ctx_meta, tensor)) {
struct ggml_tensor * copy = ggml_new_tensor(new_ctx_meta, tensor->type, ggml_n_dims(tensor), tensor->ne);
ok = ok && copy != NULL;
if (!ok) {
break;
// Save tensors data offset info of the shard.
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
weights.emplace_back(llama_tensor_weights(idx, cur->name, ctx_gguf, cur));
}
ggml_set_name(copy, tensor->name);
// Add the tensor to the main gguf context if not already present
if (idx > 0) {
gguf_add_tensor(ctx_gguf, copy);
tensors_offs.emplace(tensor->name, llama_tensor_offset(idx, tensor->name, split_ctx_gguf));
}
}
if (!ok) {
throw std::runtime_error(format("%s: failed to read the tensor metadata\n", __func__));
}
if (idx > 0) {
files.emplace_back(new llama_file(split_path, "rb"));
gguf_free(split_ctx_gguf);
ggml_free(split_ctx_meta);
}
}
contexts.emplace_back(ctx);
ggml_free(ctx_meta);
ctx_meta = new_ctx_meta;
gguf_free(ctx_gguf);
}
get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);
GGML_ASSERT(n_tensors == (int) weights.size());
LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split);
}
n_kv = gguf_get_n_kv(ctx_gguf);
n_tensors = gguf_get_n_tensors(ctx_gguf);
n_kv = gguf_get_n_kv(meta);
n_tensors = weights.size();
fver = (enum llama_fver ) gguf_get_version(ctx_gguf);
fver = (enum llama_fver ) gguf_get_version(meta);
for (int i = 0; i < n_tensors; i++) {
const char * name = gguf_get_tensor_name(ctx_gguf, i);
struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name);
n_elements += ggml_nelements(t);
n_bytes += ggml_nbytes(t);
for (auto & w : weights) {
n_elements += ggml_nelements(w.tensor);
n_bytes += ggml_nbytes(w.tensor);
}
LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
@@ -2981,7 +2946,8 @@ struct llama_model_loader {
enum ggml_type type_max = GGML_TYPE_F32;
for (int i = 0; i < n_tensors; i++) {
enum ggml_type type = gguf_get_tensor_type(ctx_gguf, i);
const ggml_tensor * tensor = weights.at(i).tensor;
enum ggml_type type = tensor->type;
n_type[type]++;
@@ -2991,8 +2957,7 @@ struct llama_model_loader {
}
if (trace > 0) {
struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(meta), ggml_type_name(type), llama_format_tensor_shape(meta).c_str());
LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str());
}
}
@@ -3028,22 +2993,22 @@ struct llama_model_loader {
ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);
{
const int kid = gguf_find_key(ctx_gguf, "general.file_type");
const int kid = gguf_find_key(meta, "general.file_type");
if (kid >= 0) {
ftype = (llama_ftype) gguf_get_val_u32(ctx_gguf, kid);
ftype = (llama_ftype) gguf_get_val_u32(meta, kid);
}
}
LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
for (int i = 0; i < n_kv; i++) {
const char * name = gguf_get_key(ctx_gguf, i);
const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
const char * name = gguf_get_key(meta, i);
const enum gguf_type type = gguf_get_kv_type(meta, i);
const std::string type_name =
type == GGUF_TYPE_ARRAY
? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx_gguf, i)), gguf_get_arr_n(ctx_gguf, i))
? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta, i)), gguf_get_arr_n(meta, i))
: gguf_type_name(type);
std::string value = gguf_kv_to_str(ctx_gguf, i);
std::string value = gguf_kv_to_str(meta, i);
const size_t MAX_VALUE_LEN = 40;
if (value.size() > MAX_VALUE_LEN) {
value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
@@ -3072,18 +3037,18 @@ struct llama_model_loader {
}
~llama_model_loader() {
if (ctx_gguf) {
gguf_free(ctx_gguf);
if (meta) {
gguf_free(meta);
}
if (ctx_meta) {
ggml_free(ctx_meta);
for (auto & ctx : contexts) {
ggml_free(ctx);
}
}
template<typename T>
typename std::enable_if<std::is_integral<T>::value, bool>::type
get_arr_n(const std::string & key, T & result, const bool required = true) {
const int kid = gguf_find_key(ctx_gguf, key.c_str());
const int kid = gguf_find_key(meta, key.c_str());
if (kid < 0) {
if (required) {
@@ -3093,7 +3058,7 @@ struct llama_model_loader {
}
struct GGUFMeta::ArrayInfo arr_info =
GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx_gguf, kid);
GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
result = arr_info.length;
@@ -3113,7 +3078,7 @@ struct llama_model_loader {
const struct llama_model_kv_override * override =
it != kv_overrides.end() ? &it->second : nullptr;
const bool found = GGUFMeta::GKV<T>::set(ctx_gguf, key, result, override);
const bool found = GGUFMeta::GKV<T>::set(meta, key, result, override);
if (required && !found) {
throw std::runtime_error(format("key not found in model: %s", key.c_str()));
@@ -3136,20 +3101,29 @@ struct llama_model_loader {
}
const char * get_tensor_name(int i) const {
return gguf_get_tensor_name(ctx_gguf, i);
return weights.at(i).tensor->name;
}
const llama_tensor_weights & get_weights(const char * name) const {
for (const auto & weight : weights) {
if (strcmp(name, weight.tensor->name) == 0) {
return weight;
}
}
throw std::runtime_error(format("tensor %s not found", name));
}
struct ggml_tensor * get_tensor_meta(const char * name) const {
return ggml_get_tensor(ctx_meta, name);
return get_weights(name).tensor;
}
struct ggml_tensor * get_tensor_meta(int i) const {
return get_tensor_meta(get_tensor_name(i));
}
struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta) {
struct ggml_tensor * tensor = ggml_dup_tensor(ctx, meta);
ggml_set_name(tensor, ggml_get_name(meta));
struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur) {
struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
ggml_set_name(tensor, ggml_get_name(cur));
n_created++;
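
get_weights is a linear scan over the weights vector; model files hold at most a few thousand tensors, so the scan stays cheap and keeps the vector as the single source of truth for names, offsets, and file indices. A usage sketch (the tensor name is just an example):

const llama_tensor_weights & w = ml.get_weights("token_embd.weight"); // throws if the name is absent
// w.idx  -> which file the data lives in
// w.offs -> absolute byte offset of the data within that file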
@@ -3157,7 +3131,7 @@ struct llama_model_loader {
}
struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
const struct ggml_tensor * cur = get_tensor_meta(name.c_str());
if (cur == NULL) {
if (!required) {
@@ -3207,9 +3181,8 @@ struct llama_model_loader {
}
// compute the total size of all tensors for progress reporting
for (int i = 0; i < n_tensors; i++) {
struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
size_data += ggml_nbytes(cur);
for (auto & w : weights) {
size_data += ggml_nbytes(w.tensor);
}
}
@@ -3221,28 +3194,28 @@ struct llama_model_loader {
*last = 0;
*addr = mapping->addr;
for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
const auto & tensor_off = tensors_offs.at(ggml_get_name(tensor));
*first = std::min(*first, tensor_off.offs);
*last = std::max(*last, tensor_off.offs + ggml_nbytes(tensor));
const auto & w = get_weights(ggml_get_name(tensor));
*first = std::min(*first, w.offs);
*last = std::max(*last, w.offs + ggml_nbytes(tensor));
}
}
// for backwards compatibility, does not support ggml-backend
void load_data_for(struct ggml_tensor * cur) const {
const auto & t_offs = tensors_offs.at(ggml_get_name(cur));
const auto & w = get_weights(ggml_get_name(cur));
if (use_mmap && t_offs.idx < mappings.size()) {
const auto & mapping = mappings.at(t_offs.idx);
if (use_mmap && w.idx < mappings.size()) {
const auto & mapping = mappings.at(w.idx);
if (cur->data == nullptr) {
cur->data = (uint8_t *)mapping->addr + t_offs.offs;
cur->data = (uint8_t *)mapping->addr + w.offs;
} else {
memcpy(cur->data, (uint8_t *)mapping->addr + t_offs.offs, ggml_nbytes(cur));
memcpy(cur->data, (uint8_t *)mapping->addr + w.offs, ggml_nbytes(cur));
}
} else {
GGML_ASSERT(cur->data != nullptr);
GGML_ASSERT(t_offs.idx < files.size());
const auto & file = files.at(t_offs.idx);
file->seek(t_offs.offs, SEEK_SET);
GGML_ASSERT(w.idx < files.size());
const auto & file = files.at(w.idx);
file->seek(w.offs, SEEK_SET);
file->read_raw(cur->data, ggml_nbytes(cur));
}
}
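
The (w.idx, w.offs) pair is essential once shards share one tensor-name space: each file has its own data section, so a byte offset is only meaningful relative to its own file. The two branches above then pick either the mapping or the file handle for w.idx. A toy illustration with hypothetical values:

// shard 0: token_embd.weight     -> { idx = 0, offs = 0x1000 }
// shard 1: blk.40.ffn_up.weight  -> { idx = 1, offs = 0x1000 }  // same offset, different file
// load_data_for selects mappings[w.idx] (or files[w.idx]) first, then applies w.offs within it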
@@ -3263,39 +3236,39 @@ struct llama_model_loader {
}
}
const auto & t_offs = tensors_offs.at(ggml_get_name(cur));
const auto & w = get_weights(ggml_get_name(cur));
size_t n_size = ggml_nbytes(cur);
if (use_mmap && t_offs.idx < mappings.size()) {
const auto & mapping = mappings.at(t_offs.idx);
if (use_mmap && w.idx < mappings.size()) {
const auto & mapping = mappings.at(w.idx);
ggml_backend_buffer_t buf_mmap = nullptr;
if (bufs_mmap.size() > 1) {
buf_mmap = bufs_mmap[t_offs.idx];
buf_mmap = bufs_mmap[w.idx];
} else if (!bufs_mmap.empty()) {
buf_mmap = bufs_mmap.front();
}
if (buf_mmap && cur->data == nullptr) {
ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *)mapping->addr + t_offs.offs);
ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *)mapping->addr + w.offs);
if (lmlocks) {
const auto & lmlock = lmlocks->at(t_offs.idx);
lmlock->grow_to(t_offs.offs + ggml_nbytes(cur));
const auto & lmlock = lmlocks->at(w.idx);
lmlock->grow_to(w.offs + ggml_nbytes(cur));
}
auto & mmap_used = mmaps_used[t_offs.idx];
mmap_used.first = std::min(mmap_used.first, t_offs.offs);
mmap_used.second = std::max(mmap_used.second, t_offs.offs + n_size);
auto & mmap_used = mmaps_used[w.idx];
mmap_used.first = std::min(mmap_used.first, w.offs);
mmap_used.second = std::max(mmap_used.second, w.offs + n_size);
} else {
ggml_backend_tensor_set(cur, (uint8_t *)mapping->addr + t_offs.offs, 0, n_size);
ggml_backend_tensor_set(cur, (uint8_t *)mapping->addr + w.offs, 0, n_size);
}
} else {
GGML_ASSERT(t_offs.idx < files.size());
const auto & file = files.at(t_offs.idx);
GGML_ASSERT(w.idx < files.size());
const auto & file = files.at(w.idx);
if (ggml_backend_buffer_is_host(cur->buffer)) {
file->seek(t_offs.offs, SEEK_SET);
file->seek(w.offs, SEEK_SET);
file->read_raw(cur->data, ggml_nbytes(cur));
} else {
read_buf.resize(ggml_nbytes(cur));
file->seek(t_offs.offs, SEEK_SET);
file->seek(w.offs, SEEK_SET);
file->read_raw(read_buf.data(), ggml_nbytes(cur));
ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
}
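
The per-mapping mmaps_used pair tracks the [min, max) byte range actually consumed from each file, which lets the loader release untouched head and tail pages once loading is done. A sketch of that release step, assuming llama_mmap exposes an unmap_fragment(first, last) helper as in llama.cpp of this period:

for (uint32_t idx = 0; idx < mappings.size(); idx++) {
    const auto & mmap_used = mmaps_used.at(idx);
    auto & mapping = mappings.at(idx);
    // drop the pages before the first used byte ...
    mapping->unmap_fragment(0, mmap_used.first);
    // ... and after the last used byte
    mapping->unmap_fragment(mmap_used.second, mapping->size);
}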
@@ -3447,7 +3420,7 @@ static void llm_load_hparams(
llama_model_loader & ml,
llama_model & model) {
auto & hparams = model.hparams;
const gguf_context * ctx = ml.ctx_gguf;
const gguf_context * ctx = ml.meta;
// get metadata as string
for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
@@ -3837,7 +3810,7 @@ static void llm_load_vocab(
llama_model & model) {
auto & vocab = model.vocab;
struct gguf_context * ctx = ml.ctx_gguf;
struct gguf_context * ctx = ml.meta;
const auto kv = LLM_KV(model.arch);
@@ -4447,7 +4420,7 @@ static bool llm_load_tensors(
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) {
if (ml.get_tensor_meta(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str())) {
layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd});
layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd});
}
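
Presence checks for optional tensors (here attn_norm_2) now consult the in-memory index instead of re-querying the GGUF file. One subtlety: get_weights as defined above throws for unknown names, so a boolean test like this wants a null-returning variant; a hypothetical sketch, not part of the diff:

// hypothetical helper: like get_weights, but NULL on a miss
const ggml_tensor * try_get_tensor_meta(const char * name) const {
    for (const auto & weight : weights) {
        if (strcmp(name, weight.tensor->name) == 0) {
            return weight.tensor;
        }
    }
    return nullptr;
}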
@@ -12480,12 +12453,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
struct gguf_context * ctx_out = gguf_init_empty();
// copy the KV pairs from the input file
gguf_set_kv (ctx_out, ml.ctx_gguf);
gguf_set_kv (ctx_out, ml.meta);
gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
gguf_set_val_u32(ctx_out, "general.file_type", ftype);
for (int i = 0; i < ml.n_tensors; ++i) {
struct ggml_tensor * meta = ml.get_tensor_meta(i);
const struct ggml_tensor * meta = ml.get_tensor_meta(i);
const std::string name = ggml_get_name(meta);
@@ -12525,7 +12498,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
// populate the original tensors so we get an initial meta data
for (int i = 0; i < ml.n_tensors; ++i) {
struct ggml_tensor * meta = ml.get_tensor_meta(i);
const struct ggml_tensor * meta = ml.get_tensor_meta(i);
gguf_add_tensor(ctx_out, meta);
}
@@ -12851,7 +12824,7 @@ static int llama_apply_lora_from_file_internal(
ggml_tensor * base_t;
if (ml) {
if (gguf_find_tensor(ml->ctx_gguf, base_name.c_str()) < 0) {
if (!ml->get_tensor_meta(base_name.c_str())) {
LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
return 1;
}