llama_model_loader: PR feedback:
- use only one gguf_context, for metadata only
- store all ggml_context in a vector, as with the files and mappings
- store all weights in a vector along with the source tensor
- rename ctx_gguf to meta
- rename ctx_meta to contexts
Parent: 60a87ae051
Commit: 1892ae7eb1

2 changed files with 125 additions and 152 deletions
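For orientation, the llama.cpp hunks below converge on the following member layout (a condensed excerpt, not the full struct; the declaration of `files` is not part of this diff and is shown here only as an assumption inferred from its usage):

    struct llama_model_loader {
        // single gguf_context, kept for metadata only (from the first split)
        struct gguf_context * meta = NULL;

        // per-split resources: files[i], mappings[i] and contexts[i] belong to split i
        std::vector<std::unique_ptr<llama_file>> files;    // assumed declaration
        std::vector<std::unique_ptr<llama_mmap>> mappings;
        std::vector<ggml_context *>              contexts;

        // every tensor, paired with the split it comes from and its data offset there
        struct llama_tensor_weights {
            uint16_t      idx;    // source file index
            size_t        offs;   // tensor data offset in the original file
            ggml_tensor * tensor; // tensor metadata, no data allocated
        };
        std::vector<llama_tensor_weights> weights;
        // ...
    };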
@@ -26,9 +26,9 @@ enum split_operation : uint8_t {
     SPLIT_OP_MERGE,
 };
 
-static const char * const LLM_KV_GENERAL_SPLIT_I_SPLIT = "split.no";
-static const char * const LLM_KV_GENERAL_SPLIT_N_SPLIT = "split.count";
-static const char * const LLM_KV_GENERAL_SPLIT_N_TENSORS = "split.tensors.count";
+static const char * const LLM_KV_SPLIT_NO = "split.no";
+static const char * const LLM_KV_SPLIT_COUNT = "split.count";
+static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
 
 struct split_params {
     split_operation operation = SPLIT_OP_SPLIT;
@@ -177,9 +177,9 @@ struct split_strategy {
         if (i_split == 0) {
             gguf_set_kv(ctx_out, ctx_gguf);
         }
-        gguf_set_val_u16(ctx_out, LLM_KV_GENERAL_SPLIT_I_SPLIT, i_split);
-        gguf_set_val_u16(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, n_split);
-        gguf_set_val_i32(ctx_out, LLM_KV_GENERAL_SPLIT_N_TENSORS,n_tensors);
+        gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_NO, i_split);
+        gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_COUNT, n_split);
+        gguf_set_val_i32(ctx_out, LLM_KV_SPLIT_TENSORS_COUNT, n_tensors);
 
         // populate the original tensors, so we get an initial metadata
         for (int i = i_split * params.n_split_tensors; i < n_tensors && i < (i_split + 1) * params.n_split_tensors; ++i) {
@@ -328,12 +328,12 @@ static void gguf_merge(const split_params & split_params) {
         ctx_metas.push_back(ctx_meta);
 
         if (i_split == 0) {
-            auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_GENERAL_SPLIT_N_SPLIT);
+            auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
             if (key_n_split < 0) {
                 fprintf(stderr,
                         "\n%s: input file does not contain %s metadata\n",
                         __func__,
-                        LLM_KV_GENERAL_SPLIT_N_SPLIT);
+                        LLM_KV_SPLIT_COUNT);
                 gguf_free(ctx_gguf);
                 ggml_free(ctx_meta);
                 gguf_free(ctx_out);
@@ -368,7 +368,7 @@ static void gguf_merge(const split_params & split_params) {
         }
 
         // Do not trigger merge if we try to merge again the output
-        gguf_set_val_u16(ctx_gguf, LLM_KV_GENERAL_SPLIT_N_SPLIT, 0);
+        gguf_set_val_u16(ctx_gguf, LLM_KV_SPLIT_COUNT, 0);
 
         // Set metadata from the first split
         gguf_set_kv(ctx_out, ctx_gguf);
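A minimal sketch of how the renamed keys travel between split and merge, condensed from the hunks above (the surrounding ctx_out / ctx_gguf setup is omitted):

    // split side: stamp each output shard with its index and the totals
    gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_NO,            i_split);   // "split.no"
    gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_COUNT,         n_split);   // "split.count"
    gguf_set_val_i32(ctx_out, LLM_KV_SPLIT_TENSORS_COUNT, n_tensors); // "split.tensors.count"

    // merge side: reject an input that never carried split metadata
    if (gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT) < 0) {
        fprintf(stderr, "%s: input file does not contain %s metadata\n", __func__, LLM_KV_SPLIT_COUNT);
    }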
llama.cpp (245 lines changed)
@@ -2819,22 +2819,24 @@ struct llama_model_loader {
 
     std::vector<std::unique_ptr<llama_mmap>> mappings;
 
-    // Holds information on a tensor data source location.
-    struct llama_tensor_offset {
+    // Holds information on a model weights
+    struct llama_tensor_weights {
         uint16_t idx; // source file index
         size_t offs; // tensor data offset in the original file
 
-        llama_tensor_offset(uint16_t idx, const char * name, struct gguf_context * gguf_ctx) : idx(idx) {
+        ggml_tensor * tensor;
+
+        llama_tensor_weights(uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
             const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
             offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
         }
     };
-    std::unordered_map<std::string, struct llama_tensor_offset> tensors_offs; // unified tensor data offset across files
+    std::vector<llama_tensor_weights> weights;
 
     std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;
 
-    struct gguf_context * ctx_gguf = NULL;
-    struct ggml_context * ctx_meta = NULL;
+    struct gguf_context * meta = NULL;
+    std::vector<ggml_context *> contexts;
 
     std::string arch_name;
     LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
@@ -2845,128 +2847,91 @@ struct llama_model_loader {
             trace = atoi(getenv("LLAMA_TRACE"));
         }
 
-        struct gguf_init_params params = {
-            /*.no_alloc = */ true,
-            /*.ctx      = */ &ctx_meta,
-        };
-
         if (param_overrides_p != nullptr) {
             for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) {
                 kv_overrides.insert({std::string(p->key), *p});
             }
         }
 
-        ctx_gguf = gguf_init_from_file(fname.c_str(), params);
-        if (!ctx_gguf) {
+        struct ggml_context * ctx = NULL;
+        struct gguf_init_params params = {
+            /*.no_alloc = */ true,
+            /*.ctx      = */ &ctx,
+        };
+
+        meta = gguf_init_from_file(fname.c_str(), params);
+        if (!meta) {
             throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
         }
-        files.emplace_back(new llama_file(fname.c_str(), "rb"));
 
         get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
         llm_kv = LLM_KV(llm_arch_from_string(arch_name));
 
         // Save tensors data offset of the main file.
-        // For subsidiary files, gguf_ctx tensor data offset must not be used,
-        // we build a unified tensors offset index.
-        for (ggml_tensor * tensor = ggml_get_first_tensor(ctx_meta); tensor; tensor = ggml_get_next_tensor(ctx_meta, tensor)) {
-            tensors_offs.emplace(tensor->name, llama_tensor_offset(0, tensor->name, ctx_gguf));
+        // For subsidiary files, `meta` tensor data offset must not be used,
+        // so we build a unified tensors index for weights.
+        for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
+            weights.emplace_back(llama_tensor_weights(0, cur->name, meta, cur));
         }
+        files.emplace_back(new llama_file(fname.c_str(), "rb"));
+        contexts.emplace_back(ctx);
 
         uint16_t n_split = 0;
         get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
 
-        // Build virtual GGUF/GGML contexts to represent all tensors across files
+        // Load additional GGML contexts
         if (n_split > 1) {
             uint16_t idx = 0;
             get_key(llm_kv(LLM_KV_SPLIT_NO), idx);
             if (idx != 0) {
                 throw std::runtime_error(format("illegal split file: %d, model must be loaded with the first split", idx));
             }
-            get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);
 
             char split_prefix[PATH_MAX] = {0};
             if (!llama_split_prefix(split_prefix, fname.c_str(), fname.size(), idx, n_split)) {
                 throw std::runtime_error(format("invalid split file: %s", fname.c_str()));
             }
 
-            size_t mem_size = n_tensors*ggml_tensor_overhead();
-            struct ggml_init_params pdata = {
-                /*.mem_size   = */ mem_size,
-                /*.mem_buffer = */ NULL,
-                /*.no_alloc   = */ true,
-            };
-
-            auto * new_ctx_meta = ggml_init(pdata);
-
             if (trace > 0) {
                 LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
             }
 
-            for (; idx < n_split; idx++) {
+            for (idx = 1; idx < n_split; idx++) {
                 char split_path[PATH_MAX] = {0};
-                struct ggml_context * split_ctx_meta = NULL;
-                struct gguf_context * split_ctx_gguf = NULL;
-                if (idx == 0) {
-                    split_ctx_gguf = ctx_gguf;
-                    split_ctx_meta = ctx_meta;
-                    strcpy(split_path, fname.c_str());
-                } else {
-                    llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
-
-                    struct gguf_init_params split_params = {
-                        /*.no_alloc = */ true,
-                        /*.ctx      = */ &split_ctx_meta,
-                    };
-                    split_ctx_gguf = gguf_init_from_file(split_path, split_params);
-                    if (!split_ctx_gguf) {
-                        throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, fname.c_str()));
-                    }
-                }
+                llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
 
-                bool ok = true;
-                for (ggml_tensor * tensor = ggml_get_first_tensor(split_ctx_meta); tensor; tensor = ggml_get_next_tensor(split_ctx_meta, tensor)) {
-                    struct ggml_tensor * copy = ggml_new_tensor(new_ctx_meta, tensor->type, ggml_n_dims(tensor), tensor->ne);
-                    ok = ok && copy != NULL;
-
-                    if (!ok) {
-                        break;
-                    }
-
-                    ggml_set_name(copy, tensor->name);
-
-                    // Add the tensor to the main gguf context if not already present
-                    if (idx > 0) {
-                        gguf_add_tensor(ctx_gguf, copy);
-                        tensors_offs.emplace(tensor->name, llama_tensor_offset(idx, tensor->name, split_ctx_gguf));
-                    }
-                }
-
-                if (!ok) {
-                    throw std::runtime_error(format("%s: failed to read the tensor metadata\n", __func__));
-                }
-
-                if (idx > 0) {
-                    files.emplace_back(new llama_file(split_path, "rb"));
-                    gguf_free(split_ctx_gguf);
-                    ggml_free(split_ctx_meta);
-                }
-            }
+                struct gguf_init_params split_params = {
+                    /*.no_alloc = */ true,
+                    /*.ctx      = */ &ctx,
+                };
+                struct gguf_context * ctx_gguf = gguf_init_from_file(split_path, split_params);
+                if (!ctx_gguf) {
+                    throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path));
+                }
 
-            ggml_free(ctx_meta);
-            ctx_meta = new_ctx_meta;
+                // Save tensors data offset info of the shard.
+                for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
+                    weights.emplace_back(llama_tensor_weights(idx, cur->name, ctx_gguf, cur));
+                }
+                files.emplace_back(new llama_file(split_path, "rb"));
+                contexts.emplace_back(ctx);
 
+                gguf_free(ctx_gguf);
+            }
+
+            get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);
+            GGML_ASSERT(n_tensors == (int) weights.size());
+
             LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split);
         }
 
-        n_kv = gguf_get_n_kv(ctx_gguf);
-        n_tensors = gguf_get_n_tensors(ctx_gguf);
+        n_kv = gguf_get_n_kv(meta);
+        n_tensors = weights.size();
 
-        fver = (enum llama_fver ) gguf_get_version(ctx_gguf);
+        fver = (enum llama_fver ) gguf_get_version(meta);
 
-        for (int i = 0; i < n_tensors; i++) {
-            const char * name = gguf_get_tensor_name(ctx_gguf, i);
-            struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name);
-            n_elements += ggml_nelements(t);
-            n_bytes += ggml_nbytes(t);
+        for (auto & w : weights) {
+            n_elements += ggml_nelements(w.tensor);
+            n_bytes += ggml_nbytes(w.tensor);
         }
 
         LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
@@ -2981,7 +2946,8 @@ struct llama_model_loader {
             enum ggml_type type_max = GGML_TYPE_F32;
 
             for (int i = 0; i < n_tensors; i++) {
-                enum ggml_type type = gguf_get_tensor_type(ctx_gguf, i);
+                const ggml_tensor * tensor = weights.at(i).tensor;
+                enum ggml_type type = tensor->type;
 
                 n_type[type]++;
 
@@ -2991,8 +2957,7 @@ struct llama_model_loader {
                 }
 
                 if (trace > 0) {
-                    struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
-                    LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(meta), ggml_type_name(type), llama_format_tensor_shape(meta).c_str());
+                    LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str());
                 }
             }
 
@@ -3028,22 +2993,22 @@ struct llama_model_loader {
             ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);
 
             {
-                const int kid = gguf_find_key(ctx_gguf, "general.file_type");
+                const int kid = gguf_find_key(meta, "general.file_type");
                 if (kid >= 0) {
-                    ftype = (llama_ftype) gguf_get_val_u32(ctx_gguf, kid);
+                    ftype = (llama_ftype) gguf_get_val_u32(meta, kid);
                 }
             }
 
         LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
         for (int i = 0; i < n_kv; i++) {
-            const char * name = gguf_get_key(ctx_gguf, i);
-            const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
+            const char * name = gguf_get_key(meta, i);
+            const enum gguf_type type = gguf_get_kv_type(meta, i);
             const std::string type_name =
                 type == GGUF_TYPE_ARRAY
-                ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx_gguf, i)), gguf_get_arr_n(ctx_gguf, i))
+                ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta, i)), gguf_get_arr_n(meta, i))
                 : gguf_type_name(type);
 
-            std::string value = gguf_kv_to_str(ctx_gguf, i);
+            std::string value = gguf_kv_to_str(meta, i);
             const size_t MAX_VALUE_LEN = 40;
             if (value.size() > MAX_VALUE_LEN) {
                 value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
@@ -3072,18 +3037,18 @@ struct llama_model_loader {
     }
 
     ~llama_model_loader() {
-        if (ctx_gguf) {
-            gguf_free(ctx_gguf);
+        if (meta) {
+            gguf_free(meta);
         }
-        if (ctx_meta) {
-            ggml_free(ctx_meta);
+        for (auto & ctx : contexts) {
+            ggml_free(ctx);
         }
     }
 
     template<typename T>
     typename std::enable_if<std::is_integral<T>::value, bool>::type
     get_arr_n(const std::string & key, T & result, const bool required = true) {
-        const int kid = gguf_find_key(ctx_gguf, key.c_str());
+        const int kid = gguf_find_key(meta, key.c_str());
 
         if (kid < 0) {
             if (required) {
@@ -3093,7 +3058,7 @@ struct llama_model_loader {
         }
 
         struct GGUFMeta::ArrayInfo arr_info =
-            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx_gguf, kid);
+            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
 
 
         result = arr_info.length;
@@ -3113,7 +3078,7 @@ struct llama_model_loader {
         const struct llama_model_kv_override * override =
             it != kv_overrides.end() ? &it->second : nullptr;
 
-        const bool found = GGUFMeta::GKV<T>::set(ctx_gguf, key, result, override);
+        const bool found = GGUFMeta::GKV<T>::set(meta, key, result, override);
 
         if (required && !found) {
             throw std::runtime_error(format("key not found in model: %s", key.c_str()));
@@ -3136,20 +3101,29 @@ struct llama_model_loader {
     }
 
     const char * get_tensor_name(int i) const {
-        return gguf_get_tensor_name(ctx_gguf, i);
+        return weights.at(i).tensor->name;
+    }
+
+    const llama_tensor_weights & get_weights(const char * name) const {
+        for (const auto & weight : weights) {
+            if (strcmp(name, weight.tensor->name) == 0) {
+                return weight;
+            }
+        }
+        throw std::runtime_error(format("tensor %s not found", name));
     }
 
     struct ggml_tensor * get_tensor_meta(const char * name) const {
-        return ggml_get_tensor(ctx_meta, name);
+        return get_weights(name).tensor;
     }
 
     struct ggml_tensor * get_tensor_meta(int i) const {
         return get_tensor_meta(get_tensor_name(i));
     }
 
-    struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta) {
-        struct ggml_tensor * tensor = ggml_dup_tensor(ctx, meta);
-        ggml_set_name(tensor, ggml_get_name(meta));
+    struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur) {
+        struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
+        ggml_set_name(tensor, ggml_get_name(cur));
 
         n_created++;
 
@@ -3157,7 +3131,7 @@ struct llama_model_loader {
     }
 
     struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
-        struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
+        const struct ggml_tensor * cur = get_tensor_meta(name.c_str());
 
         if (cur == NULL) {
             if (!required) {
@@ -3207,9 +3181,8 @@ struct llama_model_loader {
         }
 
         // compute the total size of all tensors for progress reporting
-        for (int i = 0; i < n_tensors; i++) {
-            struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
-            size_data += ggml_nbytes(cur);
+        for (auto & w : weights) {
+            size_data += ggml_nbytes(w.tensor);
         }
     }
 
@@ -3221,28 +3194,28 @@ struct llama_model_loader {
         *last = 0;
         *addr = mapping->addr;
         for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
-            const auto & tensor_off = tensors_offs.at(ggml_get_name(tensor));
-            *first = std::min(*first, tensor_off.offs);
-            *last = std::max(*last, tensor_off.offs + ggml_nbytes(tensor));
+            const auto & w = get_weights(ggml_get_name(tensor));
+            *first = std::min(*first, w.offs);
+            *last = std::max(*last, w.offs + ggml_nbytes(tensor));
         }
     }
 
     // for backwards compatibility, does not support ggml-backend
     void load_data_for(struct ggml_tensor * cur) const {
-        const auto & t_offs = tensors_offs.at(ggml_get_name(cur));
+        const auto & w = get_weights(ggml_get_name(cur));
 
-        if (use_mmap && t_offs.idx < mappings.size()) {
-            const auto & mapping = mappings.at(t_offs.idx);
+        if (use_mmap && w.idx < mappings.size()) {
+            const auto & mapping = mappings.at(w.idx);
             if (cur->data == nullptr) {
-                cur->data = (uint8_t *)mapping->addr + t_offs.offs;
+                cur->data = (uint8_t *)mapping->addr + w.offs;
             } else {
-                memcpy(cur->data, (uint8_t *)mapping->addr + t_offs.offs, ggml_nbytes(cur));
+                memcpy(cur->data, (uint8_t *)mapping->addr + w.offs, ggml_nbytes(cur));
             }
         } else {
             GGML_ASSERT(cur->data != nullptr);
-            GGML_ASSERT(t_offs.idx < files.size());
-            const auto & file = files.at(t_offs.idx);
-            file->seek(t_offs.offs, SEEK_SET);
+            GGML_ASSERT(w.idx < files.size());
+            const auto & file = files.at(w.idx);
+            file->seek(w.offs, SEEK_SET);
             file->read_raw(cur->data, ggml_nbytes(cur));
         }
     }
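To summarize the data path after this hunk (a condensed restatement of load_data_for above, not additional code): every read first resolves the tensor's llama_tensor_weights entry, then uses its file index and offset.

    const auto & w = get_weights(ggml_get_name(cur)); // throws if the name is unknown

    if (use_mmap && w.idx < mappings.size()) {
        // mmap path: tensor data lives inside the mapping of split w.idx
        cur->data = (uint8_t *) mappings.at(w.idx)->addr + w.offs;
    } else {
        // file path: seek the matching split file to the recorded offset and read
        files.at(w.idx)->seek(w.offs, SEEK_SET);
        files.at(w.idx)->read_raw(cur->data, ggml_nbytes(cur));
    }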
@@ -3263,39 +3236,39 @@ struct llama_model_loader {
             }
         }
 
-        const auto & t_offs = tensors_offs.at(ggml_get_name(cur));
+        const auto & w = get_weights(ggml_get_name(cur));
         size_t n_size = ggml_nbytes(cur);
 
-        if (use_mmap && t_offs.idx < mappings.size()) {
-            const auto & mapping = mappings.at(t_offs.idx);
+        if (use_mmap && w.idx < mappings.size()) {
+            const auto & mapping = mappings.at(w.idx);
             ggml_backend_buffer_t buf_mmap = nullptr;
             if (bufs_mmap.size() > 1) {
-                buf_mmap = bufs_mmap[t_offs.idx];
+                buf_mmap = bufs_mmap[w.idx];
             } else if (!bufs_mmap.empty()) {
                 buf_mmap = bufs_mmap.front();
             }
             if (buf_mmap && cur->data == nullptr) {
-                ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *)mapping->addr + t_offs.offs);
+                ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *)mapping->addr + w.offs);
                 if (lmlocks) {
-                    const auto & lmlock = lmlocks->at(t_offs.idx);
-                    lmlock->grow_to(t_offs.offs + ggml_nbytes(cur));
+                    const auto & lmlock = lmlocks->at(w.idx);
+                    lmlock->grow_to(w.offs + ggml_nbytes(cur));
                 }
 
-                auto & mmap_used = mmaps_used[t_offs.idx];
-                mmap_used.first = std::min(mmap_used.first, t_offs.offs);
-                mmap_used.second = std::max(mmap_used.second, t_offs.offs + n_size);
+                auto & mmap_used = mmaps_used[w.idx];
+                mmap_used.first = std::min(mmap_used.first, w.offs);
+                mmap_used.second = std::max(mmap_used.second, w.offs + n_size);
             } else {
-                ggml_backend_tensor_set(cur, (uint8_t *)mapping->addr + t_offs.offs, 0, n_size);
+                ggml_backend_tensor_set(cur, (uint8_t *)mapping->addr + w.offs, 0, n_size);
             }
         } else {
-            GGML_ASSERT(t_offs.idx < files.size());
-            const auto & file = files.at(t_offs.idx);
+            GGML_ASSERT(w.idx < files.size());
+            const auto & file = files.at(w.idx);
             if (ggml_backend_buffer_is_host(cur->buffer)) {
-                file->seek(t_offs.offs, SEEK_SET);
+                file->seek(w.offs, SEEK_SET);
                 file->read_raw(cur->data, ggml_nbytes(cur));
             } else {
                 read_buf.resize(ggml_nbytes(cur));
-                file->seek(t_offs.offs, SEEK_SET);
+                file->seek(w.offs, SEEK_SET);
                 file->read_raw(read_buf.data(), ggml_nbytes(cur));
                 ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
             }
@@ -3447,7 +3420,7 @@ static void llm_load_hparams(
         llama_model_loader & ml,
         llama_model & model) {
     auto & hparams = model.hparams;
-    const gguf_context * ctx = ml.ctx_gguf;
+    const gguf_context * ctx = ml.meta;
 
     // get metadata as string
     for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
@@ -3837,7 +3810,7 @@ static void llm_load_vocab(
         llama_model & model) {
     auto & vocab = model.vocab;
 
-    struct gguf_context * ctx = ml.ctx_gguf;
+    struct gguf_context * ctx = ml.meta;
 
     const auto kv = LLM_KV(model.arch);
 
@@ -4447,7 +4420,7 @@ static bool llm_load_tensors(
                     layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
                     layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
 
-                    if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) {
+                    if (ml.get_tensor_meta(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str())) {
                         layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd});
                         layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd});
                     }
@@ -12480,12 +12453,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     struct gguf_context * ctx_out = gguf_init_empty();
 
     // copy the KV pairs from the input file
-    gguf_set_kv     (ctx_out, ml.ctx_gguf);
+    gguf_set_kv     (ctx_out, ml.meta);
     gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
     gguf_set_val_u32(ctx_out, "general.file_type", ftype);
 
     for (int i = 0; i < ml.n_tensors; ++i) {
-        struct ggml_tensor * meta = ml.get_tensor_meta(i);
+        const struct ggml_tensor * meta = ml.get_tensor_meta(i);
 
         const std::string name = ggml_get_name(meta);
 
@@ -12525,7 +12498,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     // populate the original tensors so we get an initial meta data
     for (int i = 0; i < ml.n_tensors; ++i) {
-        struct ggml_tensor * meta = ml.get_tensor_meta(i);
+        const struct ggml_tensor * meta = ml.get_tensor_meta(i);
         gguf_add_tensor(ctx_out, meta);
     }
 
@@ -12851,7 +12824,7 @@ static int llama_apply_lora_from_file_internal(
 
         ggml_tensor * base_t;
         if (ml) {
-            if (gguf_find_tensor(ml->ctx_gguf, base_name.c_str()) < 0) {
+            if (!ml->get_tensor_meta(base_name.c_str())) {
                 LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
                 return 1;
             }