llama : minor updates
ggml-ci
parent 0ec27ad66c · commit 7bbbf38c32
1 changed file with 43 additions and 35 deletions: llama.cpp
@@ -1109,11 +1109,11 @@ static bool llama_kv_cache_init(
 // model loading and saving
 //
 
-enum llama_file_version {
+enum llama_fver {
     GGUF_FILE_VERSION_V1 = 1,
 };
 
-static const char * llama_file_version_name(llama_file_version version) {
+static const char * llama_file_version_name(llama_fver version) {
     switch (version) {
         case GGUF_FILE_VERSION_V1: return "GGUF V1 (latest)";
     }
@@ -1150,7 +1150,7 @@ struct llama_model_loader {
 
     llama_file file;
     llama_ftype ftype;
-    llama_file_version fver;
+    llama_fver fver;
 
     std::unique_ptr<llama_mmap> mapping;
 
@@ -1171,7 +1171,7 @@ struct llama_model_loader {
         n_kv = gguf_get_n_kv(ctx_gguf);
         n_tensors = gguf_get_n_tensors(ctx_gguf);
 
-        fver = (enum llama_file_version) gguf_get_version(ctx_gguf);
+        fver = (enum llama_fver) gguf_get_version(ctx_gguf);
 
         for (int i = 0; i < n_tensors; i++) {
             const char * name = gguf_get_tensor_name(ctx_gguf, i);
@@ -1268,6 +1268,21 @@ struct llama_model_loader {
         }
     }
 
+    std::string get_arch_name() const {
+        const auto kv = LLM_KV(LLM_ARCH_UNKNOWN);
+
+        std::string arch_name;
+        GGUF_GET_KEY(ctx_gguf, arch_name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_ARCHITECTURE));
+
+        return arch_name;
+    }
+
+    enum llm_arch get_arch() const {
+        const std::string arch_name = get_arch_name();
+
+        return llm_arch_from_string(arch_name);
+    }
+
     const char * get_tensor_name(int i) const {
         return gguf_get_tensor_name(ctx_gguf, i);
     }
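The two new accessors read the general.architecture key and map it to an llm_arch value. A minimal caller-side sketch of how they might be queried, not part of this commit and with a placeholder file name:

    // hypothetical usage sketch
    std::unique_ptr<llama_model_loader> ml(new llama_model_loader("model.gguf", /*use_mmap*/ false));

    const std::string arch_name = ml->get_arch_name(); // empty if general.architecture is absent (key is read as optional)
    const enum llm_arch arch    = ml->get_arch();      // an unrecognized or missing name resolves to LLM_ARCH_UNKNOWN

    LLAMA_LOG_INFO("%s: general.architecture = '%s'\n", __func__, arch_name.c_str());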
@@ -1480,16 +1495,9 @@ static const char * llama_model_type_name(e_model type) {
 }
 
 static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
-    struct gguf_context * ctx = ml.ctx_gguf;
+    model.arch = ml.get_arch();
 
-    const auto kv = LLM_KV(LLM_ARCH_UNKNOWN);
-
-    std::string arch_name;
-    GGUF_GET_KEY(ctx, arch_name, gguf_get_val_str, GGUF_TYPE_STRING, true, kv(LLM_KV_GENERAL_ARCHITECTURE));
-
-    model.arch = llm_arch_from_string(arch_name);
-
     if (model.arch == LLM_ARCH_UNKNOWN) {
-        throw std::runtime_error("unknown model architecture: '" + arch_name + "'");
+        throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'");
     }
 }
@@ -4048,13 +4056,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         nthread = std::thread::hardware_concurrency();
     }
 
-    std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false));
+    std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname_inp, /*use_mmap*/ false));
 
     const size_t align = GGUF_DEFAULT_ALIGNMENT;
     struct gguf_context * ctx_out = gguf_init_empty();
 
     // copy the KV pairs from the input file
-    gguf_set_kv     (ctx_out, model_loader->ctx_gguf);
+    gguf_set_kv     (ctx_out, ml->ctx_gguf);
     gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
     gguf_set_val_u32(ctx_out, "general.file_type", ftype);
 
@@ -4062,8 +4070,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     int n_attention_wv = 0;
     int n_feed_forward_w2 = 0;
 
-    for (int i = 0; i < model_loader->n_tensors; ++i) {
-        struct ggml_tensor * meta = model_loader->get_tensor_meta(i);
+    for (int i = 0; i < ml->n_tensors; ++i) {
+        struct ggml_tensor * meta = ml->get_tensor_meta(i);
 
         const std::string name = ggml_get_name(meta);
 
@@ -4097,8 +4105,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<uint8_t> work;
 
     // populate the original tensors so we get an initial meta data
-    for (int i = 0; i < model_loader->n_tensors; ++i) {
-        struct ggml_tensor * meta = model_loader->get_tensor_meta(i);
+    for (int i = 0; i < ml->n_tensors; ++i) {
+        struct ggml_tensor * meta = ml->get_tensor_meta(i);
         gguf_add_tensor(ctx_out, meta);
     }
 
@@ -4111,17 +4119,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     // placeholder for the meta data
     ::zeros(fout, meta_size);
 
-    for (int i = 0; i < model_loader->n_tensors; ++i) {
-        struct ggml_tensor * tensor = model_loader->get_tensor_meta(i);
+    for (int i = 0; i < ml->n_tensors; ++i) {
+        struct ggml_tensor * tensor = ml->get_tensor_meta(i);
 
         const std::string name = ggml_get_name(tensor);
 
         read_data.resize(ggml_nbytes(tensor));
         tensor->data = read_data.data();
-        model_loader->load_data_for(tensor);
+        ml->load_data_for(tensor);
 
         LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
-               ++idx, model_loader->n_tensors,
+               ++idx, ml->n_tensors,
                ggml_get_name(tensor),
                llama_format_tensor_shape(tensor).c_str(),
                ggml_type_name(tensor->type));
@@ -4147,7 +4155,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
         // TODO: avoid hardcoded tensor names - use the TN_* constants
-        const auto tn = LLM_TN(LLM_ARCH_LLAMA);
+        const auto tn = LLM_TN(ml->get_arch());
 
         if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
             int nx = tensor->ne[0];
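For context, LLM_TN resolves an abstract tensor identifier to the architecture-specific name string, so the k-quant heuristics above key off whatever architecture the loader reports instead of assuming LLaMA. A hedged sketch of the comparison it enables, using the identifiers from the hunk above (the concrete name for LLaMA is an assumption based on the usual llama.cpp tensor naming):

    // hypothetical illustration, not part of this commit
    const auto tn = LLM_TN(ml->get_arch());

    // for a LLaMA-architecture file this is expected to compare against "output.weight";
    // other architectures resolve LLM_TENSOR_OUTPUT to their own tensor name
    if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
        // adjust the quantization type chosen for the output tensor, as the surrounding code does
    }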
@@ -4386,28 +4394,28 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
     }
 
     // load base model
-    std::unique_ptr<llama_model_loader> model_loader;
+    std::unique_ptr<llama_model_loader> ml;
     ggml_context * base_ctx = NULL;
     std::vector<uint8_t> base_buf;
     if (path_base_model) {
         LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
-        model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
+        ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
 
         size_t ctx_size;
         size_t mmapped_size;
-        model_loader->calc_sizes(ctx_size, mmapped_size);
+        ml->calc_sizes(ctx_size, mmapped_size);
         base_buf.resize(ctx_size);
 
         ggml_init_params base_params;
         base_params.mem_size = base_buf.size();
         base_params.mem_buffer = base_buf.data();
-        base_params.no_alloc = model_loader->use_mmap;
+        base_params.no_alloc = ml->use_mmap;
 
         base_ctx = ggml_init(base_params);
 
         // maybe this should in llama_model_loader
-        if (model_loader->use_mmap) {
-            model_loader->mapping.reset(new llama_mmap(&model_loader->file, /* prefetch */ 0, ggml_is_numa()));
+        if (ml->use_mmap) {
+            ml->mapping.reset(new llama_mmap(&ml->file, /* prefetch */ 0, ggml_is_numa()));
         }
     }
 
@@ -4511,8 +4519,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 #endif // GGML_USE_CUBLAS
 
         ggml_tensor * base_t;
-        if (model_loader) {
-            struct gguf_context * ctx_gguf = model_loader->ctx_gguf;
+        if (ml) {
+            struct gguf_context * ctx_gguf = ml->ctx_gguf;
 
             // load from base model
             if (gguf_find_tensor(ctx_gguf, base_name.c_str()) < 0) {
@@ -4522,8 +4530,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
             }
 
             // TODO: not tested!! maybe not working!
-            base_t = model_loader->create_tensor(base_ctx, base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
-            model_loader->load_data_for(base_t);
+            base_t = ml->create_tensor(base_ctx, base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
+            ml->load_data_for(base_t);
         } else {
             base_t = dest_t;
         }