From f09188e9d88797bad4ff5b81292e20cf1f7b1e02 Mon Sep 17 00:00:00 2001
From: ngxson
Date: Fri, 1 Mar 2024 17:43:58 +0100
Subject: [PATCH] merge: add debug msg

---
 examples/merge/merge.cpp |  5 +++++
 llama.cpp                | 37 +++++++++++++++----------------------
 llama.h                  |  5 +++--
 3 files changed, 23 insertions(+), 24 deletions(-)

diff --git a/examples/merge/merge.cpp b/examples/merge/merge.cpp
index 054bf8180..12a2ede14 100644
--- a/examples/merge/merge.cpp
+++ b/examples/merge/merge.cpp
@@ -78,8 +78,10 @@ static std::vector parse_config(std::string & config_p
     buf_scales.resize(lines.size()*n_models);
 
     // process line by line, one line is one layer
+    std::cout << "Parsing configurations:\n";
     std::vector<struct llama_merge_layer> layers;
     for (size_t i_layer = 0; i_layer < lines.size(); i_layer++) {
+        std::cout << "- Layer " << i_layer << " =" << std::flush;
         auto columns = str_split(lines[i_layer], ",");
         if (columns.size() != n_models*2) {
             std::stringstream ss;
@@ -91,8 +93,11 @@
         for (size_t i_model = 0; i_model < n_models; i_model++) {
             srcs[i_model] = std::stoi(columns[i_model*2]);
             scales[i_model] = std::stof(columns[i_model*2 + 1]);
+            // debug message
+            std::cout << " + model[" << i_model << "].layer[" << srcs[i_model] << "]*" << scales[i_model] << std::flush;
         }
         layers.push_back(llama_merge_layer{srcs, scales});
+        std::cout << "\n";
     }
     return layers;
 }
diff --git a/llama.cpp b/llama.cpp
index 071d69067..5aa2a1db1 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -11345,11 +11345,6 @@ int32_t llama_merge_models(const struct llama_merge_config * config) {
         return ss.str();
     };
 
-    // if the input model model is quantized, it will be dequant to FP16
-    auto get_output_type = [&](struct ggml_tensor * t) {
-        return ggml_is_quantized(t->type) ? GGML_TYPE_F16 : t->type;
-    };
-
     // remember to call before exit
     auto clean_up = [&]() {
         fout.close();
@@ -11388,12 +11383,12 @@ int32_t llama_merge_models(const struct llama_merge_config * config) {
     gguf_set_val_u32(ctx_out, ss.str().c_str(), config->n_layers);
     printf("====> Set new value of %s = %ld\n", ss.str().c_str(), config->n_layers);
 
-    // read input layers
+    // read input layers, processing non-layer tensors (embedding, output, ...) first
     for (int i = 0; i < mls[0]->n_tensors; i++) {
         struct ggml_tensor * meta = mls[0]->get_tensor_meta(i);
         int i_layer = get_i_layer(ggml_get_name(meta));
         if (i_layer < 0) {
-            // populate data for non-layers tensor
+            // populate data for non-layer tensors (embedding, output, ...)
             struct ggml_tensor * out_tensor = (struct ggml_tensor *) malloc(GGML_TENSOR_SIZE);
             memcpy(out_tensor, meta, GGML_TENSOR_SIZE); // copy metadata (shape, type,...)
             gguf_add_tensor(ctx_out, out_tensor);
@@ -11415,7 +11410,8 @@ int32_t llama_merge_models(const struct llama_merge_config * config) {
             struct ggml_tensor * ref_tensor = mls[0]->get_tensor_meta(ref_name.c_str()); // get ref tensor from layer 0
             memcpy(out_tensor, ref_tensor, GGML_TENSOR_SIZE); // copy metadata (shape, type,...)
             ggml_set_name(out_tensor, get_name(i_layer, ref_name).c_str()); // set the correct name (with correct i_layer)
-            out_tensor->type = get_output_type(ref_tensor); // maybe dequant
+            // if the input tensor is quantized, it will be dequantized to FP16
+            out_tensor->type = ggml_is_quantized(ref_tensor->type) ? GGML_TYPE_F16 : ref_tensor->type;
             output_tensors.push_back(out_tensor);
             gguf_add_tensor(ctx_out, out_tensor);
             LLAMA_LOG_INFO("%s\n", ggml_get_name(out_tensor));
@@ -11490,6 +11486,15 @@ int32_t llama_merge_models(const struct llama_merge_config * config) {
     };
 
     size_t n_done = 0;
+    auto log_step = [&](const struct ggml_tensor * tensor) {
+        n_done++;
+        LLAMA_LOG_INFO("[%4ld/%4ld] %36s - [%s], input type = %6s\n",
+            n_done, output_tensors.size(),
+            ggml_get_name(tensor),
+            llama_format_tensor_shape(tensor).c_str(),
+            ggml_type_name(tensor->type));
+    };
+
     // process non-layer output tensor
     for (auto & out_tensor : output_tensors) {
         std::string name = ggml_get_name(out_tensor);
@@ -11506,14 +11511,8 @@ int32_t llama_merge_models(const struct llama_merge_config * config) {
         }
         read_tensor_data(in_tensor, *mls[0], buf); // read from first model
 
-        n_done++;
-        LLAMA_LOG_INFO("[%4ld/%4ld] %36s - [%s], input type = %6s\n",
-            n_done, output_tensors.size(),
-            name.c_str(),
-            llama_format_tensor_shape(out_tensor).c_str(),
-            ggml_type_name(out_tensor->type));
-
         // write tensor data + padding
+        log_step(out_tensor);
         fout.write((const char *) buf.data(), buf.size());
         zeros(fout, GGML_PAD(buf.size(), GGUF_DEFAULT_ALIGNMENT) - buf.size());
     }
@@ -11550,14 +11549,8 @@ int32_t llama_merge_models(const struct llama_merge_config * config) {
             }
         }
 
-        n_done++;
-        LLAMA_LOG_INFO("[%4ld/%4ld] %36s - [%s], output type = %6s\n",
-            n_done, output_tensors.size(),
-            out_name.c_str(),
-            llama_format_tensor_shape(out_tensor).c_str(),
-            ggml_type_name(out_tensor->type));
-
         // write tensor data + padding
+        log_step(out_tensor);
         fout.write((const char *) out_buf.data(), out_buf.size());
         zeros(fout, GGML_PAD(out_buf.size(), GGUF_DEFAULT_ALIGNMENT) - out_buf.size());
     }
diff --git a/llama.h b/llama.h
index 23cabe291..068de4192 100644
--- a/llama.h
+++ b/llama.h
@@ -329,8 +329,9 @@ extern "C" {
 
     // used to merge models
     struct llama_merge_layer {
-        const int * srcs; // contains n_models elements
-        const float * scales; // contains n_models elements
+        const int * srcs; // contains n_models elements; if nullptr, then we reuse another layer
+        const float * scales; // contains n_models elements; if nullptr, then we reuse another layer
+        const int i_layer_reuse; // if != -1, then reuse an earlier layer in the model to reduce output size
     };
 
     struct llama_merge_config {