diff --git a/examples/merge/merge.cpp b/examples/merge/merge.cpp
index 07c2f3d84..054bf8180 100644
--- a/examples/merge/merge.cpp
+++ b/examples/merge/merge.cpp
@@ -10,8 +10,15 @@
 #include
 #include
 
+struct merge_params {
+    std::string config_path = "merge.csv";
+    std::vector<std::string> model_paths;
+    std::string output_path = "gguf-merged-f16.gguf";
+};
+
 [[noreturn]] static void usage(const char * executable, int exit_code) {
+    struct merge_params defaults;
     printf("usage: %s -c CONFIG_FILE -o OUTPUT_FILE -m MODEL_PATH -m MODEL_PATH ...\n\n", executable);
     printf("\n");
     printf("Merging 2 models and change layers configuration.\n");
@@ -23,20 +30,20 @@ static void usage(const char * executable, int exit_code) {
     printf("- ...\n");
     printf("\n");
     printf("For example:\n");
-    printf("0,1.0,0,0.0 meaning: output layer 0 = A[0]*1.0 + B[0] * 0.0\n");
-    printf("0,1.0,0,0.0 meaning: output layer 1 = A[0]*1.0 + B[0] * 0.0\n");
-    printf("1,0.0,2,0.0 meaning: output layer 2 = A[1]*0.0 + B[2] * 0.0\n");
-    printf("2,0.5,1,0.5 meaning: output layer 3 = A[2]*0.5 + B[1] * 0.5\n");
+    printf("0,1.0,0,0.0 meaning: output layer 0 = A[0]*1.0 + B[0]*0.0\n");
+    printf("0,1.0,0,0.0 meaning: output layer 1 = A[0]*1.0 + B[0]*0.0\n");
+    printf("1,0.0,2,0.0 meaning: output layer 2 = A[1]*0.0 + B[2]*0.0\n");
+    printf("2,0.5,1,0.5 meaning: output layer 3 = A[2]*0.5 + B[1]*0.5\n");
     printf("\n");
     printf("NOTE:\n");
-    printf("- The embedding layer of the first model will be used\n");
-    printf("- Currently, only F16 model type is supported\n");
+    printf("- The embedding and output layers of the first model will be used.\n");
+    printf("- Currently, we accept both quantized and non-quantized models as input, but the output is always an FP16 model. To re-quantize it, please use the \"quantize\" tool.\n");
     printf("\n");
     printf("Options:\n");
     printf("  -h, --help                Show this help message and exit\n");
-    printf("  -c, --config CONFIG_FILE  Path to config file (CSV format)\n");
+    printf("  -c, --config CONFIG_FILE  Path to config file, in CSV format (default: %s)\n", defaults.config_path.c_str());
     printf("  -m, --model MODEL_PATH    Path to model. This option can be repeated multiple times and must be specified in the right order.\n");
-    printf("  -o, --output OUTPUT_FILE  Path to the output model\n");
+    printf("  -o, --output OUTPUT_FILE  Path to the output model (default: %s)\n", defaults.output_path.c_str());
     printf("\n");
     printf("Example: ./merge -c config.csv -o output.gguf -m model_a.gguf -m model_b.gguf\n");
     exit(exit_code);
@@ -92,9 +99,7 @@ static std::vector parse_config(std::string & config_p
 
 int main(int argc, char ** argv) {
     bool invalid_param = false;
-    std::string config_path;
-    std::vector<std::string> model_paths;
-    std::string output_path;
+    struct merge_params params;
 
     std::string arg;
     for (int i = 1; i < argc; i++) {
@@ -106,40 +111,36 @@ int main(int argc, char ** argv) {
                 invalid_param = true;
                 break;
             }
-            config_path = argv[i];
+            params.config_path = argv[i];
         } else if (arg == "-m" || arg == "--model") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            model_paths.push_back(argv[i]);
+            params.model_paths.push_back(argv[i]);
         } else if (arg == "-o" || arg == "--output") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            output_path = argv[i];
+            params.output_path = argv[i];
         }
     }
 
     try {
         if (invalid_param) {
             throw std::invalid_argument("error: invalid parameter for argument: " + arg);
-        } else if (config_path.empty()) {
-            throw std::invalid_argument("error: missing config path");
-        } else if (model_paths.size() < 2) {
+        } else if (params.model_paths.size() < 2) {
             throw std::invalid_argument("error: require at least 2 models");
-        } else if (output_path.empty()) {
-            throw std::invalid_argument("error: missing output path");
         }
 
         // buffers to hold allocated data
         std::vector buf_srcs;
         std::vector buf_scales;
-        auto layers = parse_config(config_path, model_paths.size(), buf_srcs, buf_scales);
+        auto layers = parse_config(params.config_path, params.model_paths.size(), buf_srcs, buf_scales);
 
         std::vector p_model_paths;
-        for (auto & m : model_paths) {
+        for (auto & m : params.model_paths) {
             p_model_paths.push_back(m.data());
         }
         const struct llama_merge_config config{
@@ -147,7 +148,7 @@ int main(int argc, char ** argv) {
             p_model_paths.size(),
             layers.data(),
             layers.size(),
-            output_path.data(),
+            params.output_path.data(),
         };
 
         llama_merge_models(&config);
diff --git a/llama.cpp b/llama.cpp
index f48cc6935..071d69067 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -11309,8 +11309,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     }
 }
 
-int32_t llama_merge_models(const struct llama_merge_config * config)
-{
+int32_t llama_merge_models(const struct llama_merge_config * config) {
 #if defined(__linux__) || defined(_WIN32)
     constexpr bool use_mmap = true;
 #else
@@ -11346,6 +11345,11 @@ int32_t llama_merge_models(const struct llama_merge_config * config)
         return ss.str();
     };
 
+    // if the input model is quantized, it will be dequantized to FP16
+    auto get_output_type = [&](struct ggml_tensor * t) {
+        return ggml_is_quantized(t->type) ? GGML_TYPE_F16 : t->type;
+    };
+
     // remember to call before exit
     auto clean_up = [&]() {
        fout.close();
@@ -11379,8 +11383,10 @@ int32_t llama_merge_models(const struct llama_merge_config * config)
     gguf_set_kv(ctx_out, mls[0]->ctx_gguf);
 
     // correct layer count for output model
-    // TODO: is this key "llama.block_count" this the same for all architectures?
-    gguf_set_val_u32(ctx_out, "llama.block_count", config->n_layers);
+    std::stringstream ss;
+    ss << mls[0]->get_arch_name() << ".block_count";
+    gguf_set_val_u32(ctx_out, ss.str().c_str(), config->n_layers);
+    printf("====> Set new value of %s = %ld\n", ss.str().c_str(), config->n_layers);
 
     // read input layers
     for (int i = 0; i < mls[0]->n_tensors; i++) {
@@ -11409,10 +11415,12 @@ int32_t llama_merge_models(const struct llama_merge_config * config)
             struct ggml_tensor * ref_tensor = mls[0]->get_tensor_meta(ref_name.c_str()); // get ref tensor from layer 0
             memcpy(out_tensor, ref_tensor, GGML_TENSOR_SIZE); // copy metadata (shape, type,...)
             ggml_set_name(out_tensor, get_name(i_layer, ref_name).c_str()); // set the correct name (with correct i_layer)
+            out_tensor->type = get_output_type(ref_tensor); // maybe dequant
             output_tensors.push_back(out_tensor);
             gguf_add_tensor(ctx_out, out_tensor);
             LLAMA_LOG_INFO("%s\n", ggml_get_name(out_tensor));
         }
+        // TODO: how to reuse tensor (duplicated layers)? we can play with ctx->infos[tensor_idx].offset
     }
 
     const size_t meta_size = gguf_get_meta_size(ctx_out);
@@ -11436,27 +11444,47 @@ int32_t llama_merge_models(const struct llama_merge_config * config)
     };
 
     // TODO: maybe we should use ggml_add and ggml_scale? and how?
-    auto calc_output_tensor = [&](enum ggml_type type, std::vector<no_init<uint8_t>> & in_buf, float scale, std::vector<no_init<uint8_t>> & out_buf) {
-        GGML_ASSERT(in_buf.size() == out_buf.size());
+    auto calc_output_tensor = [&](struct ggml_tensor * in_tensor, float scale, std::vector<no_init<uint8_t>> & out_buf) {
+        enum ggml_type type = in_tensor->type;
+        const size_t nelements = ggml_nelements(in_tensor);
+        std::vector<no_init<uint8_t>> tmp_buf;
+        void * in_buf = in_tensor->data;
+        // TODO: if the tensor is quantized, we dequantize to FP16 then to FP32, do the calculation and re-quant to FP16. how can we simplify?
+        if (ggml_is_quantized(type)) {
+            // dequantize it to FP32
+            std::vector<std::thread> workers;
+            int nthread = std::thread::hardware_concurrency();
+            workers.reserve(nthread);
+            std::vector<no_init<float>> f32_conv_buf;
+            llama_convert_tensor_internal(in_tensor, f32_conv_buf, workers, nelements, nthread);
+            // convert FP32 back to FP16
+            type = GGML_TYPE_F16;
+            tmp_buf.resize(nelements * sizeof(ggml_fp16_t));
+            for (size_t i = 0; i < nelements; i++) {
+                ggml_fp16_t * dest = (ggml_fp16_t *) tmp_buf.data();
+                dest[i] = ggml_fp32_to_fp16(f32_conv_buf[i].value);
+            }
+            in_buf = tmp_buf.data();
+        }
+        // then the code block below will calculate the merged tensor
         if (type == GGML_TYPE_F16) {
-            GGML_ASSERT(in_buf.size() % sizeof(ggml_fp16_t) == 0);
-            for (size_t i = 0; i < in_buf.size() / sizeof(ggml_fp16_t); i++) {
-                ggml_fp16_t * in = (ggml_fp16_t *) in_buf.data();
+            out_buf.resize(nelements * sizeof(ggml_fp16_t));
+            for (size_t i = 0; i < nelements; i++) {
+                ggml_fp16_t * in = (ggml_fp16_t *) in_buf;
                 ggml_fp16_t * dest = (ggml_fp16_t *) out_buf.data();
                 float in_dequant = ggml_fp16_to_fp32(in[i]);
                 float res = in_dequant * scale;
                 dest[i] = ggml_fp32_to_fp16(res);
             }
         } else if (type == GGML_TYPE_F32) {
-            GGML_ASSERT(in_buf.size() % sizeof(float) == 0);
-            for (size_t i = 0; i < in_buf.size() / sizeof(float); i++) {
-                float * in = (float *) in_buf.data();
+            out_buf.resize(nelements * sizeof(float));
+            for (size_t i = 0; i < nelements; i++) {
+                float * in = (float *) in_buf;
                 float * dest = (float *) out_buf.data();
                 dest[i] = in[i] * scale;
             }
         } else {
-            LLAMA_LOG_ERROR("Only GGML_TYPE_F16 or GGML_TYPE_F32 is supported, current type = %s\n", ggml_type_name(type));
-            return -1; // return of lambda, no need clean up
+            GGML_ASSERT(false); // should never reach here
         }
         return 0; // return of lambda, no need clean up
     };
@@ -11479,7 +11507,7 @@ int32_t llama_merge_models(const struct llama_merge_config * config)
             read_tensor_data(in_tensor, *mls[0], buf); // read from first model
 
             n_done++;
-            LLAMA_LOG_INFO("[%4ld/%4ld] %36s - [%s], type = %6s\n",
+            LLAMA_LOG_INFO("[%4ld/%4ld] %36s - [%s], input type = %6s\n",
                 n_done, output_tensors.size(),
                 name.c_str(),
                 llama_format_tensor_shape(out_tensor).c_str(),
@@ -11492,8 +11520,8 @@ int32_t llama_merge_models(const struct llama_merge_config * config)
 
         // process layer output tensor
         for (auto & out_tensor : output_tensors) {
-            std::vector<no_init<uint8_t>> in_buf(ggml_nbytes(out_tensor));
-            std::vector<no_init<uint8_t>> out_buf(ggml_nbytes(out_tensor));
+            std::vector<no_init<uint8_t>> in_buf;
+            std::vector<no_init<uint8_t>> out_buf;
             std::string out_name = ggml_get_name(out_tensor);
             int i_layer_out = get_i_layer(out_name.c_str());
 
@@ -11515,7 +11543,7 @@ int32_t llama_merge_models(const struct llama_merge_config * config)
                     return -1;
                 }
                 read_tensor_data(in_tensor, *mls[i_model], in_buf);
-                res = calc_output_tensor(in_tensor->type, in_buf, scale, out_buf);
+                res = calc_output_tensor(in_tensor, scale, out_buf);
                 if (res < 0) {
                     clean_up();
                     return res;
@@ -11523,7 +11551,7 @@ int32_t llama_merge_models(const struct llama_merge_config * config)
             }
 
             n_done++;
-            LLAMA_LOG_INFO("[%4ld/%4ld] %36s - [%s], type = %6s\n",
+            LLAMA_LOG_INFO("[%4ld/%4ld] %36s - [%s], output type = %6s\n",
                 n_done, output_tensors.size(),
                 out_name.c_str(),
                 llama_format_tensor_shape(out_tensor).c_str(),
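
For reference, a config file in the CSV format documented by usage() above could look like the following. This is only an illustration derived from the help text: each row is read as SRC_LAYER_A,SCALE_A,SRC_LAYER_B,SCALE_B, the row index selects the output layer, and a real config would need one row per output layer. File and model names here are hypothetical; merge.csv and gguf-merged-f16.gguf simply match the defaults in merge_params.

    0,1.0,0,0.0
    0,1.0,0,0.0
    1,0.0,2,0.0
    2,0.5,1,0.5

    ./merge -c merge.csv -o gguf-merged-f16.gguf -m model_a.gguf -m model_b.gguf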
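
The per-layer merge rule itself is an element-wise weighted sum, as spelled out in the usage text (output layer = A[i]*scale_a + B[j]*scale_b). Below is a minimal, self-contained C++ sketch of that arithmetic on plain float buffers; it is illustrative only and does not reproduce the FP16/quantized handling that calc_output_tensor performs on ggml tensors.

    #include <cstdio>
    #include <vector>

    // Illustrative only: merge one pair of layers element-wise, following the
    // rule described in the usage text: out[i] = a[i]*scale_a + b[i]*scale_b.
    static std::vector<float> merge_layer(const std::vector<float> & a,
                                          const std::vector<float> & b,
                                          float scale_a, float scale_b) {
        std::vector<float> out(a.size());
        for (size_t i = 0; i < a.size(); i++) {
            out[i] = a[i] * scale_a + b[i] * scale_b;
        }
        return out;
    }

    int main() {
        // corresponds to a hypothetical config row "2,0.5,1,0.5": layer 2 of
        // model A and layer 1 of model B, each weighted by 0.5
        std::vector<float> a = {1.0f, 2.0f, 3.0f};
        std::vector<float> b = {3.0f, 4.0f, 5.0f};
        for (float v : merge_layer(a, b, 0.5f, 0.5f)) {
            printf("%.2f\n", v); // prints 2.00, 3.00, 4.00
        }
        return 0;
    }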