diff --git a/.gitignore b/.gitignore
index 62b6b8b1a..b485665da 100644
--- a/.gitignore
+++ b/.gitignore
@@ -72,6 +72,7 @@ models-mnt
 /train-text-from-scratch
 /tokenize
 /vdot
+/merge
 /common/build-info.cpp
 arm_neon.h
 compile_commands.json
diff --git a/examples/merge/merge.cpp b/examples/merge/merge.cpp
index b1ecc123c..593f2032b 100644
--- a/examples/merge/merge.cpp
+++ b/examples/merge/merge.cpp
@@ -12,32 +12,107 @@
 // usage:
-//  ./merge ./path/model_1 LAYERS_1 ./path/model_2 LAYERS_2
+//  ./merge ./path/model_1 CONFIG1 ./path/model_2 CONFIG2 ./path/output
 //
 
 [[noreturn]]
 static void usage(const char * executable) {
-    printf("usage: %s ./path/model_1 LAYERS_1 ./path/model_2 LAYERS_2\n\n", executable);
-    printf("  LAYERS must be in format: p0-p1,p2-p3,p4,... Example: 0-5,7,8-12\n");
-    //printf("  Optionally, you can specify the scaling for a range of layers, for example: 0-5*0.5,6-7*1\n");
-    printf("  The embedding layer of the first model will be used");
+    printf("usage: %s ./path/model_1 CONFIG1 ./path/model_2 CONFIG2 ./path/output\n\n", executable);
+    printf("  CONFIG must be in format: p0-p1,p2-p3,p4,... Example: 0-5,7,8-12\n");
+    printf("  Optionally, you can specify the scale for a range of layers, for example: 0-5*0.5,6-7*1. By default, the scale is 0.5. Layer numbers start counting from 0.\n");
+    printf("  The embedding layer of the first model will be used\n");
+    printf("  NOTE: currently, only F16 model type is supported\n");
     exit(1);
 }
 
+inline std::vector<std::string> str_split(std::string str, const std::string & delimiter) {
+    size_t pos = 0;
+    std::string token;
+    std::vector<std::string> output;
+    while ((pos = str.find(delimiter)) != std::string::npos) {
+        token = str.substr(0, pos);
+        output.push_back(token);
+        str.erase(0, pos + delimiter.length());
+    }
+    output.push_back(str); // the rest
+    return output;
+}
+
+static std::vector<llama_merge_config> parse_config(std::string & input) {
+    std::vector<llama_merge_config> configs;
+    auto intervals = str_split(input, ",");
+    for (auto & interval : intervals) {
+        auto components = str_split(interval, "*");
+        if (components.empty()) {
+            throw std::runtime_error("Config is incorrect");
+        }
+        float scale = components.size() == 2
+            ? std::stof(components[1])
+            : 0.5; // by default
+        auto p0p1 = str_split(components[0], "-");
+        if (p0p1.empty()) {
+            throw std::runtime_error("Layer interval is invalid");
+        }
+        int p0 = std::stoi(p0p1[0]);
+        int p1 = p0p1.size() == 2 ? std::stoi(p0p1[1]) : p0;
+        if (p0 > p1) {
+            throw std::runtime_error("Layer interval is invalid, the start layer number is bigger than the end layer number (p0 > p1)");
+        }
+        for (int i = p0; i <= p1; i++) {
+            struct llama_merge_config conf{i, scale, scale};
+            configs.push_back(conf);
+        }
+        // TODO: maybe check for overlapping intervals?
+    }
+    return configs;
+}
+
 int main(int argc, char ** argv) {
     llama_backend_init();
-    llama_model_params model_params = llama_model_default_params();
-    std::vector<llama_merge_config> configs;
-    for (int i = 0; i < 100; i++) {
-        struct llama_merge_config conf{i, 0.0, 0.0};
-        configs.push_back(conf);
+
+    if (argc < 6) {
+        usage(argv[0]);
     }
+
+    std::string fname_model1(argv[1]);
+    std::string config_model1(argv[2]);
+    std::string fname_model2(argv[3]);
+    std::string config_model2(argv[4]);
+    std::string fname_output(argv[5]);
+
+    // TODO: add try catch
+    auto configs1 = parse_config(config_model1);
+    auto configs2 = parse_config(config_model2);
+    std::vector<llama_merge_config> configs;
+
+    if (configs1.size() != configs2.size()) {
+        fprintf(stderr, "Number of layers in the two configs does not match: config1 has %zu layers and config2 has %zu layers\n", configs1.size(), configs2.size());
+    }
+
+    // merge 2 configs
+    printf("Merge configs:\n");
+    for (auto c1 : configs1) {
+        float scale2 = -1;
+        for (auto c2 : configs2) {
+            if (c2.i_layer == c1.i_layer) {
+                scale2 = c2.scale2;
+            }
+        }
+        if (scale2 < 0) {
+            fprintf(stderr, "Cannot find config for layer %d in CONFIG2\n", c1.i_layer);
+            exit(1);
+        }
+        struct llama_merge_config conf{c1.i_layer, c1.scale1, scale2};
+        configs.push_back(conf);
+
+        printf("  Layer %d: scale1 = %f, scale2 = %f\n", conf.i_layer, conf.scale1, conf.scale2);
+    }
+
     llama_merge_models(
-        "",
-        "",
+        fname_model1.c_str(),
+        fname_model2.c_str(),
         configs.data(),
-        100,
-        "/tmp/dolphin-test-merge.gguf"
+        configs.size(),
+        fname_output.c_str()
     );
-    std::cout << "done\n";
     llama_backend_free();
 }
\ No newline at end of file
diff --git a/llama.cpp b/llama.cpp
index 16aaffdee..cf09416ea 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -11126,13 +11126,6 @@ static int32_t llama_merge_models_internal(
 
     // process layers
     for (int i = 0; i < ml1.n_tensors; ++i) {
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ 1000u*ggml_tensor_overhead(),
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ true,
-        };
-        ggml_context * ctx_ggml = ggml_init(params);
-
         struct ggml_tensor * tensor1 = ml1.get_tensor_meta(i);
         std::vector<no_init<uint8_t>> buf1;
         const std::string name = ggml_get_name(tensor1);
@@ -11142,54 +11135,54 @@ static int32_t llama_merge_models_internal(
         std::vector<no_init<uint8_t>> buf2;
         struct ggml_tensor * tensor2 = ml2.get_tensor_meta(idx_ml2);
 
-        struct ggml_tensor * result;
+        // buffer that will hold the merged tensor data
+        std::vector<no_init<uint8_t>> result(tensor_size);
 
         if (llama_format_tensor_shape(tensor1) != llama_format_tensor_shape(tensor2)) {
             LLAMA_LOG_ERROR("Tensor shapes are different\n");
+            return -1;
         }
 
-        int i_layer;
+        int i_layer = -1;
         if (sscanf(name.c_str(), "blk.%d.", &i_layer) != 1) {
             // non-layer, simply copy
             read_tensor_data(tensor1, ml1, buf1);
-            result = tensor1; // no change
+            memcpy(result.data(), tensor1->data, tensor_size);
         } else {
-            LLAMA_LOG_INFO("i_layer %d\n", i_layer);
+            auto conf = get_config_for_layer(i_layer);
             read_tensor_data(tensor1, ml1, buf1);
             read_tensor_data(tensor2, ml2, buf2);
-            auto conf = get_config_for_layer(i_layer);
-            struct ggml_cgraph * gf = ggml_new_graph(ctx_ggml);
-            struct ggml_tensor * t1 = ggml_dup_tensor(ctx_ggml, tensor1);
-            struct ggml_tensor * t2 = ggml_dup_tensor(ctx_ggml, tensor2);
-            t1 = ggml_cpy(ctx_ggml, tensor1, t1);
-            t2 = ggml_cpy(ctx_ggml, tensor2, t2);
-            t1 = ggml_scale(ctx_ggml, t1, conf->scale1);
-            t2 = ggml_scale(ctx_ggml, t2, conf->scale2);
-            result = ggml_add(ctx_ggml, t1, t2);
-            ggml_build_forward_expand(gf, result);
-            ggml_graph_dump_dot(gf, NULL, "/tmp/___cgraph.txt");
-            ggml_graph_compute_with_ctx(ctx_ggml, gf, 1);
-        }
+            LLAMA_LOG_INFO("Merge layer %d with scale1 = %f, scale2 = %f\n", i_layer, conf->scale1, conf->scale2);
LLAMA_LOG_INFO("Merge layer %d with scale1 = %f, scale2 = %f\n", i_layer, conf->scale1, conf->scale2); - LLAMA_LOG_INFO("i_layer %d ===\n", i_layer); + if (tensor1->type == GGML_TYPE_F16 && tensor2->type == GGML_TYPE_F16) { + for (size_t i = 0; i < result.size() / sizeof(float); i++) { + float * t1 = (float *) tensor1->data; + float * t2 = (float *) tensor2->data; + float * dest = (float *) result.data(); + dest[i] = t1[i] * conf->scale1 + t2[i] * conf->scale2; + } + } else if (tensor1->type == GGML_TYPE_F32 && tensor2->type == GGML_TYPE_F32) { + for (size_t i = 0; i < result.size() / sizeof(double); i++) { + double * t1 = (double *) tensor1->data; + double * t2 = (double *) tensor2->data; + double * dest = (double *) result.data(); + dest[i] = t1[i] * conf->scale1 + t2[i] * conf->scale2; + } + } else { + LLAMA_LOG_ERROR("Only GGML_TYPE_F16 or GGML_TYPE_F32 is supported, current type = %s\n", ggml_type_name(tensor1->type)); + return -1; + } + } LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s\n", i + 1, ml1.n_tensors, - ggml_get_name(result), - llama_format_tensor_shape(result).c_str(), - ggml_type_name(result->type)); - - //std::vector> tensor_data(tensor_size); - //ggml_backend_tensor_get(tensor1, tensor_data.data(), 0, tensor_size); + ggml_get_name(tensor1), + llama_format_tensor_shape(tensor1).c_str(), + ggml_type_name(tensor1->type)); // write tensor data + padding - const char * buf = (const char *) result->data; - printf("%d %d\n", buf[0], buf[1]); - fout.write((const char *) result->data, tensor_size); + fout.write((const char *) result.data(), tensor_size); zeros(fout, GGML_PAD(tensor_size, GGUF_DEFAULT_ALIGNMENT) - tensor_size); - ggml_free(ctx_ggml); - - if (i > 3) break; } // go back to beginning of file and write the updated meta data diff --git a/merge b/merge deleted file mode 100755 index 3ec40a933..000000000 Binary files a/merge and /dev/null differ