diff --git a/Makefile b/Makefile
index f03faf6ed..cb01ac902 100644
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,9 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
 	main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
-	simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
-	speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o
+	simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
+	speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o \
+	merge
 
 # Binaries only useful for tests
 TEST_TARGETS = \
@@ -699,6 +700,10 @@ quantize: examples/quantize/quantize.cpp build-info.o ggml.
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+merge: examples/merge/merge.cpp build-info.o ggml.o llama.o $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
diff --git a/examples/merge/CMakeLists.txt b/examples/merge/CMakeLists.txt
new file mode 100644
index 000000000..93df1a643
--- /dev/null
+++ b/examples/merge/CMakeLists.txt
@@ -0,0 +1,6 @@
+set(TARGET merge)
+add_executable(${TARGET} merge.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
+target_include_directories(${TARGET} PRIVATE ../../common)
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/merge/merge.cpp b/examples/merge/merge.cpp
index 4091eb918..b1ecc123c 100644
--- a/examples/merge/merge.cpp
+++ b/examples/merge/merge.cpp
@@ -11,93 +11,6 @@
 #include
 
-int32_t merge(
-        const std::string & fname_inp1,
-        const std::vector<float> scale1,
-        const std::string & fname_inp2,
-        const std::vector<float> scale2,
-        const int n_layers,
-        const std::string & fname_out) {
-#if defined(__linux__) || defined(_WIN32)
-    constexpr bool use_mmap = true;
-#else
-    constexpr bool use_mmap = false;
-#endif
-
-    llama_model_loader ml(fname_inp1, use_mmap, NULL);
-    ml.init_mapping(false); // no prefetching?
-
-    llama_model model;
-    llm_load_arch(ml, model);
-    llm_load_hparams(ml, model);
-
-    struct gguf_context * ctx_out = gguf_init_empty();
-    // copy the KV pairs from the input file
-    gguf_set_kv(ctx_out, ml.ctx_gguf);
-
-    // populate the original tensors so we get an initial meta data
-    for (int i = 0; i < ml.n_tensors; ++i) {
-        struct ggml_tensor * meta = ml.get_tensor_meta(i);
-        gguf_add_tensor(ctx_out, meta);
-    }
-
-    std::ofstream fout(fname_out, std::ios::binary);
-    fout.exceptions(std::ofstream::failbit); // fail fast on write errors
-
-    const size_t meta_size = gguf_get_meta_size(ctx_out);
-
-    LLAMA_LOG_INFO("%s: meta size = %zu bytes\n", __func__, meta_size);
-
-    // placeholder for the meta data
-    ::zeros(fout, meta_size);
-
-    std::vector<no_init<uint8_t>> read_data;
-
-    for (int i = 0; i < ml.n_tensors; ++i) {
-        struct ggml_tensor * tensor = ml.get_tensor_meta(i);
-
-        const std::string name = ggml_get_name(tensor);
-
-        if (!ml.use_mmap) {
-            if (read_data.size() < ggml_nbytes(tensor)) {
-                read_data.resize(ggml_nbytes(tensor));
-            }
-            tensor->data = read_data.data();
-        }
-        ml.load_data_for(tensor);
-
-        size_t new_size = ggml_nbytes(tensor);
-        void * new_data = tensor->data;
-
-        LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
-                ++idx, ml.n_tensors,
-                ggml_get_name(tensor),
-                llama_format_tensor_shape(tensor).c_str(),
-                ggml_type_name(tensor->type));
-
-        // update the gguf meta data as we go
-        gguf_set_tensor_type(ctx_out, name.c_str(), new_type);
-        gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size);
-
-        // write tensor data + padding
-        fout.write((const char *) new_data, new_size);
-        zeros(fout, GGML_PAD(new_size, GGUF_DEFAULT_ALIGNMENT) - new_size);
-    }
-
-    // go back to beginning of file and write the updated meta data
-    {
-        fout.seekp(0);
-        std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
-        gguf_get_meta_data(ctx_out, data.data());
-        fout.write((const char *) data.data(), data.size());
-    }
-
-    fout.close();
-
-    gguf_free(ctx_out);
-}
-
-
 // usage:
 //  ./merge ./path/model_1 LAYERS_1 ./path/model_2 LAYERS_2
 //
@@ -113,6 +26,18 @@ static void usage(const char * executable) {
 int main(int argc, char ** argv) {
     llama_backend_init();
     llama_model_params model_params = llama_model_default_params();
-    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
+    std::vector<llama_merge_config> configs;
+    for (int i = 0; i < 100; i++) {
+        struct llama_merge_config conf{i, 0.0, 0.0};
+        configs.push_back(conf);
+    }
+    llama_merge_models(
+        "",
+        "",
+        configs.data(),
+        100,
+        "/tmp/dolphin-test-merge.gguf"
+    );
+    std::cout << "done\n";
     llama_backend_free();
 }
\ No newline at end of file
diff --git a/llama.cpp b/llama.cpp
index 37477e6ef..16aaffdee 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -11043,6 +11043,188 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     }
 }
 
+static int32_t llama_merge_models_internal(
+        const std::string & fname_inp1,
+        const std::string & fname_inp2,
+        const std::vector<const llama_merge_config *> & configs,
+        const std::string & fname_out)
+{
+#if defined(__linux__) || defined(_WIN32)
+    constexpr bool use_mmap = true;
+#else
+    constexpr bool use_mmap = false;
+#endif
+
+    llama_model model1;
+    llama_model model2;
+    llama_model_loader ml1(fname_inp1, use_mmap, NULL);
+    llama_model_loader ml2(fname_inp2, use_mmap, NULL);
+
+    auto load_model = [](llama_model_loader & ml, llama_model & model) {
+        ml.init_mapping(false);
+        llm_load_arch(ml, model);
+        llm_load_hparams(ml, model);
+    };
+    load_model(ml1, model1);
+    load_model(ml2, model2);
+
+    if (model1.hparams != model2.hparams) {
+        LLAMA_LOG_ERROR("hparams of two models are different, aborting...");
+        return -1;
+    }
+
+    struct gguf_context * ctx_out = gguf_init_empty();
+    std::ofstream fout(fname_out, std::ios::binary);
+    fout.exceptions(std::ofstream::failbit); // fail fast on write errors
+
+    {
+        // copy the KV pairs from the input file
+        gguf_set_kv(ctx_out, ml1.ctx_gguf);
+
+        // populate the original tensors so we get an initial meta data
+        for (int i = 0; i < ml1.n_tensors; ++i) {
+            struct ggml_tensor * meta = ml1.get_tensor_meta(i);
+            gguf_add_tensor(ctx_out, meta);
+        }
+
+        const size_t meta_size = gguf_get_meta_size(ctx_out);
+
+        LLAMA_LOG_INFO("%s: meta size = %zu bytes\n", __func__, meta_size);
+
+        // placeholder for the meta data
+        ::zeros(fout, meta_size);
+    }
+
+    auto read_tensor_data = [&](struct ggml_tensor * tensor, llama_model_loader & ml, std::vector<no_init<uint8_t>> & buf) -> size_t {
+        if (!ml.use_mmap) {
+            if (buf.size() < ggml_nbytes(tensor)) {
+                buf.resize(ggml_nbytes(tensor));
+            }
+            tensor->data = buf.data();
+        }
+        ml.load_data_for(tensor);
+        return ggml_nbytes(tensor);
+    };
+
+    // map tensor name to its index for ml2
+    std::unordered_map<std::string, int> ml2_name_to_idx;
+    for (int i = 0; i < ml2.n_tensors; ++i) {
+        struct ggml_tensor * tensor = ml2.get_tensor_meta(i);
+        const std::string name = ggml_get_name(tensor);
+        ml2_name_to_idx[name] = i;
+    }
+
+    auto get_config_for_layer = [&](int i_layer) -> const struct llama_merge_config * {
+        for (auto & conf : configs) {
+            if (conf->i_layer == i_layer) {
+                return conf;
+            }
+        }
+        LLAMA_LOG_ERROR("Cannot find llama_merge_config for i_layer=%d\n", i_layer);
+        return nullptr;
+    };
+
+    // process layers
+    for (int i = 0; i < ml1.n_tensors; ++i) {
+        struct ggml_init_params params = {
+            /*.mem_size   =*/ 1000u*ggml_tensor_overhead(),
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc   =*/ true,
+        };
+        ggml_context * ctx_ggml = ggml_init(params);
+
+        struct ggml_tensor * tensor1 = ml1.get_tensor_meta(i);
+        std::vector<no_init<uint8_t>> buf1;
+        const std::string name = ggml_get_name(tensor1);
+        const size_t tensor_size = ggml_nbytes(tensor1);
+
+        int idx_ml2 = ml2_name_to_idx[name];
+        std::vector<no_init<uint8_t>> buf2;
+        struct ggml_tensor * tensor2 = ml2.get_tensor_meta(idx_ml2);
+
+        struct ggml_tensor * result;
+
+        if (llama_format_tensor_shape(tensor1) != llama_format_tensor_shape(tensor2)) {
+            LLAMA_LOG_ERROR("Tensor shapes are different\n");
+        }
+
+        int i_layer = -1;
+        if (sscanf(name.c_str(), "blk.%d.", &i_layer) != 1) {
+            // non-layer, simply copy
+            read_tensor_data(tensor1, ml1, buf1);
+            result = tensor1; // no change
+        } else {
+            LLAMA_LOG_INFO("i_layer %d\n", i_layer);
+            read_tensor_data(tensor1, ml1, buf1);
+            read_tensor_data(tensor2, ml2, buf2);
+            auto conf = get_config_for_layer(i_layer);
+            struct ggml_cgraph * gf = ggml_new_graph(ctx_ggml);
+            struct ggml_tensor * t1 = ggml_dup_tensor(ctx_ggml, tensor1);
+            struct ggml_tensor * t2 = ggml_dup_tensor(ctx_ggml, tensor2);
+            t1 = ggml_cpy(ctx_ggml, tensor1, t1);
+            t2 = ggml_cpy(ctx_ggml, tensor2, t2);
+            t1 = ggml_scale(ctx_ggml, t1, conf->scale1);
+            t2 = ggml_scale(ctx_ggml, t2, conf->scale2);
+            result = ggml_add(ctx_ggml, t1, t2);
+            ggml_build_forward_expand(gf, result);
+            ggml_graph_dump_dot(gf, NULL, "/tmp/___cgraph.txt");
+            ggml_graph_compute_with_ctx(ctx_ggml, gf, 1);
+        }
+
+        LLAMA_LOG_INFO("i_layer %d ===\n", i_layer);
+
+        LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s\n",
+                i + 1, ml1.n_tensors,
+                ggml_get_name(result),
+                llama_format_tensor_shape(result).c_str(),
+                ggml_type_name(result->type));
+
+        //std::vector<no_init<uint8_t>> tensor_data(tensor_size);
+        //ggml_backend_tensor_get(tensor1, tensor_data.data(), 0, tensor_size);
+
+        // write tensor data + padding
+        const char * buf = (const char *) result->data;
+        printf("%d %d\n", buf[0], buf[1]);
+        fout.write((const char *) result->data, tensor_size);
+        zeros(fout, GGML_PAD(tensor_size, GGUF_DEFAULT_ALIGNMENT) - tensor_size);
+        ggml_free(ctx_ggml);
+
+        if (i > 3) break;
+    }
+
+    // go back to beginning of file and write the updated meta data
+    {
+        fout.seekp(0);
+        std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
+        gguf_get_meta_data(ctx_out, data.data());
+        fout.write((const char *) data.data(), data.size());
+    }
+
+    fout.close();
+
+    gguf_free(ctx_out);
+    return 0;
+}
+
+int32_t llama_merge_models(
+        const char * fname_inp1,
+        const char * fname_inp2,
+        const struct llama_merge_config * configs,
+        const int n_configs,
+        const char * fname_out)
+{
+    std::vector<const llama_merge_config *> v_configs(n_configs);
+    for (int i = 0; i < n_configs; i++) {
+        v_configs[i] = &configs[i];
+    }
+    return llama_merge_models_internal(
+        fname_inp1,
+        fname_inp2,
+        v_configs,
+        fname_out
+    );
+}
+
 static int llama_apply_lora_from_file_internal(
     const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads
 ) {
diff --git a/llama.h b/llama.h
index 84f196b3b..3ab54a19a 100644
--- a/llama.h
+++ b/llama.h
@@ -312,6 +312,13 @@ extern "C" {
         const char * content;
     } llama_chat_message;
 
+    struct llama_merge_config {
+        const int i_layer;
+        const float scale1;
+        const float scale2;
+        // TODO add support for embedding and output layers
+    };
+
     // Helpers for getting default parameters
     LLAMA_API struct llama_model_params llama_model_default_params(void);
     LLAMA_API struct llama_context_params llama_context_default_params(void);
@@ -401,6 +408,13 @@ extern "C" {
             const char * fname_out,
             const llama_model_quantize_params * params);
 
+    LLAMA_API int32_t llama_merge_models(
+            const char * fname_inp1,
+            const char * fname_inp2,
+            const struct llama_merge_config * configs,
+            const int n_configs,
+            const char * fname_out);
+
     // Apply a LoRA adapter to a loaded model
     // path_base_model is the path to a higher quality model to use as a base for
     // the layers modified by the adapter. Can be NULL to use the current loaded model.
diff --git a/merge b/merge
new file mode 100755
index 000000000..3ec40a933
Binary files /dev/null and b/merge differ
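For reference, a minimal usage sketch (not part of the patch) of the llama_merge_models() API declared in llama.h above. The model paths, output path, and layer count are placeholder assumptions; the scales follow the scale1 * model1 + scale2 * model2 blend computed in llama_merge_models_internal().

    // merge-example.cpp -- hypothetical caller of the API added by this patch
    #include <vector>
    #include "llama.h"

    int main() {
        llama_backend_init();

        const int n_layers = 32;                 // placeholder: must match both input models
        std::vector<llama_merge_config> configs;
        for (int i = 0; i < n_layers; i++) {
            // layer i of the output = 0.6 * model1 layer i + 0.4 * model2 layer i
            configs.push_back(llama_merge_config{i, 0.6f, 0.4f});
        }

        const int32_t res = llama_merge_models(
            "model-a.gguf",                      // placeholder input path
            "model-b.gguf",                      // placeholder input path
            configs.data(),
            (int) configs.size(),
            "merged.gguf");                      // placeholder output path

        llama_backend_free();
        return res == 0 ? 0 : 1;
    }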