From 65730438aaa431012f7351807826a5924fa9a60d Mon Sep 17 00:00:00 2001
From: ngxson
Date: Sun, 3 Mar 2024 16:06:48 +0100
Subject: [PATCH] wip: new format

---
 Makefile                          |   2 +-
 examples/merge/CMakeLists.txt     |   2 +-
 examples/merge/config.example.csv |  32 ----
 examples/merge/config.example.txt |  38 +++++
 examples/merge/merge.cpp          | 121 +++++---------
 examples/merge/parser.hpp         | 269 ++++++++++++++++++++++++++++++
 llama.cpp                         |  15 +-
 llama.h                           |  34 +++-
 8 files changed, 389 insertions(+), 124 deletions(-)
 delete mode 100644 examples/merge/config.example.csv
 create mode 100644 examples/merge/config.example.txt
 create mode 100644 examples/merge/parser.hpp

diff --git a/Makefile b/Makefile
index c01cc8eb7..7d1d5c83d 100644
--- a/Makefile
+++ b/Makefile
@@ -705,7 +705,7 @@ quantize: examples/quantize/quantize.cpp build-info.o ggml.
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-merge: examples/merge/merge.cpp build-info.o ggml.o llama.o $(OBJS)
+merge: examples/merge/merge.cpp examples/merge/parser.hpp build-info.o ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

diff --git a/examples/merge/CMakeLists.txt b/examples/merge/CMakeLists.txt
index 93df1a643..787ea86c3 100644
--- a/examples/merge/CMakeLists.txt
+++ b/examples/merge/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET merge)
-add_executable(${TARGET} merge.cpp)
+add_executable(${TARGET} merge.cpp parser.hpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)

diff --git a/examples/merge/config.example.csv b/examples/merge/config.example.csv
deleted file mode 100644
index 23681bf08..000000000
--- a/examples/merge/config.example.csv
+++ /dev/null
@@ -1,32 +0,0 @@
-0,0.1,0,0.9
-1,0.1,1,0.9
-2,0.1,2,0.9
-3,0.1,3,0.9
-4,0.1,4,0.9
-5,0.1,5,0.9
-6,0.1,6,0.9
-7,0.1,7,0.9
-8,0.1,8,0.9
-9,0.1,9,0.9
-10,0.5,10,0.5
-11,0.5,11,0.5
-12,0.5,12,0.5
-13,0.5,13,0.5
-14,0.5,14,0.5
-15,0.5,15,0.5
-16,0.5,16,0.5
-17,0.5,17,0.5
-18,0.5,18,0.5
-19,0.5,19,0.5
-20,0.5,20,0.5
-21,0.5,21,0.5
-22,0.5,22,0.5
-23,0.5,23,0.5
-24,0.5,24,0.5
-25,0.5,25,0.5
-26,0.5,26,0.5
-27,0.5,27,0.5
-28,0.5,28,0.5
-29,0.5,29,0.5
-20,0.5,20,0.5
-31,0.5,31,0.5
\ No newline at end of file

diff --git a/examples/merge/config.example.txt b/examples/merge/config.example.txt
new file mode 100644
index 000000000..631fd4c50
--- /dev/null
+++ b/examples/merge/config.example.txt
@@ -0,0 +1,38 @@
+# GGUF merge instructions
+#
+# Lines starting with "#" are comments
+# Empty lines are ignored
+# The "output layer" instruction adds a new layer to the output model
+# A merge instruction has the format: target (space) verb (space) parameters
+# Supported verbs:
+# - linear: linear combination, parameters: source_layer,source_layer,scale,scale
+# - slerp:  spherical linear interpolation, parameters: source_layer,source_layer,t
+# - repeat: repeat a layer already defined in the output model (to reduce file size)
+#
+# For example:
+#
+# This is the first layer of the output model:
+# For all tensors, we want slerp(model[0].layer[0], model[1].layer[0], 0.1)
+# Except for the "attn_output" tensor, for which we want t=0.5 instead of t=0.1
+
+output layer 0
+all slerp 0,0,0.1
+attn_output slerp 0,0,0.5
+
+# For the next layer, we want: model[0].layer[1]*0.6 + model[1].layer[1]*0.4
+# Except for the "attn_output" tensor, for which we use slerp with t=0.9
+
+output layer 1
+all linear 1,1,0.6,0.4
+attn_output slerp 1,1,0.9
+
+output layer 2
+all linear 2,2,1.0,0.0
+
+# repeat layers that were defined earlier in this file
+
+output layer 3
+all repeat 0
+
+output layer 4
+all repeat 1

diff --git a/examples/merge/merge.cpp b/examples/merge/merge.cpp
index d178d82bf..06b8de486 100644
--- a/examples/merge/merge.cpp
+++ b/examples/merge/merge.cpp
@@ -1,5 +1,6 @@
 #include "common.h"
 #include "llama.h"
+#include "parser.hpp"
 
 #include <iostream>
 #include <stdexcept>
@@ -10,10 +11,14 @@
 #include <string>
 #include <vector>
 
+static const size_t n_models = 2; // hard-limited to 2 input models for now
+
 struct merge_params {
-    std::string config_path = "merge.csv";
+    std::string config_path = "config.txt";
     std::vector<std::string> model_paths;
     std::string output_path = "ggml-merged-f16.gguf";
+    bool only_list_tensors_name = false;
+    bool dry_run = false;
 };
 
 [[noreturn]]
 static void usage(const char * executable, int exit_code) {
     struct merge_params defaults;
     printf("usage: %s -c CONFIG_FILE -o OUTPUT_FILE -m MODEL_PATH -m MODEL_PATH ...\n\n", executable);
     printf("\n");
-    printf("Merging 2 models and change layers configuration.\n");
-    printf("Merge config format is CSV, without header, one line represents one layer of the output model, columns in the order below:\n");
-    printf("- Model A layer\n");
-    printf("- Model A scale\n");
-    printf("- Model B layer\n");
-    printf("- Model B scale\n");
-    printf("- ...\n");
-    printf("\n");
-    printf("For example:\n");
-    printf("0,1.0,0,0.0 meaning: output layer 0 = A[0]*1.0 + B[0]*0.0\n");
-    printf("0,1.0,0,0.0 meaning: output layer 1 = A[0]*1.0 + B[0]*0.0\n");
-    printf("1,0.0,2,0.0 meaning: output layer 2 = A[1]*0.0 + B[2]*0.0\n");
-    printf("2,0.5,1,0.5 meaning: output layer 3 = A[2]*0.5 + B[1]*0.5\n");
+    printf("Merge multiple models into one, inspired by mergekit.\n");
+    printf("For more details, see the \"config.example.txt\" file.\n");
     printf("\n");
     printf("NOTE:\n");
+    printf("- Only merging 2 models is supported for now.\n");
     printf("- The embedding and output layers of the first model will be used.\n");
     printf("- Currently, we accept both quantized and non-quantized models as input. The output model will be re-quantized into the same format of the first model.\n");
     printf("\n");
-    printf("  -c, --config CONFIG_FILE   Path to config file, in CSV format (default: %s)\n", defaults.config_path.c_str());
+    printf("  -c, --config CONFIG_FILE   Path to the merge config file (default: %s)\n", defaults.config_path.c_str());
     printf("  -m, --model MODEL_PATH     Path to model. This option can be repeated multiple times and must be specified in the right order.\n");
     printf("  -o, --output OUTPUT_FILE   Path to the output model (default: %s)\n", defaults.output_path.c_str());
+    printf("  --dry-run                  Only parse the config, print the parsed instructions and exit; useful for debugging\n");
+    printf("  --print-list-tensor        Only print the list of tensors of the input model and exit; useful for debugging (exactly one model must be given)\n");
     printf("\n");
-    printf("Example: ./merge -c config.csv -o output.gguf -m model_a.gguf -m model_b.gguf\n");
+    printf("Example: ./merge -c config.txt -o output.gguf -m model_a.gguf -m model_b.gguf\n");
     exit(exit_code);
 }
 
-inline std::vector<std::string> str_split(std::string str, const std::string & delimiter) {
-    size_t pos = 0;
-    std::string token;
-    std::vector<std::string> output;
-    while ((pos = str.find(delimiter)) != std::string::npos) {
-        token = str.substr(0, pos);
-        output.push_back(token);
-        str.erase(0, pos + delimiter.length());
-    }
-    output.push_back(str); // the rest
-    return output;
-}
-
-static std::vector<llama_merge_layer> parse_config(std::string & config_path, size_t n_models, std::vector<int> & buf_srcs, std::vector<float> & buf_scales) {
-    // read file
-    std::ifstream file(config_path);
-    if (!file.is_open()) {
-        throw std::runtime_error("Unable to open file merge config file");
-    }
-    std::ostringstream content;
-    content << file.rdbuf(); // Read the entire file into the stringstream
-    file.close();
-
-    // allocate memory
-    auto lines = str_split(content.str(), "\n");
-    buf_srcs.resize(lines.size()*n_models);
-    buf_scales.resize(lines.size()*n_models);
-
-    // process line by line, one line is one layer
-    std::cout << "Parsing configurations:\n";
-    std::vector<llama_merge_layer> layers;
-    for (size_t i_layer = 0; i_layer < lines.size(); i_layer++) {
-        std::cout << "- Layer " << i_layer << " =" << std::flush;
-        auto columns = str_split(lines[i_layer], ",");
-        if (columns.size() != n_models*2) {
-            std::stringstream ss;
-            ss << "error: line " << i_layer+1 << " is malformed. Expect to have exactly " << n_models*2 << " columns, but got " << columns.size() << " columns";
-            throw std::runtime_error(ss.str());
-        }
-        int * srcs = buf_srcs.data() + i_layer*n_models;
-        float * scales = buf_scales.data() + i_layer*n_models;
-        for (size_t i_model = 0; i_model < n_models; i_model++) {
-            srcs[i_model] = std::stoi(columns[i_model*2]);
-            scales[i_model] = std::stof(columns[i_model*2 + 1]);
-            // debug message
-            std::cout << " + model[" << i_model << "].layer[" << srcs[i_model] << "]*" << scales[i_model] << std::flush;
-        }
-        layers.push_back(llama_merge_layer{srcs, scales, -1});
-        std::cout << "\n";
-    }
-    return layers;
-}
-
 int main(int argc, char ** argv) {
     bool invalid_param = false;
     struct merge_params params;
@@ -129,37 +73,54 @@ int main(int argc, char ** argv) {
                 break;
             }
             params.output_path = argv[i];
+        } else if (arg == "--print-list-tensor") {
+            params.only_list_tensors_name = true;
+        } else if (arg == "--dry-run") {
+            params.dry_run = true;
         }
     }
 
     try {
         if (invalid_param) {
-            throw std::invalid_argument("error: invalid parameter for argument: " + arg);
-        } else if (params.model_paths.size() < 2) {
+            std::cerr << "error: invalid parameter for argument: " << arg << "\n\n";
+            usage(argv[0], 1);
+        } else if (!params.only_list_tensors_name && params.model_paths.size() < 2) {
             throw std::invalid_argument("error: require at least 2 models");
         }
 
-        // buffers to hold allocated data
-        std::vector<int>   buf_srcs;
-        std::vector<float> buf_scales;
+        if (params.only_list_tensors_name) {
+            if (params.model_paths.size() != 1) {
+                throw std::invalid_argument("error: can only list tensors of a single model");
+            }
+            print_model_tensors_name(params.model_paths[0]);
+            return 0; // exit now
+        }
+
+        size_t n_layers = 0;
+        auto instructions = parse_config(params.config_path, params.model_paths[0], n_layers);
+
+        if (params.dry_run) {
+            return 0;
+        }
 
-        auto layers = parse_config(params.config_path, params.model_paths.size(), buf_srcs, buf_scales);
-        std::vector<const char *> p_model_paths;
-        for (auto & m : params.model_paths) {
-            p_model_paths.push_back(m.data());
-        }
-        const struct llama_merge_config config{
-            p_model_paths.data(),
-            p_model_paths.size(),
-            layers.data(),
-            layers.size(),
-            params.output_path.data(),
+        struct llama_merge_config config{
+            {
+                params.model_paths[0].c_str(),
+                params.model_paths[1].c_str(),
+            },
+            instructions.data(),
+            instructions.size(),
+            n_layers,
+            params.output_path.c_str(),
         };
         llama_merge_models(&config);
     } catch (const std::exception & ex) {
         std::cerr << ex.what() << "\n\n";
-        usage(argv[0], 1);
     }
 
     return 0;
 }

diff --git a/examples/merge/parser.hpp b/examples/merge/parser.hpp
new file mode 100644
index 000000000..356c20250
--- /dev/null
+++ b/examples/merge/parser.hpp
@@ -0,0 +1,269 @@
+#include "common.h"
+#include "llama.h"
+
+#include <cctype>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <set>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+// trim whitespace from the beginning and end of a string
+static std::string str_trim(const std::string & str) {
+    size_t start = 0;
+    size_t end = str.size();
+    while (start < end && isspace(str[start])) {
+        start += 1;
+    }
+    while (end > start && isspace(str[end - 1])) {
+        end -= 1;
+    }
+    return str.substr(start, end - start);
+}
+
+inline std::vector<std::string> str_split(std::string str, const std::string & delimiter) {
+    size_t pos = 0;
+    std::string token;
+    std::vector<std::string> output;
+    while ((pos = str.find(delimiter)) != std::string::npos) {
+        token = str.substr(0, pos);
+        output.push_back(token);
+        str.erase(0, pos + delimiter.length());
+    }
+    output.push_back(str); // the rest
+    return output;
+}
+
+/////////////////////////////////
+
+// get the list of tensor names of the input model
+static std::vector<std::string> get_list_tensors_name(std::string & model_path) {
+    llama_model_params model_params = llama_model_default_params();
+    llama_model * model = llama_load_model_from_file(model_path.c_str(), model_params);
+    if (model == nullptr) {
+        throw std::runtime_error("Unable to load model: " + model_path);
+    }
+    size_t n_tensors = llama_get_all_tensors_name(model, nullptr, 0);
+    std::vector<const char *> list(n_tensors, nullptr);
+    llama_get_all_tensors_name(model, list.data(), list.size());
+    // copy the result
+    std::vector<std::string> results;
+    for (auto & name : list) {
+        results.push_back(std::string(name));
+    }
+    llama_free_model(model);
+    return results;
+}
+
+static void print_model_tensors_name(std::string & model_path) {
+    auto tensors = get_list_tensors_name(model_path);
+    std::cout << "\n\n===================\n";
+    std::cout << "Total number of tensors: " << tensors.size() << "\n";
+    for (size_t i = 0; i < tensors.size(); i++) {
+        char buf[128];
+        snprintf(buf, sizeof(buf), "%4zu: %s", i, tensors[i].c_str());
+        std::cout << buf << "\n";
+    }
+}
+
+/////////////////////////////////
+
+// get the layer index from a tensor name, for example "blk.x.attn_norm.weight"
+// returns -1 if the tensor does not belong to a layer
+static int get_i_layer(std::string tensor_name) {
+    int i_layer = -1;
+    return sscanf(tensor_name.c_str(), "blk.%d.", &i_layer) == 1 ? i_layer : -1;
+}
+
+static void print_inst(struct llama_merge_inst inst) {
+    std::cout << "Output: " << inst.name << "\n";
+    switch (inst.method) {
+        case LLAMA_MERGE_LINEAR:
+            std::cout << "  Linear\n";
+            std::cout << "  Model A: " << inst.scales[0] << " * " << inst.srcs[0] << "\n";
+            std::cout << "  Model B: " << inst.scales[1] << " * " << inst.srcs[1] << "\n";
+            break;
+        case LLAMA_MERGE_SLERP:
+            std::cout << "  SLERP\n";
+            std::cout << "  t=" << inst.t << "\n";
+            std::cout << "  Model A: " << inst.srcs[0] << "\n";
+            std::cout << "  Model B: " << inst.srcs[1] << "\n";
+            break;
+        case LLAMA_MERGE_COPY:
+            std::cout << "  Copy from model A: " << inst.srcs[0] << "\n";
+            break;
+        case LLAMA_MERGE_REPEAT:
+            std::cout << "  Repeat from output model: " << inst.srcs[0] << "\n";
+            break;
+        default:
+            break;
+    }
+}
+
+static std::vector<llama_merge_inst> parse_config(std::string & config_path, std::string & model_path, size_t & n_layers) {
+    std::vector<llama_merge_inst> instructions;
+
+    // read file
+    std::ifstream file(config_path);
+    if (!file.is_open()) {
+        throw std::runtime_error("Unable to open merge config file");
+    }
+    std::ostringstream content;
+    content << file.rdbuf(); // read the entire file into the stringstream
+    auto lines = str_split(content.str(), "\n");
+    file.close();
+
+    // get the list of input tensors
+    auto inp_names = get_list_tensors_name(model_path);
+    std::set<std::string> units; // names of units, for example "attn_output"
+    for (auto & name : inp_names) {
+        int il = get_i_layer(name);
+        if (il < 0) {
+            // non-layer tensor, only copy
+            struct llama_merge_inst ins;
+            ins.method = LLAMA_MERGE_COPY;
+            strcpy(ins.name, name.c_str());
+            strcpy(ins.srcs[0], name.c_str());
+            instructions.push_back(ins);
+        } else {
+            // the tensor belongs to a layer
+            auto parts = str_split(name, ".");
+            units.insert(parts[2]);
+        }
+    }
+
+    std::cout << "List of units:\n";
+    for (auto & u : units) std::cout << u << "\n";
+    std::cout << "\n";
+
+    // process line by line, one line is one instruction
+    std::unordered_map<std::string, llama_merge_inst> layer; // map unit name to its instruction
+    bool is_layer_empty = true;
+    int i_layer = -1;
+    auto get_tensor_name = [&](int layer, std::string unit) {
+        return "blk." + std::to_string(layer) + "." + unit + ".weight";
+    };
+ unit + ".weight"; + }; + auto push_output_layer = [&]() { + if (!is_layer_empty) { + for (auto & it : layer) { + instructions.push_back(it.second); + } + } + layer.clear(); + is_layer_empty = true; + }; + auto new_output_layer = [&]() { + layer.clear(); + for (auto & u : units) { + struct llama_merge_inst ins; + strcpy(ins.name, get_tensor_name(i_layer, u).c_str()); + layer[u] = ins; + } + }; + + auto raise_err = [&](size_t i_line, std::string message) { + std::stringstream ss; + ss << "Parse error: (line " << i_line + 1 << ") " << message; + throw std::runtime_error(ss.str()); + }; + + for (size_t i_line = 0 ; i_line < lines.size(); i_line++) { + auto line = str_trim(lines[i_line]); + if (line.empty() || line.c_str()[0] == '#') { + continue; // skip empty line or comment + } + + auto parts = str_split(line, " "); + if (parts.size() != 3) { + raise_err(i_line, "does not follow format: \"target (space) verb (space) arguments\""); + } + + auto target = parts[0]; + auto verb = parts[1]; + auto params = str_split(parts[2], ","); + + if (target == "output" && verb == "layer") { + int il_curr = std::stoi(params[0]); + if (i_layer + 1 != il_curr) { + raise_err(i_line, "new layer number must be (last layer number + 1)"); + } + push_output_layer(); + i_layer = il_curr; + new_output_layer(); + continue; + } + + auto linear = [&](struct llama_merge_inst & ins, std::string unit) { + if (params.size() != 4) { + raise_err(i_line, "verb \"linear\" requires exactly 4 params"); + } + ins.method = LLAMA_MERGE_LINEAR; + int src0 = std::stoi(params[0]); + int src1 = std::stoi(params[1]); + strcpy(ins.srcs[0], get_tensor_name(src0, unit).c_str()); + strcpy(ins.srcs[1], get_tensor_name(src1, unit).c_str()); + ins.scales[0] = std::stof(params[2]); + ins.scales[1] = std::stof(params[3]); + is_layer_empty = false; + }; + + auto slerp = [&](struct llama_merge_inst & ins, std::string unit) { + if (params.size() != 3) { + raise_err(i_line, "verb \"slerp\" requires exactly 3 params"); + } + ins.method = LLAMA_MERGE_SLERP; + int src0 = std::stoi(params[0]); + int src1 = std::stoi(params[1]); + strcpy(ins.srcs[0], get_tensor_name(src0, unit).c_str()); + strcpy(ins.srcs[1], get_tensor_name(src1, unit).c_str()); + ins.t = std::stof(params[2]); + is_layer_empty = false; + }; + + auto repeat = [&](struct llama_merge_inst & ins, std::string unit) { + if (params.size() != 1) { + raise_err(i_line, "verb \"repeat\" requires exactly 1 param"); + } + ins.method = LLAMA_MERGE_REPEAT; + int src0 = std::stoi(params[0]); + strcpy(ins.srcs[0], get_tensor_name(src0, unit).c_str()); + is_layer_empty = false; + }; + + auto apply_verb = [&](struct llama_merge_inst & ins, std::string unit) { + if (verb == "linear") { + linear(ins, unit); + } else if (verb == "slerp") { + slerp(ins, unit); + } else if (verb == "repeat") { + repeat(ins, unit); + } else { + raise_err(i_line, "invalid verb: " + verb); + } + }; + + if (target == "all") { + for (auto & u : units) { + apply_verb(layer[u], u); + } + } else { + if (units.find(target) == units.end()) { + raise_err(i_line, "unit " + target + " does not exist"); + } + apply_verb(layer[target], target); + } + } + push_output_layer(); + n_layers = i_layer + 1; + + // print all parsed instructions + std::cout << "Parsed instructions:\n"; + for (auto & ins : instructions) { + print_inst(ins); + } + std::cout << "---\n" << "Total output layers: " << n_layers << "\n"; + + return instructions; +} diff --git a/llama.cpp b/llama.cpp index de058bb6b..e77a13806 100644 --- a/llama.cpp +++ b/llama.cpp @@ 

diff --git a/llama.cpp b/llama.cpp
index de058bb6b..e77a13806 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -11358,7 +11358,7 @@ int32_t llama_merge_models(const struct llama_merge_config * config) {
 #else
     constexpr bool use_mmap = false;
 #endif
-
+/*
     // std::move doesn't work with llama_model and llama_model_loader, why?
     std::vector<std::unique_ptr<llama_model>> models;
     std::vector<std::unique_ptr<llama_model_loader>> mls;
@@ -11610,6 +11610,7 @@ int32_t llama_merge_models(const struct llama_merge_config * config) {
     }
 
     clean_up();
+*/
     return 0;
 }
 
@@ -12479,6 +12480,18 @@ uint64_t llama_model_n_params(const struct llama_model * model) {
     return nparams;
 }
 
+int32_t llama_get_all_tensors_name(struct llama_model * model, const char ** name_arr, size_t arr_size) {
+    size_t i = 0;
+    for (const auto & it : model->tensors_by_name) {
+        if (i == arr_size) {
+            break;
+        }
+        name_arr[i] = it.first.c_str();
+        i++;
+    }
+    return model->tensors_by_name.size();
+}
+
 struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
     auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(),
             [name](const std::pair<std::string, struct ggml_tensor *> & it) {

diff --git a/llama.h b/llama.h
index 068de4192..a6fb7b8fa 100644
--- a/llama.h
+++ b/llama.h
@@ -327,18 +327,30 @@ extern "C" {
         const char * content;
     } llama_chat_message;
 
-    // used to merge models
-    struct llama_merge_layer {
-        const int * srcs;        // contains n_models elements, if nullptr then we reuse other layer
-        const float * scales;    // contains n_models elements, if nullptr then we reuse other layer
-        const int i_layer_reuse; // if != -1, then reuse earlier layer in the model to reduce output size
+    enum llama_merge_method {
+        LLAMA_MERGE_LINEAR,
+        LLAMA_MERGE_SLERP,
+        LLAMA_MERGE_REPEAT,
+        LLAMA_MERGE_COPY,
     };
 
+    // instruction for merging tensors (model merge)
+    struct llama_merge_inst {
+        char name[GGML_MAX_NAME]; // name of the output tensor
+        enum llama_merge_method method;
+        // we only support 2 models for now
+        char srcs[2][GGML_MAX_NAME]; // names of the input tensors
+        float scales[2];             // for the linear method
+        float t;                     // for the slerp method
+    };
+
+    // merge models
     struct llama_merge_config {
-        const char ** model_paths;
-        const size_t n_models;
-        const struct llama_merge_layer * layers;
-        const size_t n_layers;
+        // we only support 2 models for now
+        const char * model_paths[2];
+        const struct llama_merge_inst * insts;
+        const size_t n_insts;
+        const size_t n_layers; // number of output layers
        const char * output_path;
     };
 
@@ -420,6 +432,9 @@ extern "C" {
     // Returns the total number of parameters in the model
     LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
 
+    // Get the list of model tensor names; returns the total number of tensors
+    LLAMA_API int32_t llama_get_all_tensors_name(struct llama_model * model, const char ** name_arr, size_t arr_size);
+
     // Get a llama model tensor
     LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
 
@@ -429,6 +444,7 @@ extern "C" {
             const char * fname_out,
             const llama_model_quantize_params * params);
 
+    // Merge multiple models, inspired by mergekit
    LLAMA_API int32_t llama_merge_models(
             const struct llama_merge_config * config);
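
Note for reviewers: the body of llama_merge_models() is commented out in this WIP patch, so the semantics of the "slerp" verb are only implied by the parser. Below is a minimal, illustrative sketch (not part of the patch) of what a SLERP merge of a single tensor could look like, assuming both source tensors have been dequantized into flat float arrays of equal length; the function name and signature are hypothetical:

#include <cmath>
#include <cstddef>

// spherical linear interpolation between two tensors viewed as flat vectors;
// t = 0 returns tensor a, t = 1 returns tensor b
static void merge_slerp(const float * a, const float * b, float * out, size_t n, float t) {
    // compute the angle between the two vectors
    double dot = 0.0, na = 0.0, nb = 0.0;
    for (size_t i = 0; i < n; i++) {
        dot += (double) a[i] * b[i];
        na  += (double) a[i] * a[i];
        nb  += (double) b[i] * b[i];
    }
    const double denom = sqrt(na) * sqrt(nb);
    double cos_theta = denom > 0.0 ? dot / denom : 1.0;
    cos_theta = fmin(1.0, fmax(-1.0, cos_theta));
    const double theta     = acos(cos_theta);
    const double sin_theta = sin(theta);

    if (sin_theta < 1e-6) {
        // nearly (anti-)parallel vectors: fall back to linear interpolation
        for (size_t i = 0; i < n; i++) {
            out[i] = (1.0f - t) * a[i] + t * b[i];
        }
        return;
    }

    // standard slerp weights
    const double wa = sin((1.0 - t) * theta) / sin_theta;
    const double wb = sin(t * theta) / sin_theta;
    for (size_t i = 0; i < n; i++) {
        out[i] = (float) (wa * a[i] + wb * b[i]);
    }
}

This follows the common mergekit-style convention where a small t keeps the result close to model A; it is only a reference for what the commented-out implementation is expected to compute, not the final code.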