Xuan Son Nguyen 2024-03-10 16:49:07 +02:00 committed by GitHub
commit ee6854a71e
8 changed files with 940 additions and 2 deletions

.gitignore

@@ -72,6 +72,7 @@ models-mnt
/train-text-from-scratch
/tokenize
/vdot
/merge
/common/build-info.cpp
arm_neon.h
compile_commands.json

Makefile

@@ -1,8 +1,9 @@
# Define the default target now so that it is always the first target
BUILD_TARGETS = \
main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o
simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o \
merge
# Binaries only useful for tests
TEST_TARGETS = \
@@ -708,6 +709,10 @@ quantize: examples/quantize/quantize.cpp build-info.o ggml.
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
merge: examples/merge/merge.cpp examples/merge/parser.hpp build-info.o ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

examples/merge/CMakeLists.txt

@@ -0,0 +1,6 @@
set(TARGET merge)
add_executable(${TARGET} merge.cpp parser.hpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(${TARGET} PRIVATE ../../common)
target_compile_features(${TARGET} PRIVATE cxx_std_11)

examples/merge/config.example.txt

@@ -0,0 +1,123 @@
# GGUF merge instructions
#
# Lines starting with "#" are comments
# Empty lines are ignored
# The "output layer" instruction adds a new layer to the output model
# A merge instruction has the format: target (space) verb (space) parameters
# Supported verbs:
# - linear: weighted sum of two source layers, parameters: source_layer,source_layer,scale,scale
# - slerp: spherical linear interpolation, parameters: source_layer,source_layer,t
# - copy: copy a layer from one model, parameters: model,layer
#########################
# Example:
# This is the first layer of the output model:
# For all tensors, we want slerp(model[0].layer[0], model[1].layer[0], 0.1)
# Except for the "attn_output" tensor, where we want t=0.5 instead of t=0.1
output layer 0
all slerp 0,0,0.1
attn_output slerp 0,0,0.5
# For the next layer, we want: model[0].layer[1]*0.6 + model[1].layer[1]*0.4
# Except for the "attn_output" tensor, where we use slerp with t=0.9
output layer 1
all linear 1,1,0.6,0.4
attn_output slerp 1,1,0.9
# For the next layer, we want to copy from model[0].layer[2]
output layer 2
all copy 0,2
output layer 3
all copy 0,3
# For the next layer, we want to copy from model[1].layer[4]
output layer 4
all copy 1,4
output layer 5
all copy 1,5
output layer 6
all linear 6,6,0.1,0.9
output layer 7
all linear 7,7,0.1,0.9
output layer 8
all linear 8,8,0.1,0.9
output layer 9
all linear 9,9,0.1,0.9
output layer 10
all linear 10,10,0.1,0.9
output layer 11
all linear 11,11,0.1,0.9
output layer 12
all linear 12,12,0.1,0.9
output layer 13
all linear 13,13,0.3333,0.6666
output layer 14
all linear 14,14,0.3333,0.6666
output layer 15
all linear 15,15,0.3333,0.6666
output layer 16
all linear 16,16,0.3333,0.6666
output layer 17
all linear 17,17,0.3333,0.6666
output layer 18
all linear 18,18,0.3333,0.6666
output layer 19
all linear 19,19,0.3333,0.6666
output layer 20
all slerp 20,20,0.8
output layer 21
all slerp 21,21,0.8
output layer 22
all slerp 22,22,0.8
output layer 23
all slerp 23,23,0.8
output layer 24
all slerp 24,24,0.8
output layer 25
all slerp 25,25,0.8
output layer 26
all slerp 26,26,0.8
output layer 27
all slerp 27,27,0.8
output layer 28
all slerp 28,28,0.8
output layer 29
all slerp 29,29,0.8
output layer 30
all slerp 30,30,0.8
output layer 31
all slerp 31,31,0.8

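For reference, the verbs above map onto straightforward element-wise (linear, copy) or row-wise (slerp) operations on the two source tensors, matching what llama_merge_models() does later in this commit after dequantizing both inputs to F32. A minimal sketch of the per-element math, as standalone C++ (the function and buffer names are illustrative, not part of the API):

#include <cmath>
#include <cstddef>

// "linear src0,src1,scale0,scale1": weighted sum of the two source tensors
static void merge_linear(const float * a, const float * b, float * out,
                         size_t n, float scale_a, float scale_b) {
    for (size_t i = 0; i < n; i++) {
        out[i] = a[i] * scale_a + b[i] * scale_b;
    }
}

// "slerp src0,src1,t": spherical interpolation of one row from each tensor,
// falling back to plain lerp when the rows are nearly colinear
static void merge_slerp_row(const float * a, const float * b, float * out,
                            size_t n, float t) {
    double dot = 0.0, na = 0.0, nb = 0.0;
    for (size_t i = 0; i < n; i++) {
        dot += (double) a[i] * b[i];
        na  += (double) a[i] * a[i];
        nb  += (double) b[i] * b[i];
    }
    dot /= std::sqrt(na) * std::sqrt(nb);   // cosine of the angle between the rows
    if (std::fabs(dot) > 0.9995) {          // ~colinear: fall back to lerp
        for (size_t i = 0; i < n; i++) {
            out[i] = a[i] * (1.0f - t) + b[i] * t;
        }
        return;
    }
    const double theta = std::acos(dot);
    const double s0    = std::sin((1.0 - t) * theta) / std::sin(theta);
    const double s1    = std::sin(t * theta) / std::sin(theta);
    for (size_t i = 0; i < n; i++) {
        out[i] = (float) (a[i] * s0 + b[i] * s1);
    }
}

The copy verb simply takes the selected tensor's data from the chosen model unchanged.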
examples/merge/merge.cpp

@@ -0,0 +1,127 @@
#include "common.h"
#include "llama.h"
#include "parser.hpp"
#include <cstdio>
#include <cstring>
#include <vector>
#include <string>
#include <unordered_map>
#include <fstream>
#include <cmath>
#include <algorithm>
static const size_t n_models = 2; // hard-limited to 2 input models for now
struct merge_params {
std::string config_path = "config.txt";
std::vector<std::string> model_paths;
std::string output_path = "ggml-merged-f16.gguf";
bool only_list_tensors_name = false;
bool dry_run = false;
};
[[noreturn]]
static void usage(const char * executable, int exit_code) {
struct merge_params defaults;
printf("usage: %s -c CONFIG_FILE -o OUTPUT_FILE -m MODEL_PATH -m MODEL_PATH ...\n\n", executable);
printf("\n");
printf("Merging multiple models, inspired by mergekit.\n");
printf("For more details, see \"config.example.txt\" file.\n");
printf("\n");
printf("NOTE:\n");
printf("- Only support merging 2 models.\n");
printf("- The embedding and output layers of the first model will be used.\n");
printf("- Currently, we accept both quantized and non-quantized models as input. The output model will be re-quantized into the same format of the first model.\n");
printf("\n");
printf("Options:\n");
printf(" -h, --help Show this help message and exit\n");
printf(" -c, --config CONFIG_FILE Path to config file, in CSV format (default: %s)\n", defaults.config_path.c_str());
printf(" -m, --model MODEL_PATH Path to model. This option can be repeated multiple times and must be specified in the right order.\n");
printf(" -o, --output OUTPUT_FILE Path to the output model (default: %s)\n", defaults.output_path.c_str());
printf(" --dry-run Only print out list of parsed and exit, useful for debugging\n");
printf(" --print-list-tensor Only print out list of tensors of the input model, useful for debugging (only one model is accepted)\n");
printf("\n");
printf("Example: ./merge -c config.txt -o output.gguf -m model_a.gguf -m model_b.gguf\n");
exit(exit_code);
}
int main(int argc, char ** argv) {
bool invalid_param = false;
struct merge_params params;
std::string arg;
for (int i = 1; i < argc; i++) {
arg = argv[i];
if (arg == "-h" || arg == "--help") {
usage(argv[0], 0);
} else if (arg == "-c" || arg == "--config") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.config_path = argv[i];
} else if (arg == "-m" || arg == "--model") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.model_paths.push_back(argv[i]);
} else if (arg == "-o" || arg == "--output") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.output_path = argv[i];
} else if (arg == "--print-list-tensor") {
params.only_list_tensors_name = true;
} else if (arg == "--dry-run") {
params.dry_run = true;
}
}
try {
if (invalid_param) {
usage(argv[0], 1);
throw std::invalid_argument("error: invalid parameter for argument: " + arg);
} else if (!params.only_list_tensors_name && params.model_paths.size() < 2) {
throw std::invalid_argument("error: require at least 2 models");
}
if (params.only_list_tensors_name) {
if (params.model_paths.size() != 1) {
throw std::invalid_argument("error: we can only list tensors of one single model");
}
print_model_tensors_name(params.model_paths[0]);
return 0; // exit now
}
size_t n_layers = 0;
auto instructions = parse_config(params.config_path, params.model_paths[0], n_layers);
if (params.dry_run) {
return 0;
}
std::vector<const char*> p_model_paths;
for (auto & m : params.model_paths) {
p_model_paths.push_back(m.data());
}
struct llama_merge_config config{
{
params.model_paths[0].c_str(),
params.model_paths[1].c_str(),
},
instructions.data(),
instructions.size(),
n_layers,
params.output_path.c_str(),
};
llama_merge_models(&config);
} catch (const std::exception & ex) {
std::cerr << ex.what() << "\n\n";
}
return 0;
}

examples/merge/parser.hpp

@@ -0,0 +1,293 @@
#include "common.h"
#include "llama.h"
#include <cstdio>
#include <cstring>
#include <vector>
#include <string>
#include <unordered_map>
#include <fstream>
#include <cmath>
#include <algorithm>
#include <set>
#include <string.h>
// trim whitespace from the beginning and end of a string
static std::string str_trim(const std::string & str) {
size_t start = 0;
size_t end = str.size();
while (start < end && isspace(str[start])) {
start += 1;
}
while (end > start && isspace(str[end - 1])) {
end -= 1;
}
return str.substr(start, end - start);
}
inline std::vector<std::string> str_split(std::string str, const std::string & delimiter) {
size_t pos = 0;
std::string token;
std::vector<std::string> output;
while ((pos = str.find(delimiter)) != std::string::npos) {
token = str.substr(0, pos);
output.push_back(token);
str.erase(0, pos + delimiter.length());
}
output.push_back(str); // the rest
return output;
}
/////////////////////////////////
// dump the list of tensor names of the input model
static std::vector<std::string> get_list_tensors_name(std::string & model_path) {
llama_model_params model_params = llama_model_default_params();
llama_model * model = llama_load_model_from_file(model_path.c_str(), model_params);
size_t n_tensors = llama_get_all_tensors_name(model, nullptr, 0);
std::vector<const char *> list(n_tensors, nullptr);
llama_get_all_tensors_name(model, list.data(), list.size());
// copy the result
std::vector<std::string> results;
for (auto & name : list) {
results.push_back(std::string(name));
}
llama_free_model(model);
return results;
}
static void print_model_tensors_name(std::string & model_path) {
auto tensors = get_list_tensors_name(model_path);
std::cout << "\n\n===================\n";
std::cout << "Total number of tensors: " << tensors.size() << "\n";
std::vector<const char *> list(tensors.size(), nullptr);
for (size_t i = 0; i < tensors.size(); i++) {
char buf[128];
sprintf(buf, "%4ld: %s", i, tensors[i].c_str());
std::cout << buf << "\n";
}
}
/////////////////////////////////
// get the layer index from a tensor name, for example "blk.x.attn_norm.weight"
// returns -1 if it is not a per-layer tensor
static int get_i_layer(std::string tensor_name) {
int i_layer = -1;
return sscanf(tensor_name.c_str(), "blk.%d.", &i_layer) == 1 ? i_layer : -1;
};
static void print_inst(struct llama_merge_inst inst) {
std::cout << "Output: " << inst.name << "\n";
switch (inst.method) {
case LLAMA_MERGE_LINEAR:
std::cout << " Linear\n";
std::cout << " Model A: " << inst.scales[0] << " * " << inst.srcs[0] << "\n";
std::cout << " Model B: " << inst.scales[1] << " * " << inst.srcs[1] << "\n";
break;
case LLAMA_MERGE_SLERP:
std::cout << " SLERP\n";
std::cout << " t=" << inst.t << "\n";
std::cout << " Model A: " << inst.srcs[0] << "\n";
std::cout << " Model B: " << inst.srcs[1] << "\n";
break;
case LLAMA_MERGE_COPY:
std::cout << " Copy from model A: "<< inst.srcs[0] << "\n";
break;
case LLAMA_MERGE_REPEAT:
std::cout << " Repeat from output model: " << inst.srcs[0] << "\n";
break;
default:
break;
}
}
static std::vector<struct llama_merge_inst> parse_config(std::string & config_path, std::string & model_path, size_t & n_layers) {
std::vector<struct llama_merge_inst> instructions;
// read file
std::ifstream file(config_path);
if (!file.is_open()) {
throw std::runtime_error("Unable to open file merge config file");
}
std::ostringstream content;
content << file.rdbuf(); // Read the entire file into the stringstream
auto lines = str_split(content.str(), "\n");
file.close();
// get list of input tensors
auto inp_names = get_list_tensors_name(model_path);
std::set<std::string> units; // name of units, for example "attn_output"
for (auto & name : inp_names) {
int il = get_i_layer(name);
if (il < 0) {
// non-layer, only copy
struct llama_merge_inst ins;
ins.method = LLAMA_MERGE_COPY;
strcpy(ins.name, name.c_str());
strcpy(ins.srcs[0], name.c_str()); // always take the first model
strcpy(ins.srcs[1], "");
instructions.push_back(ins);
} else {
// tensor belongs to a layer
auto parts = str_split(name, ".");
units.insert(parts[2]);
}
}
std::cout << "List of units:\n";
for (auto & u : units) std::cout << u << "\n";
std::cout << "\n";
// process line by line, one line is one layer
std::unordered_map<std::string, struct llama_merge_inst> layer; // map tensor name to instruction
bool is_layer_empty = true;
int i_layer = -1;
auto get_tensor_name = [&](int layer, std::string unit) {
return "blk." + std::to_string(layer) + "." + unit + ".weight";
};
auto push_output_layer = [&]() {
if (!is_layer_empty) {
for (auto & it : layer) {
instructions.push_back(it.second);
}
}
layer.clear();
is_layer_empty = true;
};
auto new_output_layer = [&]() {
layer.clear();
for (auto & u : units) {
struct llama_merge_inst ins;
strcpy(ins.name, get_tensor_name(i_layer, u).c_str());
layer[u] = ins;
}
};
auto raise_err = [&](size_t i_line, std::string message) {
std::stringstream ss;
ss << "Parse error: (line " << i_line + 1 << ") " << message;
throw std::runtime_error(ss.str());
};
for (size_t i_line = 0 ; i_line < lines.size(); i_line++) {
auto line = str_trim(lines[i_line]);
if (line.empty() || line.c_str()[0] == '#') {
continue; // skip empty line or comment
}
auto parts = str_split(line, " ");
if (parts.size() != 3) {
raise_err(i_line, "does not follow format: \"target (space) verb (space) parameters\"");
}
auto target = parts[0];
auto verb = parts[1];
auto params = str_split(parts[2], ",");
if (target == "output" && verb == "layer") {
int il_curr = std::stoi(params[0]);
if (i_layer + 1 != il_curr) {
raise_err(i_line, "new layer number must be (last layer number + 1)");
}
push_output_layer();
i_layer = il_curr;
new_output_layer();
continue;
}
auto linear = [&](struct llama_merge_inst & ins, std::string unit) {
if (params.size() != 4) {
raise_err(i_line, "verb \"linear\" requires exactly 4 parameters");
}
ins.method = LLAMA_MERGE_LINEAR;
int src0 = std::stoi(params[0]);
int src1 = std::stoi(params[1]);
strcpy(ins.srcs[0], get_tensor_name(src0, unit).c_str());
strcpy(ins.srcs[1], get_tensor_name(src1, unit).c_str());
ins.scales[0] = std::stof(params[2]);
ins.scales[1] = std::stof(params[3]);
is_layer_empty = false;
};
auto slerp = [&](struct llama_merge_inst & ins, std::string unit) {
if (params.size() != 3) {
raise_err(i_line, "verb \"slerp\" requires exactly 3 parameters");
}
ins.method = LLAMA_MERGE_SLERP;
int src0 = std::stoi(params[0]);
int src1 = std::stoi(params[1]);
strcpy(ins.srcs[0], get_tensor_name(src0, unit).c_str());
strcpy(ins.srcs[1], get_tensor_name(src1, unit).c_str());
ins.t = std::stof(params[2]);
is_layer_empty = false;
};
/*auto repeat = [&](struct llama_merge_inst & ins, std::string unit) {
if (params.size() != 1) {
raise_err(i_line, "verb \"repeat\" requires exactly 1 parameter");
}
ins.method = LLAMA_MERGE_REPEAT;
int src0 = std::stoi(params[0]);
strcpy(ins.srcs[0], get_tensor_name(src0, unit).c_str());
is_layer_empty = false;
};*/
auto copy = [&](struct llama_merge_inst & ins, std::string unit) {
if (params.size() != 2) {
raise_err(i_line, "verb \"copy\" requires exactly 2 parameters");
}
ins.method = LLAMA_MERGE_COPY;
int model = std::stoi(params[0]);
int layer = std::stoi(params[1]);
if (model == 0) {
strcpy(ins.srcs[0], get_tensor_name(layer, unit).c_str());
strcpy(ins.srcs[1], "");
} else if (model == 1) {
strcpy(ins.srcs[0], "");
strcpy(ins.srcs[1], get_tensor_name(layer, unit).c_str());
} else {
raise_err(i_line, "can only copy from model 0 or 1");
}
is_layer_empty = false;
};
auto apply_verb = [&](struct llama_merge_inst & ins, std::string unit) {
if (verb == "linear") {
linear(ins, unit);
} else if (verb == "slerp") {
slerp(ins, unit);
} else if (verb == "repeat") {
// repeat(ins, unit);
raise_err(i_line, "repeat is currently not supported");
} else if (verb == "copy") {
copy(ins, unit);
} else {
raise_err(i_line, "invalid verb: " + verb);
}
};
// TODO: what if user does not use "all"? we may miss some tensors?
if (target == "all") {
for (auto & u : units) {
apply_verb(layer[u], u);
}
} else {
if (units.find(target) == units.end()) {
raise_err(i_line, "unit " + target + " does not exist");
}
apply_verb(layer[target], target);
}
}
push_output_layer();
n_layers = i_layer + 1;
// print all parsed instructions
std::cout << "Parsed instructions:\n";
for (auto & ins : instructions) {
print_inst(ins);
}
std::cout << "---\n" << "Total output layers: " << n_layers << "\n";
return instructions;
}

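To make the parser's output concrete: a fragment such as "output layer 0" followed by "all slerp 0,0,0.1" causes parse_config() to emit one llama_merge_inst per unit of that layer (plus plain copy instructions for non-layer tensors such as the embedding and output matrices, which always come from the first model). A hedged sketch of the instruction it would build for one hypothetical unit, written by hand rather than through the parser:

#include <cstring>
#include "llama.h"   // llama_merge_inst, LLAMA_MERGE_SLERP (added later in this commit)

// roughly what "output layer 0" + "all slerp 0,0,0.1" yields for the "attn_norm" unit
static llama_merge_inst make_example_inst() {
    llama_merge_inst ins{};
    ins.method = LLAMA_MERGE_SLERP;
    std::strcpy(ins.name,    "blk.0.attn_norm.weight"); // tensor written to the output model
    std::strcpy(ins.srcs[0], "blk.0.attn_norm.weight"); // layer 0 of model A
    std::strcpy(ins.srcs[1], "blk.0.attn_norm.weight"); // layer 0 of model B
    ins.t = 0.1f;                                       // slerp interpolation factor
    return ins;
}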
llama.cpp

@@ -61,6 +61,7 @@
#include <cfloat>
#include <cinttypes>
#include <climits>
#include <condition_variable>
#include <cmath>
#include <cstdarg>
#include <cstddef>
@@ -86,6 +87,7 @@
#include <thread>
#include <type_traits>
#include <unordered_map>
#include <queue>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
@@ -12227,6 +12229,341 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
}
}
// TODO: remove this when #5830 is merged
static int32_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, int64_t * hist_cur, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
std::mutex mutex;
int counter = 0;
size_t new_size = 0;
if (nthread < 2) {
// single-thread
return ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, hist_cur, imatrix);
}
auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, chunk_size,
nrows, n_per_row, imatrix]() {
std::array<int64_t, 1 << 4> local_hist = {};
const int nrows_per_chunk = chunk_size / n_per_row;
size_t local_size = 0;
while (true) {
std::unique_lock<std::mutex> lock(mutex);
int first_row = counter; counter += nrows_per_chunk;
if (first_row >= nrows) {
if (local_size > 0) {
for (int j=0; j<int(local_hist.size()); ++j) {
hist_cur[j] += local_hist[j];
}
new_size += local_size;
}
break;
}
lock.unlock();
const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
local_size += ggml_quantize_chunk(new_type, f32_data, new_data,
first_row * n_per_row, this_nrow, n_per_row, local_hist.data(), imatrix);
}
};
for (int it = 0; it < nthread - 1; ++it) {
workers.emplace_back(compute);
}
compute();
for (auto & w : workers) { w.join(); }
workers.clear();
return new_size;
}
int32_t llama_merge_models(const struct llama_merge_config * config) {
#if defined(__linux__) || defined(_WIN32)
constexpr bool use_mmap = true;
#else
constexpr bool use_mmap = false;
#endif
// std::move doesn't work with llama_model and llama_model_loader, why?
std::vector<std::unique_ptr<llama_model>> models;
std::vector<std::unique_ptr<llama_model_loader>> mls;
std::vector<no_init<uint8_t>> buf_in;
std::vector<no_init<uint8_t>> buf_out;
std::set<std::string> ref_names; // list of ref_name per layer
std::vector<struct ggml_tensor *> output_tensors;
// output file
struct gguf_context * ctx_out = gguf_init_empty();
std::ofstream fout(config->output_path, std::ios::binary);
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
// remember to call before exit
auto clean_up = [&]() {
fout.close();
for (auto & tensor : output_tensors) {
free(tensor);
}
gguf_free(ctx_out);
};
// load the input models
static const size_t n_models = 2;
for (size_t i = 0; i < n_models; i++) {
auto model = std::unique_ptr<llama_model>(new llama_model());
auto ml = std::unique_ptr<llama_model_loader>(new llama_model_loader(config->model_paths[i], use_mmap, NULL));
ml->init_mapping(false);
llm_load_arch(*ml, *model);
llm_load_hparams(*ml, *model);
models.push_back(std::move(model));
mls.push_back(std::move(ml));
}
// for verb copy, we want to get the source tensor
auto get_src_tensor_for_copy = [&](const struct llama_merge_inst ins, size_t & i_model) {
i_model = std::string(ins.srcs[0]).empty() ? 1 : 0;
return mls[i_model]->get_tensor_meta(ins.srcs[i_model]);
};
// construct metadata
{
// copy the KV pairs from the input file
gguf_set_kv(ctx_out, mls[0]->ctx_gguf);
// correct layer count for output model
std::stringstream ss;
ss << mls[0]->get_arch_name() << ".block_count";
gguf_set_val_u32(ctx_out, ss.str().c_str(), config->n_layers);
LLAMA_LOG_INFO("====> Set new value of %s = %ld\n", ss.str().c_str(), config->n_layers);
// populate metadata for output tensors
auto push_tensor = [&](struct ggml_tensor * ref, const char * name) {
struct ggml_tensor * out_tensor = (struct ggml_tensor *) malloc(GGML_TENSOR_SIZE);
if (ref != nullptr) {
// copy metadata (shape, type,...)
memcpy(out_tensor, ref, GGML_TENSOR_SIZE);
}
ggml_set_name(out_tensor, name);
gguf_add_tensor(ctx_out, out_tensor);
output_tensors.push_back(out_tensor);
};
for (size_t i = 0; i < config->n_insts; i++) {
const struct llama_merge_inst ins = config->insts[i];
struct ggml_tensor * t0;
struct ggml_tensor * t1;
// TODO: reject non-requantize-able type (one that requires imatrix)
if (ins.method == LLAMA_MERGE_COPY) {
// simply copy from model A
size_t i_model;
t0 = get_src_tensor_for_copy(ins, i_model);
push_tensor(t0, ins.name);
} else if (ins.method == LLAMA_MERGE_LINEAR || ins.method == LLAMA_MERGE_SLERP) {
t0 = mls[0]->get_tensor_meta(ins.srcs[0]);
t1 = mls[1]->get_tensor_meta(ins.srcs[1]);
if (llama_format_tensor_shape(t0) != llama_format_tensor_shape(t1)) {
LLAMA_LOG_ERROR("some tensors does not have the same shape");
clean_up();
return -1;
}
push_tensor(t0, ins.name);
} else if (ins.method == LLAMA_MERGE_REPEAT) {
// TODO: in theory, we can point 2 tensors to the same offset, but here we're unable to do that, because offset is currently managed by gguf_add_tensor()
GGML_ASSERT(false);
/*int idx = nullptr;
std::string search_tensor(ins.srcs[0]);
for (auto & tensor : output_tensors) {
if (std::string(ggml_get_name(tensor)) == search_tensor) {
t0 = tensor;
break;
}
}
if (t0 == nullptr) {
LLAMA_LOG_ERROR("cannot find source tensor to repeat");
clean_up();
return -1;
}
push_tensor(t0, ins.name);*/
} else {
GGML_ASSERT(false); // should never happen
}
}
const size_t meta_size = gguf_get_meta_size(ctx_out);
LLAMA_LOG_INFO("%s: meta size = %zu bytes\n", __func__, meta_size);
// placeholder for the meta data
::zeros(fout, meta_size);
}
// load tensor data into buffer
auto read_tensor_data = [&](struct ggml_tensor * tensor, llama_model_loader & ml, std::vector<no_init<uint8_t>> & buf) -> size_t {
if (!ml.use_mmap) {
if (buf.size() < ggml_nbytes(tensor)) {
buf.resize(ggml_nbytes(tensor));
}
tensor->data = buf.data();
}
ml.load_data_for(tensor);
return ggml_nbytes(tensor);
};
size_t n_done = 0;
auto write_output_tensor = [&](const struct ggml_tensor * tensor, void * data) {
// write tensor data + padding
const size_t len = ggml_nbytes(tensor);
fout.write((const char *) data, len);
zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len);
n_done++;
LLAMA_LOG_INFO("[%4ld/%4ld] %36s - [%s], input type = %6s\n",
n_done, output_tensors.size(),
ggml_get_name(tensor),
llama_format_tensor_shape(tensor).c_str(),
ggml_type_name(tensor->type));
};
// TODO: allow user to set n_threads
const int n_threads = std::thread::hardware_concurrency();
std::vector<std::thread> workers;
workers.reserve(n_threads);
// process instruction one by one
GGML_ASSERT(config->n_insts == output_tensors.size());
for (size_t i = 0; i < config->n_insts; i++) {
const struct llama_merge_inst ins = config->insts[i];
struct ggml_tensor * t0;
struct ggml_tensor * t1;
struct ggml_tensor * out_tensor = output_tensors[i];
const size_t n_elements = ggml_nelements(out_tensor);
std::vector<no_init<uint8_t>> in_buf0;
std::vector<no_init<float>> f32_in_buf0; // dequant it internally
std::vector<no_init<uint8_t>> in_buf1;
std::vector<no_init<float>> f32_in_buf1; // dequant it internally
std::vector<float> f32_out_buf(n_elements, 0.0); // do not resize!
std::vector<uint8_t> out_buf(ggml_nbytes(out_tensor)); // do not resize!
const int n_per_row = out_tensor->ne[0];
const int n_rows = n_elements / n_per_row;
if (ins.method == LLAMA_MERGE_COPY) {
LLAMA_LOG_INFO("copy\n");
size_t i_model;
t0 = get_src_tensor_for_copy(ins, i_model);
read_tensor_data(t0, *mls[i_model], in_buf0);
write_output_tensor(out_tensor, t0->data);
continue;
}
// dequantize the tensor to FP32
auto dequantize = [&](struct ggml_tensor * in_tensor, std::vector<no_init<float>> & f32_in_buf) {
if (in_tensor->type != GGML_TYPE_F32) {
LLAMA_LOG_INFO("dequant ");
llama_convert_tensor_internal(in_tensor, f32_in_buf, workers, n_elements, n_threads);
} else {
// if we already have f32, just copy it
LLAMA_LOG_INFO("f32_copy ");
f32_in_buf.resize(n_elements);
memcpy((void *) f32_in_buf.data(), in_tensor->data, n_elements * sizeof(float));
}
};
// load data and dequantize
if (ins.method == LLAMA_MERGE_LINEAR || ins.method == LLAMA_MERGE_SLERP) {
t0 = mls[0]->get_tensor_meta(ins.srcs[0]);
t1 = mls[1]->get_tensor_meta(ins.srcs[1]);
read_tensor_data(t0, *mls[0], in_buf0);
read_tensor_data(t1, *mls[1], in_buf1);
dequantize(t0, f32_in_buf0);
dequantize(t1, f32_in_buf1);
}
if (ins.method == LLAMA_MERGE_LINEAR) {
LLAMA_LOG_INFO("linear ");
float * in0 = (float *) f32_in_buf0.data();
float * in1 = (float *) f32_in_buf1.data();
float * dest = (float *) f32_out_buf.data();
for (size_t i = 0; i < n_elements; i++) {
dest[i] = in0[i] * ins.scales[0] + in1[i] * ins.scales[1];
}
}
if (ins.method == LLAMA_MERGE_SLERP) {
// Python code: https://gist.github.com/dvschultz/3af50c40df002da3b751efab1daddf2c
LLAMA_LOG_INFO("slerp ");
static const float dot_threshold = 0.9995;
auto lerp_row = [](float * in0, float * in1, float * out, size_t nelem, float t) {
for (size_t i = 0; i < nelem; i++) {
out[i] = in0[i] * (1.0 - t) + in1[i] * t;
}
};
auto slerp_row = [&lerp_row](float * in0, float * in1, float * out, size_t nelem, float t) {
float norm0 = std::sqrt(std::inner_product(in0, in0 + nelem, in0, 0.0));
float norm1 = std::sqrt(std::inner_product(in1, in1 + nelem, in1, 0.0));
// Normalize the vectors to get the directions and angles
std::vector<float> v0(nelem);
std::vector<float> v1(nelem);
for (size_t i = 0; i < nelem; i++) {
v0[i] = in0[i] / norm0;
v1[i] = in1[i] / norm1;
}
// Dot product with the normalized vectors
float dot = std::inner_product(v0.begin(), v0.end(), v1.begin(), 0.0);
// If absolute value of dot product is almost 1, vectors are ~colineal, so use lerp
if (std::abs(dot) > dot_threshold) {
return lerp_row(in0, in1, out, nelem, t);
}
// Calculate initial angle between v0 and v1
float theta_0 = std::acos(dot);
float sin_theta_0 = std::sin(theta_0);
// Angle at timestep t
float theta_t = theta_0 * t;
float sin_theta_t = std::sin(theta_t);
// Finish the slerp algorithm
float s0 = std::sin(theta_0 - theta_t) / sin_theta_0;
float s1 = sin_theta_t / sin_theta_0;
for (size_t i = 0; i < nelem; i++) {
out[i] = in0[i] * s0 + in1[i] * s1;
}
};
for (int r = 0; r < n_rows; r++) {
float * in0 = (float *) f32_in_buf0.data();
float * in1 = (float *) f32_in_buf1.data();
float * dest = (float *) f32_out_buf.data();
size_t offset = n_per_row * r;
slerp_row(in0 + offset, in1 + offset, dest + offset, n_per_row, ins.t);
}
}
// re-quantize it
{
LLAMA_LOG_INFO("requant\n");
std::array<int64_t, 1 << 4> hist_cur = {};
static const int min_chunk_size = 32 * 512;
const int chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
size_t new_size = llama_tensor_quantize_internal(
out_tensor->type,
f32_out_buf.data(),
out_buf.data(),
chunk_size,
n_rows,
n_per_row,
hist_cur.data(), // unused for now
nullptr,
workers,
n_threads);
GGML_ASSERT(new_size == out_buf.size());
}
LLAMA_LOG_INFO("===> INPUT %f %f %f\n", f32_in_buf0[0].value, f32_in_buf0[1].value, f32_in_buf0[2].value);
LLAMA_LOG_INFO("===> OUTPUT %f %f %f\n", f32_out_buf[0], f32_out_buf[1], f32_out_buf[2]);
write_output_tensor(out_tensor, out_buf.data());
}
// go back to beginning of file and write the updated meta data
{
fout.seekp(0);
std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
gguf_get_meta_data(ctx_out, data.data());
fout.write((const char *) data.data(), data.size());
LLAMA_LOG_INFO("===> Written metadata size = %ld bytes\n", data.size());
}
clean_up();
return 0;
}
static int llama_apply_lora_from_file_internal(
const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads
) {
@@ -13150,6 +13487,18 @@ uint64_t llama_model_n_params(const struct llama_model * model) {
return nparams;
}
int32_t llama_get_all_tensors_name(struct llama_model * model, const char ** name_arr, size_t arr_size) {
size_t i = 0;
for (const auto & it : model->tensors_by_name) {
if (i == arr_size) {
break;
}
name_arr[i] = it.first.c_str();
i++;
}
return model->tensors_by_name.size();
}
struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(),
[name](const std::pair<std::string, struct ggml_tensor *> & it) {

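One detail worth highlighting in the writer above: tensor data is streamed right after a zero-filled placeholder for the GGUF metadata, each tensor is padded up to the GGUF alignment, and the real metadata is only written at the end by seeking back to offset 0, once gguf_add_tensor() has assigned every offset. A minimal, self-contained sketch of that file-layout trick using plain std::ofstream (the sizes are illustrative; 32 is GGML's default GGUF alignment):

#include <cstddef>
#include <cstdint>
#include <fstream>
#include <vector>

static void write_zeros(std::ofstream & f, size_t n) {
    std::vector<char> z(n, 0);
    f.write(z.data(), n);
}

int main() {
    const size_t alignment = 32;    // GGUF_DEFAULT_ALIGNMENT
    const size_t meta_size = 128;   // illustrative; gguf_get_meta_size() in the real code

    std::ofstream fout("example.bin", std::ios::binary);
    write_zeros(fout, meta_size);   // 1) placeholder where the metadata will go

    // 2) write each tensor, padded to the alignment boundary
    std::vector<uint8_t> tensor(1000, 0xab);
    fout.write((const char *) tensor.data(), tensor.size());
    const size_t padded = ((tensor.size() + alignment - 1) / alignment) * alignment;
    write_zeros(fout, padded - tensor.size());

    // 3) with all offsets known, seek back and write the real metadata
    std::vector<uint8_t> meta(meta_size, 0x01);   // stand-in for gguf_get_meta_data()
    fout.seekp(0);
    fout.write((const char *) meta.data(), meta.size());
    return 0;
}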
llama.h

@@ -336,6 +336,33 @@ extern "C" {
const char * content;
} llama_chat_message;
enum llama_merge_method {
LLAMA_MERGE_LINEAR,
LLAMA_MERGE_SLERP,
LLAMA_MERGE_REPEAT, // doesn't work for now
LLAMA_MERGE_COPY,
};
// instruction for merging tensors (model merge)
struct llama_merge_inst {
char name[GGML_MAX_NAME]; // name of output tensor
enum llama_merge_method method;
// we only support 2 models for now
char srcs[2][GGML_MAX_NAME]; // name of input tensors. if method == copy, only one src is non-empty
float scales[2]; // for linear method
float t; // for slerp method
};
// merge models
struct llama_merge_config {
// we only support 2 models for now
const char * model_paths[2];
const struct llama_merge_inst * insts;
const size_t n_insts;
const size_t n_layers; // number of output layers
const char * output_path;
};
// Helpers for getting default parameters
LLAMA_API struct llama_model_params llama_model_default_params(void);
LLAMA_API struct llama_context_params llama_context_default_params(void);
@@ -415,6 +442,9 @@ extern "C" {
// Returns the total number of parameters in the model
LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
// Get the list of model tensor names; returns the total number of tensors in the model
LLAMA_API int32_t llama_get_all_tensors_name(struct llama_model * model, const char ** name_arr, size_t arr_size);
// Get a llama model tensor
LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
@@ -424,6 +454,10 @@ extern "C" {
const char * fname_out,
const llama_model_quantize_params * params);
// Merge multiple models, inspired by mergekit
LLAMA_API int32_t llama_merge_models(
const struct llama_merge_config * config);
// Apply a LoRA adapter to a loaded model
// path_base_model is the path to a higher quality model to use as a base for
// the layers modified by the adapter. Can be NULL to use the current loaded model.
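Finally, the header additions are enough to drive a merge from client code without the example binary. A sketch of the two-call pattern for llama_get_all_tensors_name (the same pattern examples/merge/parser.hpp uses: first call with a null array to get the count, second call to fill it), assuming the API lands exactly as declared above:

#include <cstdio>
#include <vector>
#include "llama.h"

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s MODEL.gguf\n", argv[0]);
        return 1;
    }
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file(argv[1], mparams);
    if (model == nullptr) {
        return 1;
    }

    // first call: query the number of tensors
    const size_t n_tensors = (size_t) llama_get_all_tensors_name(model, nullptr, 0);
    // second call: fill the array with name pointers owned by the model
    std::vector<const char *> names(n_tensors, nullptr);
    llama_get_all_tensors_name(model, names.data(), names.size());

    for (size_t i = 0; i < n_tensors; i++) {
        printf("%4zu: %s\n", i, names[i]);
    }
    llama_free_model(model);
    return 0;
}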