first working version

ngxson 2024-02-26 22:31:25 +01:00
parent 48582575ab
commit df9fb7e7bf
4 changed files with 121 additions and 52 deletions

.gitignore

@@ -72,6 +72,7 @@ models-mnt
 /train-text-from-scratch
 /tokenize
 /vdot
+/merge
 /common/build-info.cpp
 arm_neon.h
 compile_commands.json

merge example source (file name not shown)

@@ -12,32 +12,107 @@
 // usage:
-// ./merge ./path/model_1 LAYERS_1 ./path/model_2 LAYERS_2
+// ./merge ./path/model_1 CONFIG1 ./path/model_2 CONFIG2
 //
 [[noreturn]]
 static void usage(const char * executable) {
-    printf("usage: %s ./path/model_1 LAYERS_1 ./path/model_2 LAYERS_2\n\n", executable);
-    printf(" LAYERS must be in format: p0-p1,p2-p3,p4,... Example: 0-5,7,8-12\n");
-    //printf(" Optionally, you can specify the scaling for a range of layers, for example: 0-5*0.5,6-7*1\n");
-    printf(" The embedding layer of the first model will be used");
+    printf("usage: %s ./path/model_1 CONFIG1 ./path/model_2 CONFIG2\n\n", executable);
+    printf(" CONFIG must be in format: p0-p1,p2-p3,p4,... Example: 0-5,7,8-12\n");
+    printf(" Optionally, you can specify the scaling for a range of layers, for example: 0-5*0.5,6-7*1. By default, the scale is 0.5. Layers are numbered starting from 0.\n");
+    printf(" The embedding layer of the first model will be used\n");
+    printf(" NOTE: currently, only F16 model type is supported\n");
     exit(1);
 }
+
+inline std::vector<std::string> str_split(std::string str, const std::string & delimiter) {
+    size_t pos = 0;
+    std::string token;
+    std::vector<std::string> output;
+    while ((pos = str.find(delimiter)) != std::string::npos) {
+        token = str.substr(0, pos);
+        output.push_back(token);
+        str.erase(0, pos + delimiter.length());
+    }
+    output.push_back(str); // the rest
+    return output;
+}
+
+static std::vector<struct llama_merge_config> parse_config(std::string & input) {
+    std::vector<struct llama_merge_config> configs;
+    auto intervals = str_split(input, ",");
+    for (auto & interval : intervals) {
+        auto components = str_split(interval, "*");
+        if (components.empty()) {
+            throw std::runtime_error("Config is incorrect");
+        }
+        float scale = components.size() == 2
+            ? std::stof(components[1])
+            : 0.5; // by default
+        auto p0p1 = str_split(components[0], "-");
+        if (p0p1.empty()) {
+            throw std::runtime_error("Layer interval is invalid");
+        }
+        int p0 = std::stoi(p0p1[0]);
+        int p1 = p0p1.size() == 2 ? std::stoi(p0p1[1]) : p0;
+        if (p0 > p1) {
+            throw std::runtime_error("Layer interval is invalid, the start layer number is bigger than the end layer number (p0 > p1)");
+        }
+        for (int i = p0; i <= p1; i++) {
+            struct llama_merge_config conf{i, scale, scale};
+            configs.push_back(conf);
+        }
+        // TODO: maybe check for overlapping intervals?
+    }
+    return configs;
+}
 int main(int argc, char ** argv) {
     llama_backend_init();
-    llama_model_params model_params = llama_model_default_params();
-    std::vector<struct llama_merge_config> configs;
-    for (int i = 0; i < 100; i++) {
-        struct llama_merge_config conf{i, 0.0, 0.0};
-        configs.push_back(conf);
+    if (argc < 6) {
+        usage(argv[0]);
     }
+
+    std::string fname_model1(argv[1]);
+    std::string config_model1(argv[2]);
+    std::string fname_model2(argv[3]);
+    std::string config_model2(argv[4]);
+    std::string fname_output(argv[5]);
+
+    // TODO: add try catch
+    auto configs1 = parse_config(config_model1);
+    auto configs2 = parse_config(config_model2);
+    std::vector<struct llama_merge_config> configs;
+
+    if (configs1.size() != configs2.size()) {
+        fprintf(stderr, "Number of layers between 2 configs does not match, config1 has %ld layers and config2 has %ld layers\n", configs1.size(), configs2.size());
+    }
+
+    // merge 2 configs
+    printf("Merge configs:\n");
+    for (auto c1 : configs1) {
+        float scale2 = -1;
+        for (auto c2 : configs2) {
+            if (c2.i_layer == c1.i_layer) {
+                scale2 = c2.scale2;
+            }
+        }
+        if (scale2 < 0) {
+            fprintf(stderr, "Cannot find config for layer %d in CONFIG2\n", c1.i_layer);
+            exit(1);
+        }
+        struct llama_merge_config conf{c1.i_layer, c1.scale1, scale2};
+        configs.push_back(conf);
+        printf(" Layer %d: scale1 = %f, scale2 = %f\n", conf.i_layer, conf.scale1, conf.scale2);
+    }
+
     llama_merge_models(
-        "",
-        "",
+        fname_model1.c_str(),
+        fname_model2.c_str(),
         configs.data(),
-        100,
-        "/tmp/dolphin-test-merge.gguf"
+        configs.size(),
+        fname_output.c_str()
     );
+    std::cout << "done\n";
     llama_backend_free();
 }
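For readers skimming the diff: the CONFIG argument introduced above encodes per-layer scaling as comma-separated intervals, each optionally followed by *scale. Below is a minimal, self-contained sketch of that same interval syntax; LayerScale, split and parse are illustrative stand-ins for the commit's llama_merge_config, str_split and parse_config, so the snippet compiles without llama.h.

// Standalone sketch of the CONFIG interval syntax ("p0-p1*scale,...").
// LayerScale stands in for llama_merge_config; not part of the commit.
#include <cstdio>
#include <stdexcept>
#include <string>
#include <vector>

struct LayerScale { int i_layer; float scale; };

static std::vector<std::string> split(std::string s, const std::string & delim) {
    std::vector<std::string> out;
    size_t pos;
    while ((pos = s.find(delim)) != std::string::npos) {
        out.push_back(s.substr(0, pos));
        s.erase(0, pos + delim.length());
    }
    out.push_back(s); // the rest
    return out;
}

// "0-5*0.5,6-7*1" -> layers 0..5 with scale 0.5, layers 6..7 with scale 1.0
static std::vector<LayerScale> parse(const std::string & input) {
    std::vector<LayerScale> out;
    for (const auto & interval : split(input, ",")) {
        const auto parts = split(interval, "*");
        const float scale = parts.size() == 2 ? std::stof(parts[1]) : 0.5f; // default scale
        const auto range = split(parts[0], "-");
        const int p0 = std::stoi(range[0]);
        const int p1 = range.size() == 2 ? std::stoi(range[1]) : p0; // single layer, e.g. "7"
        if (p0 > p1) {
            throw std::runtime_error("invalid interval: " + interval);
        }
        for (int i = p0; i <= p1; i++) {
            out.push_back({i, scale});
        }
    }
    return out;
}

int main() {
    for (const auto & ls : parse("0-5*0.5,6-7*1")) {
        printf("layer %2d -> scale %.2f\n", ls.i_layer, ls.scale);
    }
    return 0;
}

Run on its own, this prints layers 0-5 with scale 0.50 and layers 6-7 with scale 1.00; main() above builds the same kind of per-layer list from CONFIG1 and CONFIG2 and pairs them into llama_merge_config entries before calling llama_merge_models.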

llama.cpp

@@ -11126,13 +11126,6 @@ static int32_t llama_merge_models_internal(
     // process layers
     for (int i = 0; i < ml1.n_tensors; ++i) {
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ 1000u*ggml_tensor_overhead(),
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ true,
-        };
-        ggml_context * ctx_ggml = ggml_init(params);
         struct ggml_tensor * tensor1 = ml1.get_tensor_meta(i);
         std::vector<no_init<uint8_t>> buf1;
         const std::string name = ggml_get_name(tensor1);
@@ -11142,54 +11135,54 @@
         std::vector<no_init<uint8_t>> buf2;
         struct ggml_tensor * tensor2 = ml2.get_tensor_meta(idx_ml2);
-        struct ggml_tensor * result;
+        // GGML_TYPE_F16
+        std::vector<no_init<uint8_t>> result(tensor_size);

         if (llama_format_tensor_shape(tensor1) != llama_format_tensor_shape(tensor2)) {
             LLAMA_LOG_ERROR("Tensor shapes are different\n");
+            return -1;
         }

-        int i_layer;
+        int i_layer = -1;
         if (sscanf(name.c_str(), "blk.%d.", &i_layer) != 1) {
             // non-layer, simply copy
             read_tensor_data(tensor1, ml1, buf1);
-            result = tensor1; // no change
+            memcpy(result.data(), tensor1->data, tensor_size);
         } else {
-            LLAMA_LOG_INFO("i_layer %d\n", i_layer);
+            auto conf = get_config_for_layer(i_layer);
             read_tensor_data(tensor1, ml1, buf1);
             read_tensor_data(tensor2, ml2, buf2);
-            auto conf = get_config_for_layer(i_layer);
-            struct ggml_cgraph * gf = ggml_new_graph(ctx_ggml);
-            struct ggml_tensor * t1 = ggml_dup_tensor(ctx_ggml, tensor1);
-            struct ggml_tensor * t2 = ggml_dup_tensor(ctx_ggml, tensor2);
-            t1 = ggml_cpy(ctx_ggml, tensor1, t1);
-            t2 = ggml_cpy(ctx_ggml, tensor2, t2);
-            t1 = ggml_scale(ctx_ggml, t1, conf->scale1);
-            t2 = ggml_scale(ctx_ggml, t2, conf->scale2);
-            result = ggml_add(ctx_ggml, t1, t2);
-            ggml_build_forward_expand(gf, result);
-            ggml_graph_dump_dot(gf, NULL, "/tmp/___cgraph.txt");
-            ggml_graph_compute_with_ctx(ctx_ggml, gf, 1);
-        }
-
-        LLAMA_LOG_INFO("i_layer %d ===\n", i_layer);
+            LLAMA_LOG_INFO("Merge layer %d with scale1 = %f, scale2 = %f\n", i_layer, conf->scale1, conf->scale2);
+
+            if (tensor1->type == GGML_TYPE_F16 && tensor2->type == GGML_TYPE_F16) {
+                for (size_t i = 0; i < result.size() / sizeof(float); i++) {
+                    float * t1 = (float *) tensor1->data;
+                    float * t2 = (float *) tensor2->data;
+                    float * dest = (float *) result.data();
+                    dest[i] = t1[i] * conf->scale1 + t2[i] * conf->scale2;
+                }
+            } else if (tensor1->type == GGML_TYPE_F32 && tensor2->type == GGML_TYPE_F32) {
+                for (size_t i = 0; i < result.size() / sizeof(double); i++) {
+                    double * t1 = (double *) tensor1->data;
+                    double * t2 = (double *) tensor2->data;
+                    double * dest = (double *) result.data();
+                    dest[i] = t1[i] * conf->scale1 + t2[i] * conf->scale2;
+                }
+            } else {
+                LLAMA_LOG_ERROR("Only GGML_TYPE_F16 or GGML_TYPE_F32 is supported, current type = %s\n", ggml_type_name(tensor1->type));
+                return -1;
+            }
+        }

         LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s\n",
             i + 1, ml1.n_tensors,
-            ggml_get_name(result),
-            llama_format_tensor_shape(result).c_str(),
-            ggml_type_name(result->type));
+            ggml_get_name(tensor1),
+            llama_format_tensor_shape(tensor1).c_str(),
+            ggml_type_name(tensor1->type));

-        //std::vector<no_init<uint8_t>> tensor_data(tensor_size);
-        //ggml_backend_tensor_get(tensor1, tensor_data.data(), 0, tensor_size);
         // write tensor data + padding
-        const char * buf = (const char *) result->data;
-        printf("%d %d\n", buf[0], buf[1]);
-        fout.write((const char *) result->data, tensor_size);
+        fout.write((const char *) result.data(), tensor_size);
         zeros(fout, GGML_PAD(tensor_size, GGUF_DEFAULT_ALIGNMENT) - tensor_size);
-        ggml_free(ctx_ggml);
-        if (i > 3) break;
     }

     // go back to beginning of file and write the updated meta data
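The per-layer branch in this hunk blends the two tensors element by element, dest[i] = t1[i] * scale1 + t2[i] * scale2, with the element count derived from the buffer size. As a minimal stand-alone sketch of that same blend, the helper below is written against the element widths ggml uses (2-byte ggml_fp16_t for GGML_TYPE_F16, 4-byte float for GGML_TYPE_F32) and the public ggml_fp16_to_fp32 / ggml_fp32_to_fp16 conversions from ggml.h; the name blend_tensor_data and its signature are illustrative, not part of this commit.

// Sketch only: element-wise weighted sum of two same-shape, same-type tensors.
// dst[i] = scale1 * src1[i] + scale2 * src2[i]
#include <cstddef>
#include "ggml.h"

bool blend_tensor_data(ggml_type type,
                       const void * src1, const void * src2, void * dst,
                       size_t n_bytes, float scale1, float scale2) {
    if (type == GGML_TYPE_F32) {
        const size_t n = n_bytes / sizeof(float); // F32 elements are 4 bytes
        const float * a = (const float *) src1;
        const float * b = (const float *) src2;
        float * d = (float *) dst;
        for (size_t i = 0; i < n; i++) {
            d[i] = a[i] * scale1 + b[i] * scale2;
        }
        return true;
    }
    if (type == GGML_TYPE_F16) {
        const size_t n = n_bytes / sizeof(ggml_fp16_t); // F16 elements are 2 bytes
        const ggml_fp16_t * a = (const ggml_fp16_t *) src1;
        const ggml_fp16_t * b = (const ggml_fp16_t *) src2;
        ggml_fp16_t * d = (ggml_fp16_t *) dst;
        for (size_t i = 0; i < n; i++) {
            // convert to float, blend, convert back to half precision
            const float x = ggml_fp16_to_fp32(a[i]) * scale1 + ggml_fp16_to_fp32(b[i]) * scale2;
            d[i] = ggml_fp32_to_fp16(x);
        }
        return true;
    }
    return false; // quantized types would need a dequantize/requantize step first
}

Quantized tensor types are not handled in this sketch, which matches the F16-only note printed by usage() in the merge example.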

merge (binary file not shown)