first working version
This commit is contained in:
parent 48582575ab
commit df9fb7e7bf
4 changed files with 121 additions and 52 deletions
1 .gitignore (vendored)
@@ -72,6 +72,7 @@ models-mnt
/train-text-from-scratch
/tokenize
/vdot
/merge
/common/build-info.cpp
arm_neon.h
compile_commands.json
@@ -12,32 +12,107 @@
// usage:
// ./merge ./path/model_1 LAYERS_1 ./path/model_2 LAYERS_2
// ./merge ./path/model_1 CONFIG1 ./path/model_2 CONFIG2
//
[[noreturn]]
static void usage(const char * executable) {
    printf("usage: %s ./path/model_1 LAYERS_1 ./path/model_2 LAYERS_2\n\n", executable);
    printf(" LAYERS must be in format: p0-p1,p2-p3,p4,... Example: 0-5,7,8-12\n");
    //printf(" Optionally, you can specify the scaling for a range of layers, for example: 0-5*0.5,6-7*1\n");
    printf(" The embedding layer of the first model will be used");
    printf("usage: %s ./path/model_1 CONFIG1 ./path/model_2 CONFIG2\n\n", executable);
    printf(" CONFIG must be in format: p0-p1,p2-p3,p4,... Example: 0-5,7,8-12\n");
    printf(" Optionally, you can specify the scaling for a range of layers, for example: 0-5*0.5,6-7*1. By default, the scale is 0.5. Layer numbering starts from 0.\n");
    printf(" The embedding layer of the first model will be used\n");
    printf(" NOTE: currently, only F16 model type is supported\n");
    exit(1);
}

inline std::vector<std::string> str_split(std::string str, const std::string & delimiter) {
    size_t pos = 0;
    std::string token;
    std::vector<std::string> output;
    while ((pos = str.find(delimiter)) != std::string::npos) {
        token = str.substr(0, pos);
        output.push_back(token);
        str.erase(0, pos + delimiter.length());
    }
    output.push_back(str); // the rest
    return output;
}

static std::vector<struct llama_merge_config> parse_config(std::string & input) {
    std::vector<struct llama_merge_config> configs;
    auto intervals = str_split(input, ",");
    for (auto & interval : intervals) {
        auto components = str_split(interval, "*");
        if (components.empty()) {
            throw std::runtime_error("Config is incorrect");
        }
        float scale = components.size() == 2
            ? std::stof(components[1])
            : 0.5; // by default
        auto p0p1 = str_split(components[0], "-");
        if (p0p1.empty()) {
            throw std::runtime_error("Layer interval is invalid");
        }
        int p0 = std::stoi(p0p1[0]);
        int p1 = p0p1.size() == 2 ? std::stoi(p0p1[1]) : p0;
        if (p0 > p1) {
            throw std::runtime_error("Layer interval is invalid, the end layer number is bigger than the start layer number (p0 > p1)");
        }
        for (int i = p0; i <= p1; i++) {
            struct llama_merge_config conf{i, scale, scale};
            configs.push_back(conf);
        }
        // TODO: maybe check for overlapping intervals?
    }
    return configs;
}

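To make the CONFIG grammar above concrete, here is a minimal, self-contained sketch of how a string such as "0-5*0.5,7" expands into per-layer scales. It mirrors str_split and parse_config, but substitutes a hypothetical LayerScale struct for llama_merge_config so that it compiles on its own:

// Illustrative sketch only: expands a CONFIG string ("p0-p1*scale,...") into
// (layer, scale) pairs. LayerScale is a stand-in for llama_merge_config.
#include <cstdio>
#include <stdexcept>
#include <string>
#include <vector>

struct LayerScale { int i_layer; float scale; };

static std::vector<std::string> split(std::string s, const std::string & delim) {
    std::vector<std::string> out;
    size_t pos;
    while ((pos = s.find(delim)) != std::string::npos) {
        out.push_back(s.substr(0, pos));
        s.erase(0, pos + delim.length());
    }
    out.push_back(s); // the rest
    return out;
}

static std::vector<LayerScale> expand(const std::string & config) {
    std::vector<LayerScale> result;
    for (const auto & interval : split(config, ",")) {
        auto parts = split(interval, "*");                            // "p0-p1*scale" or "p0-p1"
        float scale = parts.size() == 2 ? std::stof(parts[1]) : 0.5f; // 0.5 is the default scale
        auto range = split(parts[0], "-");                            // "p0-p1" or a single "p0"
        int p0 = std::stoi(range[0]);
        int p1 = range.size() == 2 ? std::stoi(range[1]) : p0;
        if (p0 > p1) throw std::runtime_error("invalid layer interval");
        for (int i = p0; i <= p1; i++) result.push_back({i, scale});
    }
    return result;
}

int main() {
    // "0-5*0.5,7" -> layers 0..5 scaled by 0.5, layer 7 with the default 0.5
    for (const auto & ls : expand("0-5*0.5,7")) {
        printf("layer %d -> scale %.2f\n", ls.i_layer, ls.scale);
    }
}

As in parse_config, an interval without an explicit *scale falls back to 0.5, and a single number such as 7 is treated as the one-layer range 7-7.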
int main(int argc, char ** argv) {
    llama_backend_init();
    llama_model_params model_params = llama_model_default_params();
    std::vector<struct llama_merge_config> configs;
    for (int i = 0; i < 100; i++) {
        struct llama_merge_config conf{i, 0.0, 0.0};
        configs.push_back(conf);

    if (argc < 6) {
        usage(argv[0]);
    }

    std::string fname_model1(argv[1]);
    std::string config_model1(argv[2]);
    std::string fname_model2(argv[3]);
    std::string config_model2(argv[4]);
    std::string fname_output(argv[5]);

    // TODO: add try catch
    auto configs1 = parse_config(config_model1);
    auto configs2 = parse_config(config_model2);
    std::vector<struct llama_merge_config> configs;

    if (configs1.size() != configs2.size()) {
        fprintf(stderr, "Number of layers between 2 configs does not match, config1 has %ld layers and config2 has %ld layers\n", configs1.size(), configs2.size());
    }

    // merge 2 configs
    printf("Merge configs:\n");
    for (auto c1 : configs1) {
        float scale2 = -1;
        for (auto c2 : configs2) {
            if (c2.i_layer == c1.i_layer) {
                scale2 = c2.scale2;
            }
        }
        if (scale2 < 0) {
            fprintf(stderr, "Cannot find config for layer %d in CONFIG2\n", c1.i_layer);
            exit(1);
        }
        struct llama_merge_config conf{c1.i_layer, c1.scale1, scale2};
        configs.push_back(conf);

        printf(" Layer %d: scale1 = %f, scale2 = %f\n", conf.i_layer, conf.scale1, conf.scale2);
    }

    llama_merge_models(
        "",
        "",
        fname_model1.c_str(),
        fname_model2.c_str(),
        configs.data(),
        100,
        "/tmp/dolphin-test-merge.gguf"
        configs.size(),
        fname_output.c_str()
    );
    std::cout << "done\n";
    llama_backend_free();
}
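For reference, assuming the example builds to the ./merge binary that the .gitignore entry above ignores, an invocation following the new usage string could look like ./merge ./path/model_1 0-31*0.6 ./path/model_2 0-31*0.4 ./path/output.gguf. The 0-31 range is purely illustrative; it has to match the layer count of the two F16 models being merged, and both CONFIG strings must cover the same layers, since each layer takes scale1 from CONFIG1 and scale2 from CONFIG2.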
67 llama.cpp
@@ -11126,13 +11126,6 @@ static int32_t llama_merge_models_internal(
    // process layers
    for (int i = 0; i < ml1.n_tensors; ++i) {
        struct ggml_init_params params = {
            /*.mem_size =*/ 1000u*ggml_tensor_overhead(),
            /*.mem_buffer =*/ NULL,
            /*.no_alloc =*/ true,
        };
        ggml_context * ctx_ggml = ggml_init(params);

        struct ggml_tensor * tensor1 = ml1.get_tensor_meta(i);
        std::vector<no_init<uint8_t>> buf1;
        const std::string name = ggml_get_name(tensor1);
@@ -11142,54 +11135,54 @@
        std::vector<no_init<uint8_t>> buf2;
        struct ggml_tensor * tensor2 = ml2.get_tensor_meta(idx_ml2);

        struct ggml_tensor * result;
        // GGML_TYPE_F16
        std::vector<no_init<uint8_t>> result(tensor_size);

        if (llama_format_tensor_shape(tensor1) != llama_format_tensor_shape(tensor2)) {
            LLAMA_LOG_ERROR("Tensor shapes are different\n");
            return -1;
        }

        int i_layer;
        int i_layer = -1;
        if (sscanf(name.c_str(), "blk.%d.", &i_layer) != 1) {
            // non-layer, simply copy
            read_tensor_data(tensor1, ml1, buf1);
            result = tensor1; // no change
            memcpy(result.data(), tensor1->data, tensor_size);
        } else {
            LLAMA_LOG_INFO("i_layer %d\n", i_layer);
            auto conf = get_config_for_layer(i_layer);
            read_tensor_data(tensor1, ml1, buf1);
            read_tensor_data(tensor2, ml2, buf2);
            auto conf = get_config_for_layer(i_layer);
            struct ggml_cgraph * gf = ggml_new_graph(ctx_ggml);
            struct ggml_tensor * t1 = ggml_dup_tensor(ctx_ggml, tensor1);
            struct ggml_tensor * t2 = ggml_dup_tensor(ctx_ggml, tensor2);
            t1 = ggml_cpy(ctx_ggml, tensor1, t1);
            t2 = ggml_cpy(ctx_ggml, tensor2, t2);
            t1 = ggml_scale(ctx_ggml, t1, conf->scale1);
            t2 = ggml_scale(ctx_ggml, t2, conf->scale2);
            result = ggml_add(ctx_ggml, t1, t2);
            ggml_build_forward_expand(gf, result);
            ggml_graph_dump_dot(gf, NULL, "/tmp/___cgraph.txt");
            ggml_graph_compute_with_ctx(ctx_ggml, gf, 1);
        }
            LLAMA_LOG_INFO("Merge layer %d with scale1 = %f, scale2 = %f\n", i_layer, conf->scale1, conf->scale2);

        LLAMA_LOG_INFO("i_layer %d ===\n", i_layer);
            if (tensor1->type == GGML_TYPE_F16 && tensor2->type == GGML_TYPE_F16) {
                for (size_t i = 0; i < result.size() / sizeof(float); i++) {
                    float * t1 = (float *) tensor1->data;
                    float * t2 = (float *) tensor2->data;
                    float * dest = (float *) result.data();
                    dest[i] = t1[i] * conf->scale1 + t2[i] * conf->scale2;
                }
            } else if (tensor1->type == GGML_TYPE_F32 && tensor2->type == GGML_TYPE_F32) {
                for (size_t i = 0; i < result.size() / sizeof(double); i++) {
                    double * t1 = (double *) tensor1->data;
                    double * t2 = (double *) tensor2->data;
                    double * dest = (double *) result.data();
                    dest[i] = t1[i] * conf->scale1 + t2[i] * conf->scale2;
                }
            } else {
                LLAMA_LOG_ERROR("Only GGML_TYPE_F16 or GGML_TYPE_F32 is supported, current type = %s\n", ggml_type_name(tensor1->type));
                return -1;
            }
        }

        LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s\n",
            i + 1, ml1.n_tensors,
            ggml_get_name(result),
            llama_format_tensor_shape(result).c_str(),
            ggml_type_name(result->type));

        //std::vector<no_init<uint8_t>> tensor_data(tensor_size);
        //ggml_backend_tensor_get(tensor1, tensor_data.data(), 0, tensor_size);
            ggml_get_name(tensor1),
            llama_format_tensor_shape(tensor1).c_str(),
            ggml_type_name(tensor1->type));

        // write tensor data + padding
        const char * buf = (const char *) result->data;
        printf("%d %d\n", buf[0], buf[1]);
        fout.write((const char *) result->data, tensor_size);
        fout.write((const char *) result.data(), tensor_size);
        zeros(fout, GGML_PAD(tensor_size, GGUF_DEFAULT_ALIGNMENT) - tensor_size);
        ggml_free(ctx_ggml);

        if (i > 3) break;
    }

    // go back to beginning of file and write the updated meta data
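The per-layer merge in this hunk boils down to a weighted element-wise sum, dest[i] = t1[i] * scale1 + t2[i] * scale2, applied to the two source tensors. A minimal standalone sketch of that operation on F32 data follows; the function name and the sample values are illustrative and not part of llama.cpp:

#include <cstdio>
#include <vector>

// Weighted element-wise merge of two equally sized buffers:
// dest[i] = a[i] * scale_a + b[i] * scale_b
static void merge_f32(const std::vector<float> & a, const std::vector<float> & b,
                      std::vector<float> & dest, float scale_a, float scale_b) {
    dest.resize(a.size());
    for (size_t i = 0; i < a.size(); i++) {
        dest[i] = a[i] * scale_a + b[i] * scale_b;
    }
}

int main() {
    std::vector<float> a = {1.0f, 2.0f, 3.0f};
    std::vector<float> b = {4.0f, 5.0f, 6.0f};
    std::vector<float> merged;
    merge_f32(a, b, merged, 0.5f, 0.5f);      // 0.5/0.5 is the tool's default blend
    for (float v : merged) printf("%f\n", v); // prints 2.5 3.5 4.5
}

With scale1 = scale2 = 0.5 the result is simply the average of the two models' weights for that layer.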
BIN merge
Binary file not shown.