working version
This commit is contained in:
parent 9e39571fc2
commit 1a088fb0a5

2 changed files with 95 additions and 65 deletions
@@ -70,7 +70,7 @@ struct callback_data {
         t_layer->data = malloc(n_bytes); // TODO @ngxson : get rid of this malloc somehow
         ggml_backend_tensor_get(t, t_layer->data, 0, n_bytes);
         ggml_set_name(t_layer, ggml_get_name(t));
-        print_debug_tensor(t_layer);
+        //print_debug_tensor(t_layer);

         if (is_eval_pos) {
             v_pos.push_back(t_layer);
@@ -99,7 +99,7 @@ struct callback_data {

     // delete zero rows from a given 2D tensor
     struct ggml_tensor * filter_nonzero_rows(struct ggml_tensor * a) {
-        printf("filter_nonzero_rows\n");
+        //printf("filter_nonzero_rows\n");
         auto is_row_all_zeros = [](struct ggml_tensor * t, int row, float eps) -> bool {
             // check if given row containing all zero elements
             int n_cols = t->ne[0]; // hint: should be equal to n_embd
@@ -119,7 +119,7 @@ struct callback_data {

         // get "n_nonzero_rows" for the output "diff_filtered"
         int n_nonzero_rows = rows_to_copy.size();
-        printf("n_nonzero_rows: %d\n", n_nonzero_rows);
+        //printf("n_nonzero_rows: %d\n", n_nonzero_rows);
         int n_embd = a->ne[0];
         GGML_ASSERT(n_nonzero_rows > 0);

@@ -138,7 +138,7 @@ struct callback_data {
             }
         }

-        print_debug_tensor(diff_filtered);
+        //print_debug_tensor(diff_filtered);

         return diff_filtered;
     }
@@ -169,7 +169,8 @@ struct train_context {

     // each element of the vector correspond to one layer
     // NOTE: the last layer is discard. therefore, we will have (n_layers - 1) elements here
-    std::vector<struct ggml_tensor *> v_diff; // vector of matrices of size [n_embd, m] where m ~ n_tokens * n_completions (v_diff contains no zero-rows)
+    // NOTE (2): v_diff is transposed from v_diff_tmp
+    std::vector<struct ggml_tensor *> v_diff; // vector of matrices of size [m, n_embd] where m ~ n_tokens * n_completions (v_diff contains no zero-rows)
     std::vector<struct ggml_tensor *> v_final; // vector of vectors of size [n_embd] to be written to file

     // to easily re-alloc when concat v_diff, we temporary store v_diff in a vector instead of a tensor
@@ -196,7 +197,7 @@ struct train_context {

     // add new rows into existing tensor in v_diff_tmp
     void concat_diff_tmp(const std::vector<struct ggml_tensor *> & diff_filtered) {
-        GGML_ASSERT(diff_filtered.size() == n_layers - 1);
+        GGML_ASSERT((int) diff_filtered.size() == n_layers - 1);
         for (int il = 0; il < n_layers - 1; il++) {
             auto t = diff_filtered[il];
             auto & diff_tmp = v_diff_tmp[il];
@@ -206,32 +207,46 @@ struct train_context {
         }
     }

-    // build the v_diff tensors from v_diff_tmp
+    // build the v_diff tensors from v_diff_tmp (v_diff need to be transposed)
     void build_v_diff() {
+        printf("build_v_diff\n");
         for (int il = 0; il < n_layers - 1; il++) {
             auto & diff_tmp = v_diff_tmp[il];
             int n_elem = diff_tmp.size() / sizeof(float);
+            GGML_ASSERT(n_elem % n_embd == 0);
             int n_rows = n_elem / n_embd;
             struct ggml_tensor * diff = ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd);
             ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str());
-            // TODO: IMPORTANT!! transpose diff
-            diff->data = diff_tmp.data();
+            // copy data & transpose
+            diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible
+            float * arr = (float *) diff_tmp.data();
+            for (int ir = 0; ir < n_rows; ++ir) {
+                for (int ic = 0; ic < n_embd; ++ic) {
+                    float f = arr[ir*n_embd + ic];
+                    //std::cout << ir << "," << ic << " = " << f << "\n";
+                    ggml_set_f32_nd(diff, ir, ic, 0, 0, f);
+                }
+            }
             v_diff.push_back(diff);
+            print_debug_tensor(diff);
+            // free memory of diff_tmp
+            diff_tmp.resize(0);
         }
     }

     ~train_context() {
         for (auto ptr : v_final) free(ptr->data);
-        // no need to free v_diff_tmp or v_diff, since we didn't use malloc
+        for (auto ptr : v_diff) free(ptr->data);
+        // no need to free v_diff_tmp, since we didn't use malloc
         ggml_free(ctx_ggml);
     }
 };

 struct ctrl_params {
     /* default meta parameters */
-    bool always_reload = false;
     int n_completions = 64;
-    int n_threads = 8;
+    int n_pca_batch = 5;
+    int n_pca_iterations = 1000;

     /* default filepaths */
     std::string outfile = "control_vector.gguf";
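Note on the "copy data & transpose" loop above: in ggml, ne[0] is the fastest-varying dimension, so a tensor created as ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_rows, n_embd) stores element (ir, ic) at flat offset ic*n_rows + ir. Writing arr[ir*n_embd + ic] into that slot via ggml_set_f32_nd is therefore a plain transpose of the row-major accumulation buffer. A minimal standalone sketch of the same index mapping (plain std::vector buffers and a hypothetical helper name, not the ggml code path):

    #include <vector>

    // dst[ic*n_rows + ir] = src[ir*n_embd + ic] — the mapping performed by
    // the ggml_set_f32_nd() loop in build_v_diff() above.
    static void transpose_rows(const std::vector<float> & src, std::vector<float> & dst,
                               int n_rows, int n_embd) {
        dst.resize((size_t) n_rows * n_embd);
        for (int ir = 0; ir < n_rows; ++ir) {
            for (int ic = 0; ic < n_embd; ++ic) {
                dst[(size_t) ic*n_rows + ir] = src[(size_t) ir*n_embd + ic];
            }
        }
    }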
@@ -295,9 +310,10 @@ static void print_usage(const char * executable) {
     printf("                           default: 'examples/control-vector-generator/completions.txt'\n");
     printf("  -nc, --num-completions N number of lines of completions file to use\n");
     printf("                           default: 64\n");
-    printf("  -t, --num-threads N      number of threads to use (do not confuse with gpt-opts -t)\n");
-    printf("                           default: 8\n");
-    printf("  --always-reload          reload the model for every new template to parse (not recommended)\n");
+    printf("  --batch-pca N            batch size used for PCA\n");
+    printf("                           default: 5\n");
+    printf("  --iter-pca N             number of iterations used for PCA\n");
+    printf("                           default: 1000\n");
     printf("\n");
     printf("gpt-opts:\n");
     printf("  other options from main\n");
@@ -370,10 +386,10 @@ static int ctrlvec_params_parse_ex(int argc, char ** argv, ctrl_params & params)
             throw std::invalid_argument("error: missing argument for " + arg);
         }
     }
-    if (arg == "--num-threads" || arg == "-t") {
+    if (arg == "--pca-batch") {
         if (++arg_idx < argc && strncmp(argv[arg_idx], arg_prefix.c_str(), 2) != 0) {
             try {
-                params.n_threads = std::stoi(argv[arg_idx]);
+                params.n_pca_batch = std::stoi(argv[arg_idx]);
             }
             catch (const std::invalid_argument & ex) {
                 throw std::invalid_argument("error: invalid argument for " + arg);
@@ -383,9 +399,18 @@ static int ctrlvec_params_parse_ex(int argc, char ** argv, ctrl_params & params)
             throw std::invalid_argument("error: missing argument for " + arg);
         }
     }
-    if (arg == "--always-reload") {
-        params.always_reload = true;
-        skipme += 1;
+    if (arg == "--pca-iter") {
+        if (++arg_idx < argc && strncmp(argv[arg_idx], arg_prefix.c_str(), 2) != 0) {
+            try {
+                params.n_pca_iterations = std::stoi(argv[arg_idx]);
+            }
+            catch (const std::invalid_argument & ex) {
+                throw std::invalid_argument("error: invalid argument for " + arg);
+            }
+            skipme += 2;
+        } else {
+            throw std::invalid_argument("error: missing argument for " + arg);
+        }
     }
     // TODO it might be nice QoL to have single positive/negative args
     // we do not handle any other unknown arguments here because they will be handled by gpt_parse_params
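Both new flags follow the same two-token pattern: a recognized "--flag VALUE" pair consumes two argv slots (skipme += 2) so the wrapper can strip them before forwarding the rest to gpt-opts parsing. A condensed sketch of that pattern, using a hypothetical helper name that is not in the diff:

    #include <stdexcept>
    #include <string>

    // Hypothetical helper condensing the "--flag VALUE" handling above:
    // returns true if the flag matched, writes the parsed int, and advances
    // the argv index and skip counter.
    static bool parse_int_flag(const std::string & flag, int argc, char ** argv,
                               int & arg_idx, int & skipme, int & out) {
        if (argv[arg_idx] != flag) return false;
        if (++arg_idx >= argc) {
            throw std::invalid_argument("error: missing argument for " + flag);
        }
        out = std::stoi(argv[arg_idx]); // throws std::invalid_argument on bad input
        skipme += 2; // flag + value are both consumed
        return true;
    }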
@@ -427,7 +452,7 @@ static std::vector<std::string> ctrlvec_load_prompt_file(std::string path, bool

 static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
     auto * cb_data = (callback_data *) user_data;
-    auto ggml_ne_string = [](const ggml_tensor * t) -> std::string {
+    /*auto ggml_ne_string = [](const ggml_tensor * t) -> std::string {
         std::string str;
         for (int i = 0; i < GGML_MAX_DIMS; ++i) {
             str += std::to_string(t->ne[i]);
@@ -436,7 +461,7 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
             }
         }
         return str;
-    };
+    };*/

     static const char * l_out_name = "l_out";
     const bool is_l_out = strncmp(t->name, l_out_name, strlen(l_out_name)) == 0;
@@ -473,6 +498,7 @@ static void export_gguf(const std::vector<struct ggml_tensor *> & v_ctrl, const

     for (size_t i = 0; i < v_ctrl.size(); ++i) {
         gguf_add_tensor(ctx, v_ctrl[i]);
+        print_debug_tensor(v_ctrl[i]);
         printf("Added tensor: %s\n", v_ctrl[i]->name);
     }

@@ -489,7 +515,7 @@ static void export_gguf(const std::vector<struct ggml_tensor *> & v_ctrl, const
  * Load prompt files and completion file.
  * Then format each pair of prompt + completion to make an entry.
  */
-int prepare_entries(ctrl_params & cparams) {
+static int prepare_entries(ctrl_params & cparams) {
     // load prompts
     std::vector<std::string> positive_prompts = ctrlvec_load_prompt_file(cparams.positive_prompts_file);
     std::vector<std::string> negative_prompts = ctrlvec_load_prompt_file(cparams.negative_prompts_file);
@@ -511,7 +537,7 @@ int prepare_entries(ctrl_params & cparams) {
         // TODO make this dynamic - allow the user to change it somehow - and adapt based on model
         return persona + " " + suffix; // entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. [/INST]"
     };
-    for (int i = 0; i < positive_prompts.size(); ++i) {
+    for (size_t i = 0; i < positive_prompts.size(); ++i) {
         for (auto & cmpl : completions) {
             // TODO replicate the truncations done by the python implementation
             cparams.positive_entries.push_back(format_template(positive_prompts[i], cmpl));
@@ -553,7 +579,7 @@ int main(int argc, char ** argv) {
     llama_context * ctx;
     std::tie(model, ctx) = llama_init_from_gpt_params(params);

-    int n_ctx = llama_n_ctx(ctx);
+    // int n_ctx = llama_n_ctx(ctx);
     int n_layers = llama_n_layer(model);
     int n_embd = llama_n_embd(model);
     // get model hint param (a.k.a model arch name)
@@ -574,29 +600,13 @@ int main(int argc, char ** argv) {
     // init train_context
     train_context ctx_train(n_embd, n_layers);

-    int token_ct = 0;
-
     for(size_t i = 0; i < cparams.positive_entries.size(); ++i) {
         tokenized_prompt t = tokenized_prompts[i];
         cb_data.n_layers = n_layers;
         cb_data.n_tokens = t.max_seq_len;

-        // need to reload the model so it doesn't run out of context
-        // this should scale with -c option passed by main
-        token_ct += 2 * t.max_seq_len;
-        if (token_ct > n_ctx || cparams.always_reload) {
-            //break;
-            llama_free(ctx);
-            llama_free_model(model);
-            std::tie(model, ctx) = llama_init_from_gpt_params(params);
-            token_ct = 2 * t.max_seq_len;
-        }
-        if (token_ct > n_ctx) {
-            fprintf(stderr, "context size exceeded on iteration %zu\n", i);
-            break;
-        }
-
-        printf("Evaluating prompt: \"%s\" - \"%s\" (%ld tokens)\n",
+        printf("Evaluating prompt[%ld/%ld]: \"%s\" - \"%s\" (%ld tokens)\n",
+            i+1, t.tokens_pos.size(),
             tokens_to_str(ctx, t.tokens_pos.cbegin(), t.tokens_pos.cend()).c_str(),
             tokens_to_str(ctx, t.tokens_neg.cbegin(), t.tokens_neg.cend()).c_str(),
             t.max_seq_len);
@@ -610,12 +620,10 @@ int main(int argc, char ** argv) {
         auto v_diff_filtered = cb_data.calc_diff();

         // save & concat the filtered v_diff to ctx_train
-        printf("concat_diff_tmp\n");
         ctx_train.concat_diff_tmp(v_diff_filtered);

         // reset for next iteration
         cb_data.reset();
-        printf("reset\n");
     }

     // done with the model, we can now free it to make gain some memory
@@ -628,8 +636,10 @@ int main(int argc, char ** argv) {

     // run PCA
     PCA::pca_params pca_params;
+    pca_params.n_threads = params.n_threads;
+    pca_params.n_batch = cparams.n_pca_batch;
+    pca_params.n_iterations = cparams.n_pca_iterations;
     PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
-    exit(0); // TODO: REMOVE ME !!!!!!!!!!!!!!!!!!!!!!!!

     // write output vectors to gguf
     export_gguf(ctx_train.v_final, cparams.outfile, model_hint);

@@ -38,10 +38,15 @@ struct pca_params {
     int n_batch = 5; // number of iterations do to in one batch. larger the batch, more memory is used
     int n_iterations = 1000;
     float tolerance = 1e-7;
+
+    // for debugging
+    int i_layer = 0;
+    int n_layers = 0;
 };

 // result from each iteration
 struct pca_result {
+    struct ggml_tensor * calculated_square = NULL;
     std::vector<struct ggml_tensor *> eigenvectors;
     std::vector<float> distances;
 };
@@ -162,7 +167,6 @@ static struct ggml_cgraph * build_graph_piter(
     // turn v_diff_original into square matrix if needed
     struct ggml_tensor * tmp_square;
     if (calc_square) {
-        print_debug_tensor(model.dev_input);
         tmp_square = ggml_mul_mat(ctx0, model.dev_input, model.dev_input);
         ggml_set_name(tmp_square, "tmp_square");
     }
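For context on tmp_square: ggml_mul_mat(ctx0, a, b) requires a and b to share ne[0] and produces a tensor of shape [a->ne[1], b->ne[1]], where entry (i, j) is the dot product over the shared dimension. Multiplying dev_input of shape [n_samples, n_embd] by itself therefore yields the n_embd x n_embd Gram (covariance-like) matrix, whose dominant eigenvector is the PCA direction. A plain-array sketch of that product (hypothetical helper over a row-major [n_samples x n_embd] buffer, not the ggml path):

    #include <vector>

    // square[i*n_embd + j] = sum over samples of input[s][i] * input[s][j]
    static void gram_matrix(const std::vector<float> & input, std::vector<float> & square,
                            int n_samples, int n_embd) {
        square.assign((size_t) n_embd * n_embd, 0.0f);
        for (int s = 0; s < n_samples; ++s) {
            for (int i = 0; i < n_embd; ++i) {
                for (int j = 0; j < n_embd; ++j) {
                    square[(size_t) i*n_embd + j] +=
                        input[(size_t) s*n_embd + i] * input[(size_t) s*n_embd + j];
                }
            }
        }
    }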
@@ -229,17 +233,17 @@ static ggml_status compute_piter(
         }
         return i;
     };
-    // get output nodes
+    result.calculated_square = NULL;
     result.eigenvectors.clear();
     result.distances.clear();
     result.eigenvectors.resize(params.n_batch);
     result.distances.resize(params.n_batch);
+    // get output nodes
     for (int i = 0; i < gf->n_nodes; ++i) {
         auto node = gf->nodes[i];
         int iter = -1;
         // find b_tensor (without copying data from device)
         if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) {
-            print_debug_tensor(node, false);
             result.eigenvectors[iter] = node;
         }
         // find distances, then copy data from device
@@ -247,7 +251,11 @@ static ggml_status compute_piter(
             float d;
             ggml_backend_tensor_get(node, &d, 0, sizeof(float));
             result.distances[iter] = d;
-            std::cout << node->name << " = " << d << "\n";
+            // std::cout << node->name << " = " << d << "\n";
+        }
+        // find tmp_square if it exists (without copying data from device)
+        if (std::string(node->name) == "tmp_square") {
+            result.calculated_square = node;
         }
     }
 }
@@ -258,23 +266,22 @@ static void power_iteration(
     const struct pca_params & params,
     struct ggml_tensor * input, // shape of input: [n_samples, n_embd]
     struct ggml_tensor * output) {
-    printf("in power iteration\n");
-    //int n_embd = input->ne[1];
+    //printf("in power iteration\n");
     struct pca_model model(input);

     ggml_gallocr_t allocr = NULL;
     struct pca_result result;
-    struct ggml_tensor * last_eigenvector;
+    struct ggml_tensor * last_eigenvector = NULL;

-    int n_iter = params.n_iterations / params.n_batch; // more batch, fewer iterations
-    for (int iter = 0; iter < n_iter; ++iter) {
+    int n_iters = params.n_iterations / params.n_batch; // more batch, fewer iterations
+    for (int iter = 0; iter < n_iters; ++iter) {
         bool calc_square = (iter == 0); // only need to calculate square for first iteration
         if (allocr) {
             ggml_gallocr_free(allocr);
         }
         allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
         struct ggml_cgraph * gf = build_graph_piter(params, model, calc_square);
-        ggml_graph_dump_dot(gf, nullptr, "/tmp/_cgraph.dot");
+        // ggml_graph_dump_dot(gf, nullptr, "/tmp/_cgraph.dot");
         compute_piter(params, model, gf, allocr, result);

         for (size_t k = 0; k < result.distances.size(); ++k) {
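What each batched graph repeats is the classic power-iteration update b <- A*b / ||A*b||, whose fixed point is the dominant eigenvector of A (here the square matrix computed on the first iteration). A minimal unbatched CPU sketch under that assumption (plain arrays, not the ggml graph):

    #include <cmath>
    #include <vector>

    // One power-iteration step on a row-major n x n matrix A: b = normalize(A*b).
    // The ggml graph chains params.n_batch of these steps per evaluation.
    static void power_iter_step(const std::vector<float> & A, std::vector<float> & b, int n) {
        std::vector<float> tmp(n, 0.0f);
        for (int r = 0; r < n; ++r) {
            for (int c = 0; c < n; ++c) {
                tmp[r] += A[(size_t) r*n + c] * b[c];
            }
        }
        float norm = 0.0f;
        for (int r = 0; r < n; ++r) norm += tmp[r]*tmp[r];
        norm = std::sqrt(norm);
        for (int r = 0; r < n; ++r) b[r] = tmp[r] / norm;
    }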
@@ -284,30 +291,43 @@ static void power_iteration(
             }
         }

-        break; // FIXME
+        if (calc_square) {
+            // copy and store the square matrix if needed
+            GGML_ASSERT(result.calculated_square != NULL);
+            std::vector<uint8_t> tmp_buf(ggml_nbytes(model.dev_square));
+            ggml_backend_tensor_get(result.calculated_square, tmp_buf.data(), 0, tmp_buf.size());
+            ggml_backend_tensor_set(model.dev_square, tmp_buf.data(), 0, tmp_buf.size());
+        }
+
+        printf("%s: layer %d/%d, iteration: %d / total: %d (batch = %d) ...\n",
+            __func__, params.i_layer+1, params.n_layers, iter, n_iters, params.n_batch);
     }

+    // get output tensor
+    GGML_ASSERT(last_eigenvector);
     ggml_backend_tensor_get(last_eigenvector, output->data, 0, ggml_nbytes(last_eigenvector));
-    print_debug_tensor(output);
+    //print_debug_tensor(output);
     ggml_gallocr_free(allocr);
 }

 static void run_pca(
-    const struct pca_params & params,
-    const std::vector<struct ggml_tensor *> & v_input,
+    struct pca_params & params,
+    const std::vector<struct ggml_tensor *> & v_input, // shape of v_input[0]: [n_samples, n_embd]
     const std::vector<struct ggml_tensor *> & v_output) {
     printf("Running PCA...\n");
-    int n_embd = v_input[0]->ne[0]; // shape of v_input[0]: [n_embd, m]
     for (size_t il = 0; il < v_input.size(); ++il) {
-        print_debug_tensor(v_input[il]);
         // prepare output vector
         struct ggml_tensor * ctrl_out = v_output[il];
         auto name = std::string("direction.") + std::to_string(il + 1);
         ggml_set_name(ctrl_out, name.c_str());

         // run power_iteration
+        params.i_layer = il;
+        params.n_layers = v_input.size();
         power_iteration(params, v_input[il], ctrl_out);
-        printf("Done with layer %d\n", il);
-        print_debug_tensor(ctrl_out);
+        printf("DONE layer %ld / %ld\n", il+1, v_input.size());
+        //print_debug_tensor(ctrl_out);
     }
     printf("Done with PCA.\n");
 }
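The calc_square branch above persists the computed square matrix by bouncing it through host memory into model.dev_square, since the graph's tensors are re-allocated on the next iteration. The same round-trip as a small helper (a sketch with a hypothetical name; assumes src and dst have identical size and type):

    #include "ggml.h"
    #include "ggml-backend.h"
    #include <cstdint>
    #include <vector>

    // Copy one backend tensor's contents into another via a host staging
    // buffer, as done for model.dev_square in power_iteration() above.
    static void backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
        std::vector<uint8_t> buf(ggml_nbytes(dst));
        ggml_backend_tensor_get(src, buf.data(), 0, buf.size());
        ggml_backend_tensor_set(dst, buf.data(), 0, buf.size());
    }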