From 447023fc43ddb6c86c10035ec3442b960fbc5bbe Mon Sep 17 00:00:00 2001
From: ngxson
Date: Thu, 30 May 2024 23:58:32 +0200
Subject: [PATCH] add multi prompts, multi-thread for PCA

---
 .../control-vector-generator.cpp              | 248 +++++++++++-------
 1 file changed, 157 insertions(+), 91 deletions(-)

diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp
index 32c200238..f36c1f6a2 100644
--- a/examples/control-vector-generator/control-vector-generator.cpp
+++ b/examples/control-vector-generator/control-vector-generator.cpp
@@ -20,13 +20,23 @@ struct callback_data {
     std::vector<float *> v_neg;   // vector of matrices of size [n_embd, n_tokens]
     std::vector<float *> v_diff;  // vector of matrices of size [n_embd, n_tokens]
     std::vector<float *> v_final; // vector of finished vectors of size [n_embd]
+    ~callback_data() {
+        for (auto ptr : v_pos)   free(ptr);
+        for (auto ptr : v_neg)   free(ptr);
+        for (auto ptr : v_diff)  free(ptr);
+        for (auto ptr : v_final) free(ptr);
+    }
 };
 
 struct ctrl_params {
     std::string outfile = "control_vector.gguf";
     std::string completions_file = "examples/control-vector-generator/completions.txt";
-    std::string positive = "happy"; // TODO support multiple positive prompts
-    std::string negative = "sad"; // TODO support multiple negative prompts
+    /* pair of prompts to be used for generating the vectors */
+    std::string positive_prompts_file = "positive.txt";
+    std::string negative_prompts_file = "negative.txt";
+    std::vector<std::string> positive_prompts;
+    std::vector<std::string> negative_prompts;
+    /* pair of prompts to be used for testing */
     std::vector<std::string> positive_entries;
     std::vector<std::string> negative_entries;
 };
@@ -38,11 +48,11 @@ static void print_usage(const char * executable) {
     printf("Creates a GGUF control vector for a given model.");
     printf("\n");
     printf("options:\n");
-    printf("  -h, --help            show this help message and exit\n");
-    printf("  --outfile             output file (default: 'control_vector.gguf')\n");
-    printf("  --completions-file    completions file (default: 'examples/control-vector-generator/completions.txt')\n");
-    printf("  --positive            positive prompt (default: 'happy')\n");
-    printf("  --negative            negative prompt (default: 'sad')\n");
+    printf("  -h, --help              show this help message and exit\n");
+    printf("  --outfile               output file (default: 'control_vector.gguf')\n");
+    printf("  --completions-file      completions file (default: 'examples/control-vector-generator/completions.txt')\n");
+    printf("  -pf, --positive-file    positive prompts file, one prompt per line (default: 'positive.txt')\n");
+    printf("  -nf, --negative-file    negative prompts file, one prompt per line (default: 'negative.txt')\n");
     printf("\n");
     printf("gpt-opts: other options from main\n");
     printf("\n");
@@ -74,8 +84,7 @@ static int ctrlvec_params_parse_ex(int argc, char ** argv, ctrl_params & params)
             params.outfile = argv[arg_idx];
             // FIXME hack to skip these args in gpt_parse_params
             skipme += 2;
-        }
-        else {
+        } else {
             throw std::invalid_argument("error: missing argument for " + arg);
         }
     }
@@ -84,28 +93,25 @@ static int ctrlvec_params_parse_ex(int argc, char ** argv, ctrl_params & params)
             params.completions_file = argv[arg_idx];
             // FIXME hack to skip these args in gpt_parse_params
             skipme += 2;
-        }
-        else {
+        } else {
             throw std::invalid_argument("error: missing argument for " + arg);
         }
     }
-    if (arg == "--positive") {
+    if (arg == "--positive-file" || arg == "-pf") {
         if (++arg_idx < argc && strncmp(argv[arg_idx], arg_prefix.c_str(), 2) != 0) {
-            params.positive = argv[arg_idx];
+            params.positive_prompts_file = argv[arg_idx];
             // FIXME hack to skip these args in gpt_parse_params
             skipme += 2;
-        }
-        else {
+        } else {
             throw std::invalid_argument("error: missing argument for " + arg);
         }
     }
-    if (arg == "--negative") {
+    if (arg == "--negative-file" || arg == "-nf") {
         if (++arg_idx < argc && strncmp(argv[arg_idx], arg_prefix.c_str(), 2) != 0) {
-            params.negative = argv[arg_idx];
+            params.negative_prompts_file = argv[arg_idx];
             // FIXME hack to skip these args in gpt_parse_params
             skipme += 2;
-        }
-        else {
+        } else {
             throw std::invalid_argument("error: missing argument for " + arg);
         }
     }
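
Note on the new flags: the two prompt files are consumed strictly in parallel, i.e. line i of positive.txt is paired with line i of negative.txt, which is why main() later rejects files of unequal length. Below is a minimal standalone sketch of that contract (not part of the patch; load_lines is a hypothetical stand-in for the ctrlvec_load_prompt_file helper added further down):

    // pair_prompts.cpp - illustrates the line-by-line pairing assumed by -pf/-nf
    #include <fstream>
    #include <iostream>
    #include <stdexcept>
    #include <string>
    #include <vector>

    static std::vector<std::string> load_lines(const std::string & path) {
        std::ifstream file(path);
        if (!file.is_open()) throw std::runtime_error("Unable to open file " + path);
        std::vector<std::string> out;
        std::string line;
        while (std::getline(file, line)) {
            if (!line.empty()) out.push_back(line); // skip empty lines, as the patch does
        }
        return out;
    }

    int main() {
        auto pos = load_lines("positive.txt");
        auto neg = load_lines("negative.txt");
        if (pos.size() != neg.size()) {
            std::cerr << "number of positive and negative prompts must be equal\n";
            return 1;
        }
        for (size_t i = 0; i < pos.size(); ++i) {
            std::cout << "pair " << i << ": \"" << pos[i] << "\" <-> \"" << neg[i] << "\"\n";
        }
        return 0;
    }
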
@@ -128,6 +134,22 @@ static int ctrlvec_params_parse(int argc, char ** argv, ctrl_params & params) {
     return skipme;
 }
 
+static std::vector<std::string> ctrlvec_load_prompt_file(std::string path) {
+    std::vector<std::string> output;
+    std::ifstream file(path);
+    if (!file.is_open()) {
+        throw std::runtime_error("Unable to open file " + path);
+    }
+    std::string line;
+    while (std::getline(file, line)) {
+        if (!line.empty()) { // skip empty lines
+            output.push_back(line);
+        }
+    }
+    file.close();
+    return output;
+}
+
 static std::string format_template(std::string persona, std::string suffix) {
     const std::string user_tag = "[INST]";
     const std::string asst_tag = "[/INST]";
@@ -135,7 +157,7 @@ static std::string format_template(std::string persona, std::string suffix) {
     return user_tag + " Act as if you're extremely " + persona + ". " + asst_tag + " " + suffix;
 }
 
-static void populate_entries(ctrl_params & cparams) {
+/*static void populate_entries(ctrl_params & cparams) {
     std::string line;
     std::ifstream completions_file(cparams.completions_file);
     if (completions_file.is_open()) {
@@ -145,11 +167,10 @@ static void populate_entries(ctrl_params & cparams) {
             cparams.positive_entries.push_back(format_template(cparams.positive, line));
             cparams.negative_entries.push_back(format_template(cparams.negative, line));
         }
         completions_file.close();
-    }
-    else {
+    } else {
         throw std::invalid_argument("error: invalid completions file or file could not be opened");
     }
-} // TODO actually do something with this
+}*/ // TODO actually do something with this
 
 static std::string ggml_ne_string(const ggml_tensor * t) {
     std::string str;
@@ -236,7 +257,7 @@ static void calc_diff(callback_data & cb_data) {
     for (size_t il = 0; il < cb_data.v_pos.size(); il++) {
         auto & inp_pos = cb_data.v_pos[il];
         auto & inp_neg = cb_data.v_neg[il];
-        float * dest = (float *) malloc(n_elems * sizeof(float *));
+        float * dest = (float *) malloc(n_elems * sizeof(float));
         for (size_t i = 0; i < n_elems; i++) {
             dest[i] = inp_pos[i] - inp_neg[i];
         }
@@ -323,13 +344,23 @@ static std::vector<float> power_iteration(callback_data & cb_data, const float * matrix) {
 
 // TODO translate to ggml
 static void pca(callback_data & cb_data) {
-    for (int i = 0; i < cb_data.v_diff.size(); i++) {
-        float* matrix = square_diff(cb_data, i);
-        std::vector<float> eigenvector = power_iteration(cb_data, matrix);
-        cb_data.v_final.push_back(&eigenvector[0]);
-        delete[] matrix;
-        printf("Done with layer %d\n", i);
+    size_t n_threads = 8;
+    int n_layers = cb_data.v_diff.size();
+    std::vector<std::thread> threads;
+    // resize (not reserve), so that the workers can write to their slots directly
+    cb_data.v_final.resize(n_layers, NULL);
+    auto worker_function = [&](int worker_id) {
+        for (int il = worker_id; il < n_layers; il += n_threads) {
+            float * matrix = square_diff(cb_data, il);
+            std::vector<float> eigenvector = power_iteration(cb_data, matrix);
+            // copy to the heap - eigenvector goes out of scope at the end of this iteration
+            cb_data.v_final[il] = (float *) malloc(eigenvector.size() * sizeof(float));
+            for (size_t j = 0; j < eigenvector.size(); ++j) cb_data.v_final[il][j] = eigenvector[j];
+            delete[] matrix;
+            printf("Done with layer %d\n", il);
+        }
+    };
+    for (size_t i = 0; i < n_threads; ++i) {
+        threads.emplace_back(worker_function, i);
     }
+    for (auto & th : threads) th.join();
     printf("Done with PCA.");
 }
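
The threading model in pca() above is a fixed round-robin split: worker w processes layers w, w + n_threads, w + 2*n_threads, and so on. Each layer is owned by exactly one thread, so no locking is needed as long as every worker writes only to its own slots of the output vector (which is also why v_final must be sized up front rather than grown with push_back). A self-contained sketch of the same partitioning, with a dummy computation standing in for square_diff + power_iteration:

    // strided_split.cpp - the round-robin layer assignment used by pca()
    #include <cstdio>
    #include <thread>
    #include <vector>

    int main() {
        const int n_threads = 8;
        const int n_layers  = 32;               // e.g. a 7B llama-style model
        std::vector<int> owner(n_layers, -1);   // which worker handled each layer

        std::vector<std::thread> threads;
        for (int w = 0; w < n_threads; ++w) {
            threads.emplace_back([&owner, n_layers, n_threads, w]() {
                for (int il = w; il < n_layers; il += n_threads) {
                    owner[il] = w;              // stand-in for the per-layer PCA work
                }
            });
        }
        for (auto & th : threads) th.join();

        for (int il = 0; il < n_layers; ++il) {
            printf("layer %2d -> worker %d\n", il, owner[il]);
        }
        return 0;
    }

A caveat on the fixed stride: it balances well only when the per-layer cost is roughly uniform, which holds here since every layer runs the same power iteration on a matrix of the same size.
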
@@ -340,32 +371,29 @@ static std::string to_string(const T & val) {
     return ss.str();
 }
 
-static void export_gguf(callback_data & cb_data, const std::string fname, const std::string model_hint) {
+static void export_gguf(std::vector<float *> v_final, int n_embd, const std::string fname, const std::string model_hint) {
     struct gguf_context * ctx = gguf_init_empty();
 
     const std::string arch = "controlvector";
     gguf_set_val_str(ctx, "general.architecture", arch.c_str());
     gguf_set_val_str(ctx, (arch + ".model_hint").c_str(), model_hint.c_str());
-    gguf_set_val_i32(ctx, (arch + ".layer_count").c_str(), cb_data.v_final.size());
-
-    //size_t buf_size = 3u*cb_data.n_embd*sizeof(float); // TODO how much size do i need?
-    size_t buf_size = 128u*1024u*4096u; // FIXME placehokder
+    gguf_set_val_i32(ctx, (arch + ".layer_count").c_str(), v_final.size());
 
     // TODO customize mem size - I have no idea what this is supposed to be
     struct ggml_init_params params = {
-        /*.mem_size =*/ buf_size,
+        /*.mem_size =*/ v_final.size() * (ggml_tensor_overhead() + n_embd * sizeof(float)), // metadata + data per tensor, since no_alloc is false
         /*.mem_buffer =*/ NULL,
         /*.no_alloc =*/   false,
     };
     struct ggml_context * ctx_data = ggml_init(params);
 
-    for (int i = 0; i < cb_data.v_final.size(); ++i) {
+    for (size_t i = 0; i < v_final.size(); ++i) {
         // TODO this number is probably not right - figure out which layer is which
         // the python implementation uses a dict to handle this, we don't know if it's 1, 2, 3, 4... or other
         const std::string name = "direction." + to_string(i+1);
 
-        struct ggml_tensor * cur = ggml_new_tensor_1d(ctx_data, GGML_TYPE_F32, cb_data.n_embd);
+        struct ggml_tensor * cur = ggml_new_tensor_1d(ctx_data, GGML_TYPE_F32, n_embd);
 
         ggml_set_name(cur, name.c_str());
@@ -374,7 +402,7 @@ static void export_gguf(callback_data & cb_data, const std::string fname, const
     {
         float * data = (float *) cur->data;
         for(int j = 0; j < ggml_nelements(cur); j++) {
-            data[j] = cb_data.v_final[i][j];
+            data[j] = v_final[i][j];
         }
     }
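
On the mem_size above: with no_alloc = false, ggml places both the tensor metadata and the tensor data inside the context buffer, so the buffer has to hold at least n_tensors * (ggml_tensor_overhead() + payload) bytes. A small sketch of that arithmetic (the numbers are only examples; alignment padding may add a little on top):

    // ctx_size.cpp - rough sizing for a ggml context holding N 1-D F32 tensors
    #include <cstdio>
    #include "ggml.h"

    static size_t export_ctx_size(size_t n_tensors, size_t n_embd) {
        // per tensor: fixed metadata overhead + the F32 payload itself
        return n_tensors * (ggml_tensor_overhead() + n_embd * sizeof(float));
    }

    int main() {
        // e.g. 32 directions of a 4096-dim model
        printf("need at least %zu bytes\n", export_ctx_size(32, 4096));
        return 0;
    }
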
@@ -403,78 +431,116 @@ int main(int argc, char ** argv) {
     argc -= skipme;
     argv += skipme;
 
-    callback_data cb_data;
     gpt_params params;
     if (!gpt_params_parse(argc, argv, params)) {
         return 1;
     }
 
+    // load prompts
+    cparams.positive_prompts = ctrlvec_load_prompt_file(cparams.positive_prompts_file);
+    cparams.negative_prompts = ctrlvec_load_prompt_file(cparams.negative_prompts_file);
+    if (cparams.positive_prompts.size() != cparams.negative_prompts.size()) {
+        fprintf(stderr, "number of positive and negative prompts must be equal\n");
+        return 1;
+    }
+
     print_build_info();
     llama_backend_init();
     llama_numa_init(params.numa);
 
-    // pass the callback to the backend scheduler
-    // it will be executed for each node during the graph computation
-    params.cb_eval = cb_eval;
-    params.cb_eval_user_data = &cb_data;
-    params.warmup = false;
-
-    // init
+    // load the model to get hparams
     llama_model * model;
     llama_context * ctx;
     std::tie(model, ctx) = llama_init_from_gpt_params(params);
-    if (model == nullptr || ctx == nullptr) {
-        fprintf(stderr, "%s : failed to init\n", __func__);
-        return 1;
+
+    int n_layers = llama_n_layer(model);
+    int n_embd = llama_n_embd(model);
+    int n_prompts = cparams.positive_prompts.size();
+
+    // vector of finished vectors of size [n_embd], we have (n_layers - 1) vectors in total
+    std::vector<float *> v_final(n_layers - 1, NULL);
+    for (size_t i = 0; i < v_final.size(); ++i) {
+        v_final[i] = (float *) calloc(n_embd, sizeof(float));
+    }
+
+    llama_free(ctx);
+    llama_free_model(model);
+
+    for (int i = 0; i < n_prompts; ++i) {
+        callback_data cb_data;
+
+        // pass the callback to the backend scheduler
+        // it will be executed for each node during the graph computation
+        params.cb_eval = cb_eval;
+        params.cb_eval_user_data = &cb_data;
+        params.warmup = false;
+
+        // load model
+        llama_model * model;
+        llama_context * ctx;
+        std::tie(model, ctx) = llama_init_from_gpt_params(params);
+        if (model == nullptr || ctx == nullptr) {
+            fprintf(stderr, "%s : failed to init\n", __func__);
+            return 1;
+        }
+
+        const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+
+        /* TODO this just tokenizes the exact pos/neg strings, correct?
+         * instead we want to create a bunch of starter prompts for it to work off
+         * we need to run get_hidden_layers many many times and then figure out how to combine the resulting vectors
+         * see the blogpost + python implementation for reference
+         *
+         * https://vgel.me/posts/representation-engineering/
+         * https://github.com/vgel/repeng/blob/main/repeng/extract.py
+         */
+        std::string positive_prompt = cparams.positive_prompts[i];
+        std::string negative_prompt = cparams.negative_prompts[i];
+        std::vector<llama_token> tokens_pos = ::llama_tokenize(ctx, positive_prompt, add_bos);
+        std::vector<llama_token> tokens_neg = ::llama_tokenize(ctx, negative_prompt, add_bos);
+        size_t max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
+        padding_seq(ctx, tokens_pos, max_seq_len);
+        padding_seq(ctx, tokens_neg, max_seq_len);
+        cb_data.n_tokens = max_seq_len;
+        cb_data.n_embd = n_embd;
+
+        printf("Evaluating prompt: \"%s\" - \"%s\" (%zu tokens)\n", positive_prompt.c_str(), negative_prompt.c_str(), max_seq_len);
+
+        cb_data.is_eval_pos = true;
+        get_hidden_layers(ctx, tokens_pos);
+        cb_data.is_eval_pos = false;
+        get_hidden_layers(ctx, tokens_neg);
+
+        printf("%f %f \n", cb_data.v_pos[0][4096], cb_data.v_pos[0][4096]);
+        printf("%f %f \n", cb_data.v_neg[0][4096], cb_data.v_neg[0][4096]);
+
+        calc_diff(cb_data);
+        printf("%f %f \n", cb_data.v_diff[0][4096], cb_data.v_diff[0][4096]);
+
+        printf("Running PCA...\n");
+        pca(cb_data);
+
+        // add the output vector to v_final
+        for (size_t j = 0; j < cb_data.v_final.size(); ++j) {
+            for (int k = 0; k < n_embd; ++k) {
+                v_final[j][k] += cb_data.v_final[j][k];
+            }
+        }
+
+        llama_free(ctx);
+        llama_free_model(model);
     }
 
-    // print system information
-    {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
+    // calculate the mean value of v_final
+    // TODO: maybe using LERP here
+    for (size_t j = 0; j < v_final.size(); ++j) {
+        for (int k = 0; k < n_embd; ++k) {
+            v_final[j][k] /= n_prompts;
+        }
     }
 
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
-
-    /* TODO this just tokenizes the exact pos/neg strings, correct?
-     * instead we want to create a bunch of starter prompts for it to work off
-     * we need to run get_hidden_layers many many times and then figure out how to combine the resulting vectors
-     * see the blogpost + python implementation for reference
-     *
-     * https://vgel.me/posts/representation-engineering/
-     * https://github.com/vgel/repeng/blob/main/repeng/extract.py
-     */
-    std::vector<llama_token> tokens_pos = ::llama_tokenize(ctx, cparams.positive, add_bos);
-    std::vector<llama_token> tokens_neg = ::llama_tokenize(ctx, cparams.negative, add_bos);
-    size_t max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
-    padding_seq(ctx, tokens_pos, max_seq_len);
-    padding_seq(ctx, tokens_neg, max_seq_len);
-    cb_data.n_tokens = max_seq_len;
-    cb_data.n_embd = llama_n_embd(model);
-
-    cb_data.is_eval_pos = true;
-    get_hidden_layers(ctx, tokens_pos);
-    cb_data.is_eval_pos = false;
-    get_hidden_layers(ctx, tokens_neg);
-
-    printf("%f %f \n", cb_data.v_pos[0][4096], cb_data.v_pos[0][4096]);
-    printf("%f %f \n", cb_data.v_neg[0][4096], cb_data.v_neg[0][4096]);
-
-    calc_diff(cb_data);
-    printf("%f %f \n", cb_data.v_diff[0][4096], cb_data.v_diff[0][4096]);
-
-    pca(cb_data);
-
     // TODO figure out how to extract this from model - there's no API exposed to get model arch string
     // we need get_arch_name() from llama.cpp
     // TODO also has support been implemented for arches other than llama yet? see #5970
     std::string model_hint = "llama";
-    export_gguf(cb_data, cparams.outfile, model_hint);
-
-    //llama_print_timings(ctx);
-
-    llama_free(ctx);
-    llama_free_model(model);
+    export_gguf(v_final, n_embd, cparams.outfile, model_hint);
 
     llama_backend_free();
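
Putting the new main() flow together: every prompt pair yields one direction per layer, those directions are summed into v_final, and the export writes their element-wise mean. Below is the combination step in isolation, as a sketch with owning std::vectors instead of the raw malloc'd buffers the patch uses:

    // mean_directions.cpp - element-wise mean over per-prompt direction vectors
    #include <cstdio>
    #include <vector>

    // v[p][l][k]: prompt pair p, layer l, embedding component k
    static std::vector<std::vector<float>> mean_directions(
            const std::vector<std::vector<std::vector<float>>> & v) {
        const size_t n_prompts = v.size();
        const size_t n_layers  = v[0].size();
        const size_t n_embd    = v[0][0].size();

        std::vector<std::vector<float>> out(n_layers, std::vector<float>(n_embd, 0.0f));
        // accumulate the per-prompt directions layer by layer
        for (size_t p = 0; p < n_prompts; ++p)
            for (size_t l = 0; l < n_layers; ++l)
                for (size_t k = 0; k < n_embd; ++k)
                    out[l][k] += v[p][l][k];

        // divide by the number of prompt pairs to get the mean
        for (size_t l = 0; l < n_layers; ++l)
            for (size_t k = 0; k < n_embd; ++k)
                out[l][k] /= n_prompts;

        return out;
    }

    int main() {
        // two prompt pairs, one layer, n_embd = 2: mean of {1,2} and {3,4} is {2,3}
        std::vector<std::vector<std::vector<float>>> v = { { {1.0f, 2.0f} }, { {3.0f, 4.0f} } };
        auto m = mean_directions(v);
        printf("%.1f %.1f\n", m[0][0], m[0][1]);
        return 0;
    }
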