From 3815a0c3069fb8bb80c7d5913fb213c2ac62c98d Mon Sep 17 00:00:00 2001
From: Christian Zhou-Zheng
Date: Mon, 3 Jun 2024 21:26:13 -0400
Subject: [PATCH] pre-tokenize so we can allocate the correct amount of memory
 for ctx_diffs_wrapped

---
 .../control-vector-generator.cpp              | 65 ++++++++++++-------
 1 file changed, 43 insertions(+), 22 deletions(-)

diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp
index 59335f826..0135dfb18 100644
--- a/examples/control-vector-generator/control-vector-generator.cpp
+++ b/examples/control-vector-generator/control-vector-generator.cpp
@@ -91,6 +91,14 @@ struct ctrl_params {
     std::vector<std::string> negative_entries;
 };
 
+struct tokenized_prompt {
+    std::string positive;
+    std::string negative;
+    std::vector<llama_token> tokens_pos;
+    std::vector<llama_token> tokens_neg;
+    size_t max_seq_len;
+};
+
 template <typename T>
 static std::string to_string(const T & val) {
     std::stringstream ss;
@@ -713,11 +721,11 @@ int main(int argc, char ** argv) {
     cparams.positive_prompts = ctrlvec_load_prompt_file(cparams.positive_prompts_file);
     cparams.negative_prompts = ctrlvec_load_prompt_file(cparams.negative_prompts_file);
     if (cparams.positive_prompts.size() != cparams.negative_prompts.size()) {
-        fprintf(stderr, "number of positive and negative prompts must be equal");
+        fprintf(stderr, "number of positive and negative prompts must be equal\n");
         return 1;
     }
     if (cparams.positive_prompts.empty()) {
-        fprintf(stderr, "must provide at least one prompt pair");
+        fprintf(stderr, "must provide at least one prompt pair\n");
         return 1;
     }
 
@@ -751,6 +759,29 @@ int main(int argc, char ** argv) {
     };
     cb_data.ctx_ggml = ggml_init(params_ggml);
 
+    // create templated prompts
+    for (int i = 0; i < n_prompts; ++i) {
+        populate_entries(cparams, cparams.positive_prompts[i], cparams.negative_prompts[i]);
+    }
+
+    // we have to pretokenize everything up front, because otherwise we don't know how much overhead to allocate for ctx_diffs_wrapped
+    std::vector<tokenized_prompt> tokenized_prompts;
+    size_t n_total_tokens = 0;
+    for (size_t i = 0; i < cparams.positive_entries.size(); ++i) {
+        tokenized_prompt t;
+        t.positive = cparams.positive_entries[i];
+        t.negative = cparams.negative_entries[i];
+        t.tokens_pos = ::llama_tokenize(ctx, t.positive, false);
+        t.tokens_neg = ::llama_tokenize(ctx, t.negative, false);
+        t.max_seq_len = std::max(t.tokens_pos.size(), t.tokens_neg.size());
+        padding_seq(ctx, t.tokens_pos, t.max_seq_len);
+        padding_seq(ctx, t.tokens_neg, t.max_seq_len);
+        n_total_tokens += 2 * t.max_seq_len;
+        tokenized_prompts.push_back(t);
+    }
+
+    std::cout << "n_total_tokens: " << n_total_tokens << std::endl;
+
     // init diff_ctx
     diff_ctx dctx;
 
@@ -759,7 +790,7 @@ int main(int argc, char ** argv) {
     // we will either have to pretokenize everything so we know how much memory to allocate
     // or allocate the tensor overhead as we go
     struct ggml_init_params params_diffs_wrapped = {
-        /*.mem_size =*/ ggml_tensor_overhead() * n_prompts * n_layers * 16u,
+        /*.mem_size =*/ ggml_tensor_overhead() * n_total_tokens,
         /*.mem_buffer =*/ NULL,
         /*.no_alloc =*/ true,
     };
@@ -782,46 +813,35 @@ int main(int argc, char ** argv) {
     dctx.ctx_diff = ggml_init(params_diff);
     dctx.ctx_final = ggml_init(params_final);
 
-    // create templated prompts
-    for (int i = 0; i < n_prompts; ++i) {
-        populate_entries(cparams, cparams.positive_prompts[i], cparams.negative_prompts[i]);
-    }
-
     const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
 
     int token_ct = 0;
 
     for(size_t i = 0; i < cparams.positive_entries.size(); ++i) {
-        std::string positive_prompt = cparams.positive_entries[i];
-        std::string negative_prompt = cparams.negative_entries[i];
-        std::vector<llama_token> tokens_pos = ::llama_tokenize(ctx, positive_prompt, add_bos);
-        std::vector<llama_token> tokens_neg = ::llama_tokenize(ctx, negative_prompt, add_bos);
-        size_t max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
-        padding_seq(ctx, tokens_pos, max_seq_len);
-        padding_seq(ctx, tokens_neg, max_seq_len);
-        cb_data.n_tokens = max_seq_len;
+        tokenized_prompt t = tokenized_prompts[i];
+        cb_data.n_tokens = t.max_seq_len;
 
         // need to reload the model so it doesn't run out of context
         // this should scale with -c option passed by main
-        token_ct += 2 * max_seq_len;
+        token_ct += 2 * t.max_seq_len;
         if (token_ct > n_ctx || cparams.always_reload) {
             //break;
             llama_free(ctx);
             llama_free_model(model);
             std::tie(model, ctx) = llama_init_from_gpt_params(params);
-            token_ct = 2 * max_seq_len;
+            token_ct = 2 * t.max_seq_len;
         }
         if (token_ct > n_ctx) {
             fprintf(stderr, "context size exceeded on iteration %zu\n", i);
             break;
         }
 
-        printf("Evaluating prompt: \"%s\" - \"%s\" (%ld tokens)\n", positive_prompt.c_str(), negative_prompt.c_str(), max_seq_len);
+        printf("Evaluating prompt: \"%s\" - \"%s\" (%zu tokens)\n", t.positive.c_str(), t.negative.c_str(), t.max_seq_len);
 
         cb_data.is_eval_pos = true;
-        get_hidden_layers(ctx, tokens_pos);
+        get_hidden_layers(ctx, t.tokens_pos);
         cb_data.is_eval_pos = false;
-        get_hidden_layers(ctx, tokens_neg);
+        get_hidden_layers(ctx, t.tokens_neg);
 
         calc_diff(cb_data, dctx);
 
@@ -861,7 +881,8 @@ int main(int argc, char ** argv) {
     export_gguf(dctx, n_layers, cparams.outfile, model_hint);
 
     llama_backend_free();
-    std::cout << "okay which of you is failing" << std::endl;
+
+    printf("confirm we got here\n");
 
     // TODO free(): invalid pointer after the entire program is done????????
     // probably because destructors free after you've already manually freed
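
A note on the sizing rule this patch relies on (not part of the patch itself): a ggml context created with no_alloc = true stores only per-tensor metadata, so reserving ggml_tensor_overhead() bytes for every tensor that will ever be created in it bounds mem_size up front. The standalone sketch below illustrates that pattern under that assumption; n_tensors is a hypothetical stand-in for the patch's n_total_tokens upper bound.

    // minimal sketch: size a metadata-only ggml context by tensor count
    #include "ggml.h"
    #include <cstdio>

    int main() {
        const size_t n_tensors = 1024; // stand-in for n_total_tokens

        struct ggml_init_params params = {
            /*.mem_size   =*/ ggml_tensor_overhead() * n_tensors,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true, // metadata only; data buffers live elsewhere
        };
        struct ggml_context * ctx = ggml_init(params);

        // creating up to n_tensors tensors stays within the reserved pool,
        // because each one consumes ggml_tensor_overhead() bytes of metadata
        for (size_t i = 0; i < n_tensors; ++i) {
            ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4096);
        }

        printf("per-tensor overhead: %zu bytes\n", ggml_tensor_overhead());
        ggml_free(ctx);
        return 0;
    }

This is why the patch pre-tokenizes: n_total_tokens is only known after tokenizing every prompt pair, and it serves as the tensor-count bound when ctx_diffs_wrapped is initialized.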