From 544268888bc2136e3e7a165d6ef6fea5bb857774 Mon Sep 17 00:00:00 2001
From: Christian Zhou-Zheng
Date: Sat, 1 Jun 2024 17:25:21 -0400
Subject: [PATCH] in-series multithreading for prompt embedding?

added commented-out code to attempt to start implementing multithreading for
embedding in main
---
 .../control-vector-generator.cpp | 115 ++++++++++++++++--
 1 file changed, 108 insertions(+), 7 deletions(-)

diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp
index 1e6d6b5e0..1f55ba5fa 100644
--- a/examples/control-vector-generator/control-vector-generator.cpp
+++ b/examples/control-vector-generator/control-vector-generator.cpp
@@ -17,6 +17,15 @@ struct diff_wrapper {
     size_t n_rows; // number of rows in the matrix for size calculation
 };
 
+/* TODO part of multithreading
+struct tokens_pair {
+    size_t max_seq_len;
+    std::string positive;
+    std::string negative;
+    std::vector<llama_token> tokens_pos;
+    std::vector<llama_token> tokens_neg;
+}; */
+
 struct callback_data {
     std::vector<uint8_t> data;
 
@@ -45,6 +54,8 @@ struct callback_data {
 
 struct ctrl_params {
     /* default meta parameters */
     bool always_reload = false;
+    // TODO part of multithreading
+    // bool max_batch = false;
     int n_completions = 64;
     int n_threads = 8;
@@ -84,6 +95,8 @@ static void print_usage(const char * executable) {
    printf("  -t, --num-threads N       number of threads to use (do not confuse with gpt-opts -t)\n");
    printf("                            default: 8\n");
    printf("  --always-reload           reload the model for every new template to parse\n");
+    // TODO part of multithreading
+    //printf("  --max-batch               maximize batch sizes, rather than optimizing for multithreading\n");
    printf("\n");
    printf("gpt-opts:\n");
    printf("  other options from main\n");
@@ -173,6 +186,11 @@ static int ctrlvec_params_parse_ex(int argc, char ** argv, ctrl_params & params)
             params.always_reload = true;
             skipme += 1;
         }
+        /* TODO part of multithreading
+        if (arg == "--max-batch") {
+            params.max_batch = true;
+            skipme += 1;
+        } */
         // TODO it might be nice QoL to have single positive/negative args
         // we do not handle any other unknown arguments here because they will be handled by gpt_parse_params
     }
@@ -209,10 +227,10 @@ static std::vector<std::string> ctrlvec_load_prompt_file(std::string path) {
 }
 
 static std::string format_template(std::string persona, std::string suffix) {
-    const std::string user_tag = "[INST]";
-    const std::string asst_tag = "[/INST]";
-    // TODO make this dynamic - allow the user to change it somehow - and adapt based on model
     //return user_tag + " Act as if you're extremely " + persona + ". " + asst_tag + " " + suffix;
+    //const std::string user_tag = "[INST]";
+    //const std::string asst_tag = "[/INST]";
+    // TODO make this dynamic - allow the user to change it somehow - and adapt based on model
     return persona + " " + suffix; // entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. [/INST]"
 }
 
@@ -233,6 +251,61 @@ static void populate_entries(ctrl_params & cparams, std::string positive, std::s
     }
 }
 
+/* TODO part of multithreading
+static size_t tokenize_pair(tokens_pair & tp, llama_context * ctx, const std::string & pos, const std::string & neg, const bool add_bos) {
+    tp.positive = pos;
+    tp.negative = neg;
+    tp.tokens_pos = ::llama_tokenize(ctx, pos, add_bos);
+    tp.tokens_neg = ::llama_tokenize(ctx, neg, add_bos);
+    tp.max_seq_len = std::max(tp.tokens_pos.size(), tp.tokens_neg.size());
+    padding_seq(ctx, tp.tokens_pos, tp.max_seq_len);
+    padding_seq(ctx, tp.tokens_neg, tp.max_seq_len);
+    return 2 * tp.max_seq_len;
+}
+
+// current batching strategy works as follows:
+// each batch runs on one model load, since we reload the model after every batch to clear context
+// therefore each batch must be small enough to fit in the context size
+// we try to make the batches multiples of thread count so threads are used most efficiently
+static std::vector<std::vector<tokens_pair>> batch_prompts(llama_context * ctx, ctrl_params & cparams, int n_ctx, const bool add_bos) {
+    std::vector<std::vector<tokens_pair>> batched_prompts;
+    std::vector<tokens_pair> thread_batch;
+    std::vector<tokens_pair> batch;
+    size_t n_batch_tokens = 0;
+
+    for (size_t i = 0; i < cparams.positive_entries.size(); ++i) {
+        tokens_pair tp;
+        size_t n_tokens = tokenize_pair(tp, ctx, cparams.positive_entries[i], cparams.negative_entries[i], add_bos);
+        n_batch_tokens += n_tokens;
+
+        if (n_batch_tokens > n_ctx) {
+            if (cparams.max_batch) {
+                batch.insert(batch.end(), thread_batch.begin(), thread_batch.end());
+                thread_batch.clear();
+            }
+            batched_prompts.push_back(batch);
+            batch.clear();
+            n_batch_tokens = n_tokens;
+        }
+
+        thread_batch.push_back(tp);
+
+        if (thread_batch.size() >= cparams.n_threads) {
+            batch.insert(batch.end(), thread_batch.begin(), thread_batch.end());
+            thread_batch.clear();
+        }
+    }
+
+    if (!thread_batch.empty()) {
+        batch.insert(batch.end(), thread_batch.begin(), thread_batch.end());
+    }
+    if (!batch.empty()) {
+        batched_prompts.push_back(batch);
+    }
+
+    return batched_prompts;
+} */
+
 static std::string ggml_ne_string(const ggml_tensor * t) {
     std::string str;
     for (int i = 0; i < GGML_MAX_DIMS; ++i) {
@@ -387,13 +460,14 @@ static void concatenate_diffs(callback_data & cb_data) {
 // BEGIN NON-GGML IMPLEMENTATION
 
 // TODO translate to ggml
-// this probably doesn't want to be here - put it into the compute graph as a step in processing each layer
+// this probably doesn't want to be a separate function - put it into the compute graph as a step in processing each layer
 static float* square_diff(callback_data & cb_data, size_t idx) {
     float* result = new float[cb_data.n_embd * cb_data.n_embd];
     std::memset(result, 0, cb_data.n_embd * cb_data.n_embd * sizeof(float));
     for (size_t i = 0; i < (size_t) cb_data.n_embd; i++) {
         for (size_t j = 0; j < (size_t) cb_data.n_embd; j++) {
             float sum = 0.0f;
+            // watch out for indexing - can't just use cb_data.n_tokens
             for (size_t k = 0; k < cb_data.v_diff[idx].n_rows; k++) {
                 sum += cb_data.v_diff[idx].diff[i + cb_data.n_embd * k] * cb_data.v_diff[idx].diff[j + cb_data.n_embd * k];
             }
@@ -560,6 +634,10 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "number of positive and negative prompts must be equal");
         return 1;
     }
+    if (cparams.positive_prompts.empty()) {
+        fprintf(stderr, "must provide at least one prompt pair");
+        return 1;
+    }
 
     callback_data cb_data;
 
@@ -578,6 +656,7 @@ int main(int argc, char ** argv) {
     llama_context * ctx;
     std::tie(model, ctx) = llama_init_from_gpt_params(params);
 
+    int n_ctx = llama_n_ctx(ctx);
     int n_layers = llama_n_layer(model);
     int n_embd = llama_n_embd(model);
     cb_data.n_embd = n_embd;
@@ -596,10 +675,32 @@ int main(int argc, char ** argv) {
 
     const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
 
-    int token_ct = 0;
-    int n_ctx = llama_n_ctx(ctx);
+    /* TODO part of multithreading
+    std::vector<std::vector<tokens_pair>> batched_prompts = batch_prompts(ctx, cparams, n_ctx, add_bos);
+    std::vector<std::thread> threads;
+    auto worker_function = [&](tokens_pair & tp) {
+        printf("Evaluating prompt: \"%s\" - \"%s\" (%ld tokens)\n", tp.positive.c_str(), tp.negative.c_str(), tp.max_seq_len);
+        // TODO so how do we deal with this?
+        // TODO we only have one cb_data object that everything gets passed to. so we need to be able to write to a different object per thread
+        // TODO but there's only one cb_eval function used as callback by the model... help wanted
+    };
+    printf("Batching prompts...\n");
+    for (int i = 0; i < batched_prompts.size(); ++i) {
+        for (int j = 0; j < batched_prompts[i].size(); ++j) {
+            threads.emplace_back(worker_function, batched_prompts[i][j]);
+        }
+        for (auto & th : threads) th.join();
+
+        // reload model for next batch
+        llama_free(ctx);
+        llama_free_model(model);
+        std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    }
+    printf("Done with batching prompts.\n");
+    */
+
+    int token_ct = 0;
 
-    // TODO multithread this
     for(size_t i = 0; i < cparams.positive_entries.size(); ++i) {
         std::string positive_prompt = cparams.positive_entries[i];
         std::string negative_prompt = cparams.negative_entries[i];