diff --git a/examples/control-vector-generator/control-vector-generator.cpp b/examples/control-vector-generator/control-vector-generator.cpp
index 1f55ba5fa..3cd42fa82 100644
--- a/examples/control-vector-generator/control-vector-generator.cpp
+++ b/examples/control-vector-generator/control-vector-generator.cpp
@@ -17,15 +17,6 @@ struct diff_wrapper {
     size_t n_rows; // number of rows in the matrix for size calculation
 };
 
-/* TODO part of multithreading
-struct tokens_pair {
-    size_t max_seq_len;
-    std::string positive;
-    std::string negative;
-    std::vector<llama_token> tokens_pos;
-    std::vector<llama_token> tokens_neg;
-}; */
-
 struct callback_data {
     std::vector<uint8_t> data;
 
@@ -54,8 +45,6 @@ struct callback_data {
 struct ctrl_params {
     /* default meta parameters */
     bool always_reload = false;
-    // TODO part of multithreading
-    // bool max_batch = false;
     int n_completions = 64;
     int n_threads = 8;
 
@@ -95,8 +84,6 @@ static void print_usage(const char * executable) {
     printf("  -t, --num-threads N         number of threads to use (do not confuse with gpt-opts -t)\n");
     printf("                              default: 8\n");
     printf("  --always-reload             reload the model for every new template to parse\n");
-    // TODO part of multithreading
-    //printf("  --max-batch                 maximize batch sizes, rather than optimizing for multithreading\n");
     printf("\n");
     printf("gpt-opts:\n");
     printf("  other options from main\n");
@@ -186,11 +173,6 @@ static int ctrlvec_params_parse_ex(int argc, char ** argv, ctrl_params & params)
             params.always_reload = true;
             skipme += 1;
         }
-        /* TODO part of multithreading
-        if (arg == "--max-batch") {
-            params.max_batch = true;
-            skipme += 1;
-        } */
         // TODO it might be nice QoL to have single positive/negative args
        // we do not handle any other unknown arguments here because they will be handled by gpt_parse_params
     }
@@ -251,61 +233,6 @@ static void populate_entries(ctrl_params & cparams, std::string positive, std::s
     }
 }
 
-/* TODO part of multithreading
-static size_t tokenize_pair(tokens_pair & tp, llama_context * ctx, const std::string & pos, const std::string & neg, const bool add_bos) {
-    tp.positive = pos;
-    tp.negative = neg;
-    tp.tokens_pos = ::llama_tokenize(ctx, pos, add_bos);
-    tp.tokens_neg = ::llama_tokenize(ctx, neg, add_bos);
-    tp.max_seq_len = std::max(tp.tokens_pos.size(), tp.tokens_neg.size());
-    padding_seq(ctx, tp.tokens_pos, tp.max_seq_len);
-    padding_seq(ctx, tp.tokens_neg, tp.max_seq_len);
-    return 2 * max_seq_len;
-}
-
-// current batching strategy works as follows:
-// each batch runs on one model load, since we reload the model after every batch to clear context
-// therefore each batch must be small enough to fit in the context size
-// we try to make the batches multiples of thread count so threads are used most efficiently
-static std::vector<std::vector<tokens_pair>> batch_prompts(llama_context * ctx, ctrl_params & cparams, int n_ctx, const bool add_bos) {
-    std::vector<std::vector<tokens_pair>> batched_prompts;
-    std::vector<tokens_pair> thread_batch;
-    std::vector<tokens_pair> batch;
-    size_t n_batch_tokens = 0;
-
-    for (size_t i = 0; i < cparams.positive_entries.size(); ++i) {
-        tokens_pair tp;
-        size_t n_tokens = tokenize_pair(tp, ctx, cparams.positive_entries[i], cparams.negative_entries[i], add_bos);
-        n_batch_tokens += n_tokens;
-
-        if (n_batch_tokens > n_ctx) {
-            if (cparams.max_batch) {
-                batch.insert(batch.end(), thread_batch.begin(), thread_batch.end());
-                thread_batch.clear();
-            }
-            batched_prompts.push_back(batch);
-            batch.clear();
-            n_batch_tokens = n_tokens;
-        }
-
-        thread_batch.push_back(tp);
-
-        if (thread_batch.size() >= cparams.n_threads) {
-            batch.insert(batch.end(), thread_batch.begin(), thread_batch.end());
-            thread_batch.clear();;
-        }
-    }
-
-    if (!thread_batch.empty()) {
-        batch.insert(batch.end(), thread_batch.begin(), thread_batch.end());
-    }
-    if (!batch.empty()) {
-        batched_prompts.push_back(batch);
-    }
-
-    return batched_prompts;
-} */
-
 static std::string ggml_ne_string(const ggml_tensor * t) {
     std::string str;
     for (int i = 0; i < GGML_MAX_DIMS; ++i) {
@@ -675,30 +602,6 @@ int main(int argc, char ** argv) {
 
     const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
 
-    /* TODO part of multithreading
-    std::vector<std::vector<tokens_pair>> & batched_prompts = batch_prompts(ctx, cparams, n_ctx, add_bos);
-    std::vector<std::thread> threads;
-    auto worker_function = [&](tokens_pair & tp) {
-        printf("Evaluating prompt: \"%s\" - \"%s\" (%ld tokens)\n", tp.positive.c_str(), tp.negative.c_str(), tp.max_seq_len);
-        // TODO so how do we deal with this?
-        // TODO we only have one cb_data object that everything gets passed to. so we need to be able to write to a different object per thread
-        // TODO but there's only one cb_eval function used as callback by the model... help wanted
-    };
-    printf("Batching prompts...\n");
-    for (int i = 0; i < batched_prompts.size(); ++i) {
-        for (int j = 0; j < batched_prompts[i].size(); ++j) {
-            threads.emplace_back(worker_function, batched_prompts[i][j]);
-        }
-        for (auto & th : threads) th.join();
-
-        // reload model for next batch
-        llama_free(ctx);
-        llama_free_model(model);
-        std::tie(model, ctx) = llama_init_from_gpt_params(params);
-    }
-    printf("Done with batching prompts.\n");
-    */
-
     int token_ct = 0;
 
     for(size_t i = 0; i < cparams.positive_entries.size(); ++i) {
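
For readers following the removal above: the commented-out `tokenize_pair` helper tokenized a positive/negative prompt pair and padded both token sequences to a common length before batching. Below is a minimal standalone sketch of that idea, not the example's actual code; it assumes llama.cpp's `::llama_tokenize` helper from `common.h`, and it substitutes a hypothetical `pad_tokens` function for the file's `padding_seq`, padding with the BOS token purely as an illustrative choice.

```cpp
// Sketch only (not the code removed above): tokenize a positive/negative prompt
// pair and pad both token sequences to the same length, mirroring what the
// removed tokenize_pair helper did. pad_tokens is a hypothetical stand-in for
// the file's padding_seq, and padding with BOS is an assumption for illustration.
#include <algorithm>
#include <string>
#include <vector>

#include "common.h"
#include "llama.h"

struct tokens_pair {
    size_t max_seq_len;
    std::string positive;
    std::string negative;
    std::vector<llama_token> tokens_pos;
    std::vector<llama_token> tokens_neg;
};

// append a filler token until the sequence reaches len
static void pad_tokens(llama_context * ctx, std::vector<llama_token> & tokens, size_t len) {
    const llama_token pad = llama_token_bos(llama_get_model(ctx)); // assumption: BOS as filler
    while (tokens.size() < len) {
        tokens.push_back(pad);
    }
}

// tokenize both prompts, pad them to a common length, and return the total token count
static size_t tokenize_pair(tokens_pair & tp, llama_context * ctx,
                            const std::string & pos, const std::string & neg, const bool add_bos) {
    tp.positive    = pos;
    tp.negative    = neg;
    tp.tokens_pos  = ::llama_tokenize(ctx, pos, add_bos);
    tp.tokens_neg  = ::llama_tokenize(ctx, neg, add_bos);
    tp.max_seq_len = std::max(tp.tokens_pos.size(), tp.tokens_neg.size());
    pad_tokens(ctx, tp.tokens_pos, tp.max_seq_len);
    pad_tokens(ctx, tp.tokens_neg, tp.max_seq_len);
    return 2 * tp.max_seq_len; // note: the removed code returned `2 * max_seq_len`, which would not compile
}
```

Padding both sides to `max_seq_len` simply keeps the positive and negative token sequences the same length, which is what the removed batching code relied on when sizing its batches.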