remove unnecessary multithreading
parent 544268888b
commit 3090c485b6

1 changed file with 0 additions and 97 deletions
@@ -17,15 +17,6 @@ struct diff_wrapper {
     size_t n_rows; // number of rows in the matrix for size calculation
 };
 
-/* TODO part of multithreading
-struct tokens_pair {
-    size_t max_seq_len;
-    std::string positive;
-    std::string negative;
-    std::vector<llama_token> tokens_pos;
-    std::vector<llama_token> tokens_neg;
-}; */
-
 struct callback_data {
     std::vector<uint8_t> data;
 
@@ -54,8 +45,6 @@ struct callback_data {
 struct ctrl_params {
     /* default meta parameters */
     bool always_reload = false;
-    // TODO part of multithreading
-    // bool max_batch = false;
     int n_completions = 64;
     int n_threads = 8;
 
@@ -95,8 +84,6 @@ static void print_usage(const char * executable) {
     printf(" -t, --num-threads N number of threads to use (do not confuse with gpt-opts -t)\n");
    printf(" default: 8\n");
     printf(" --always-reload reload the model for every new template to parse\n");
-    // TODO part of multithreading
-    //printf(" --max-batch maximize batch sizes, rather than optimizing for multithreading\n");
     printf("\n");
     printf("gpt-opts:\n");
     printf(" other options from main\n");
@@ -186,11 +173,6 @@ static int ctrlvec_params_parse_ex(int argc, char ** argv, ctrl_params & params)
             params.always_reload = true;
             skipme += 1;
         }
-        /* TODO part of multithreading
-        if (arg == "--max-batch") {
-            params.max_batch = true;
-            skipme += 1;
-        } */
         // TODO it might be nice QoL to have single positive/negative args
         // we do not handle any other unknown arguments here because they will be handled by gpt_parse_params
     }
@@ -251,61 +233,6 @@ static void populate_entries(ctrl_params & cparams, std::string positive, std::s
     }
 }
 
-/* TODO part of multithreading
-static size_t tokenize_pair(tokens_pair & tp, llama_context * ctx, const std::string & pos, const std::string & neg, const bool add_bos) {
-    tp.positive = pos;
-    tp.negative = neg;
-    tp.tokens_pos = ::llama_tokenize(ctx, pos, add_bos);
-    tp.tokens_neg = ::llama_tokenize(ctx, neg, add_bos);
-    tp.max_seq_len = std::max(tp.tokens_pos.size(), tp.tokens_neg.size());
-    padding_seq(ctx, tp.tokens_pos, tp.max_seq_len);
-    padding_seq(ctx, tp.tokens_neg, tp.max_seq_len);
-    return 2 * max_seq_len;
-}
-
-// current batching strategy works as follows:
-// each batch runs on one model load, since we reload the model after every batch to clear context
-// therefore each batch must be small enough to fit in the context size
-// we try to make the batches multiples of thread count so threads are used most efficiently
-static std::vector<std::vector<tokens_pair>> batch_prompts(llama_context * ctx, ctrl_params & cparams, int n_ctx, const bool add_bos) {
-    std::vector<std::vector<tokens_pair>> batched_prompts;
-    std::vector<tokens_pair> thread_batch;
-    std::vector<tokens_pair> batch;
-    size_t n_batch_tokens = 0;
-
-    for (size_t i = 0; i < cparams.positive_entries.size(); ++i) {
-        tokens_pair tp;
-        size_t n_tokens = tokenize_pair(tp, ctx, cparams.positive_entries[i], cparams.negative_entries[i], add_bos);
-        n_batch_tokens += n_tokens;
-
-        if (n_batch_tokens > n_ctx) {
-            if (cparams.max_batch) {
-                batch.insert(batch.end(), thread_batch.begin(), thread_batch.end());
-                thread_batch.clear();
-            }
-            batched_prompts.push_back(batch);
-            batch.clear();
-            n_batch_tokens = n_tokens;
-        }
-
-        thread_batch.push_back(tp);
-
-        if (thread_batch.size() >= cparams.n_threads) {
-            batch.insert(batch.end(), thread_batch.begin(), thread_batch.end());
-            thread_batch.clear();;
-        }
-    }
-
-    if (!thread_batch.empty()) {
-        batch.insert(batch.end(), thread_batch.begin(), thread_batch.end());
-    }
-    if (!batch.empty()) {
-        batched_prompts.push_back(batch);
-    }
-
-    return batched_prompts;
-} */
-
 static std::string ggml_ne_string(const ggml_tensor * t) {
     std::string str;
     for (int i = 0; i < GGML_MAX_DIMS; ++i) {
@@ -675,30 +602,6 @@ int main(int argc, char ** argv) {
 
     const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
 
-    /* TODO part of multithreading
-    std::vector<std::vector<tokens_pair>> & batched_prompts = batch_prompts(ctx, cparams, n_ctx, add_bos);
-    std::vector<std::thread> threads;
-    auto worker_function = [&](tokens_pair & tp) {
-        printf("Evaluating prompt: \"%s\" - \"%s\" (%ld tokens)\n", tp.positive.c_str(), tp.negative.c_str(), tp.max_seq_len);
-        // TODO so how do we deal with this?
-        // TODO we only have one cb_data object that everything gets passed to. so we need to be able to write to a different object per thread
-        // TODO but there's only one cb_eval function used as callback by the model... help wanted
-    };
-    printf("Batching prompts...\n");
-    for (int i = 0; i < batched_prompts.size(); ++i) {
-        for (int j = 0; j < batched_prompts[i].size(); ++j) {
-            threads.emplace_back(worker_function, batched_prompts[i][j]);
-        }
-        for (auto & th : threads) th.join();
-
-        // reload model for next batch
-        llama_free(ctx);
-        llama_free_model(model);
-        std::tie(model, ctx) = llama_init_from_gpt_params(params);
-    }
-    printf("Done with batching prompts.\n");
-    */
-
     int token_ct = 0;
 
     for(size_t i = 0; i < cparams.positive_entries.size(); ++i) {