diff --git a/examples/common.cpp b/examples/common.cpp
index f1c3bae13..558c00de4 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -91,6 +91,7 @@ void process_escapes(std::string& input) {
 }
 
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
+    bool ethreads_set = false;
     bool invalid_param = false;
     bool escape_prompt = false;
     std::string arg;
@@ -114,6 +115,13 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.n_threads = std::stoi(argv[i]);
+        } else if (arg == "-e" || arg == "--eval-threads") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_ethreads = std::stoi(argv[i]);
+            ethreads_set = true;
         } else if (arg == "-p" || arg == "--prompt") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
@@ -339,6 +347,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             exit(1);
         }
     }
+
+    // if --eval-threads was not given, fall back to the same value as n_threads
+    if (!ethreads_set) {
+        params.n_ethreads = params.n_threads;
+    }
+
     if (invalid_param) {
         fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
         gpt_print_usage(argc, argv, default_params);
diff --git a/examples/common.h b/examples/common.h
index 842e1516f..3efd790c9 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -18,6 +18,7 @@ int32_t get_num_physical_cores();
 struct gpt_params {
     int32_t seed = -1; // RNG seed
     int32_t n_threads = get_num_physical_cores();
+    int32_t n_ethreads = get_num_physical_cores();
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions)
     int32_t n_ctx = 512; // context size
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index e4b729128..165a914d9 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -71,7 +71,7 @@ int main(int argc, char ** argv) {
 
     if (params.embedding){
         if (embd_inp.size() > 0) {
-            if (llama_eval(ctx, embd_inp.data(), embd_inp.size(), n_past, params.n_threads)) {
+            if (llama_eval(ctx, embd_inp.data(), embd_inp.size(), n_past, params.n_threads, params.n_ethreads)) {
                 fprintf(stderr, "%s : failed to eval\n", __func__);
                 return 1;
             }
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 5ac151e14..4f2fe75e8 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -123,12 +123,12 @@ int main(int argc, char ** argv) {
     if (params.mem_test) {
         {
             const std::vector<llama_token> tmp(params.n_batch, 0);
-            llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
+            llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads, params.n_ethreads);
         }
 
         {
             const std::vector<llama_token> tmp = { 0, };
-            llama_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_threads);
+            llama_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_threads, params.n_ethreads);
         }
 
         llama_print_timings(ctx);
@@ -360,7 +360,7 @@ int main(int argc, char ** argv) {
             if (n_eval > params.n_batch) {
                 n_eval = params.n_batch;
             }
-            if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) {
+            if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads, params.n_ethreads)) {
                 fprintf(stderr, "%s : failed to eval\n", __func__);
                 return 1;
             }
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index 299a19999..20d99a71b 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -41,10 +41,11 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
         std::vector<float> logits;
         int num_batches = (params.n_ctx + params.n_batch - 1) / params.n_batch;
         auto start_t = std::chrono::high_resolution_clock::now();
+
         for (int j = 0; j < num_batches; ++j) {
             int batch_start = start + j * params.n_batch;
             int batch_size = std::min(end - batch_start, params.n_batch);
-            if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * params.n_batch, params.n_threads)) {
+            if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * params.n_batch, params.n_threads, params.n_ethreads)) {
                 fprintf(stderr, "%s : failed to eval\n", __func__);
                 return;
             }
diff --git a/llama.cpp b/llama.cpp
index c36c6ced6..f97e6a277 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1045,13 +1045,15 @@ static bool llama_model_load(
 //   - tokens:    new batch of tokens to process
 //   - n_past:    the context size so far
 //   - n_threads: number of threads to use
+//   - n_ethreads: number of threads to use for single-token eval
 //
 static bool llama_eval_internal(
         llama_context & lctx,
     const llama_token * tokens,
             const int   n_tokens,
             const int   n_past,
-            const int   n_threads) {
+            const int   n_threads,
+            const int   n_ethreads) {
     const int64_t t_start_us = ggml_time_us();
 
     const int N = n_tokens;
@@ -1084,7 +1086,7 @@ static bool llama_eval_internal(
     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
     ggml_cgraph gf = {};
-    gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
+    gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : (N == 1 ? n_ethreads : n_threads);
 
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     ggml_set_name(embd, "embd");
@@ -2722,8 +2724,9 @@ int llama_eval(
     const llama_token * tokens,
                   int   n_tokens,
                   int   n_past,
-                  int   n_threads) {
-    if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads)) {
+                  int   n_threads,
+                  int   n_ethreads) {
+    if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads, n_ethreads)) {
         fprintf(stderr, "%s: failed to eval\n", __func__);
         return 1;
     }
diff --git a/llama.h b/llama.h
index 58c6e0699..99a05d1bb 100644
--- a/llama.h
+++ b/llama.h
@@ -153,7 +153,8 @@ extern "C" {
            const llama_token * tokens,
                          int   n_tokens,
                          int   n_past,
-                         int   n_threads);
+                         int   n_threads,
+                         int   n_ethreads);
 
     // Convert the provided text into tokens.
     // The tokens pointer must be large enough to hold the resulting tokens.
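
For reference, below is a minimal usage sketch of the updated llama_eval() signature. It is not part of the diff: it assumes the llama_init_from_file() / llama_tokenize() API as declared in this version of llama.h, and the model path, prompt, and thread counts are placeholders.

```cpp
// Usage sketch only: assumes the llama_init_from_file()/llama_tokenize() API of
// this llama.h vintage; model path, prompt, and thread counts are placeholders.
#include <cstdio>
#include <vector>

#include "llama.h"

int main() {
    llama_context_params cparams = llama_context_default_params();

    llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", cparams);
    if (ctx == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // tokenize the prompt (add_bos = true)
    std::vector<llama_token> tokens(cparams.n_ctx);
    const int n_prompt = llama_tokenize(ctx, "Hello", tokens.data(), (int) tokens.size(), true);
    if (n_prompt < 0) {
        fprintf(stderr, "tokenization failed\n");
        return 1;
    }

    // prompt processing: n_tokens > 1, so the graph runs with n_threads (8 here)
    if (llama_eval(ctx, tokens.data(), n_prompt, 0, /*n_threads =*/ 8, /*n_ethreads =*/ 4)) {
        fprintf(stderr, "prompt eval failed\n");
        return 1;
    }

    // single-token generation step: n_tokens == 1, so the graph runs with n_ethreads (4 here)
    const llama_token next = 0; // placeholder token, purely for illustration
    if (llama_eval(ctx, &next, 1, n_prompt, /*n_threads =*/ 8, /*n_ethreads =*/ 4)) {
        fprintf(stderr, "token eval failed\n");
        return 1;
    }

    llama_free(ctx);
    return 0;
}
```

With the defaulting logic added in common.cpp, callers that only pass -t behave exactly as before, since n_ethreads falls back to n_threads when --eval-threads is not given.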