diff --git a/examples/common.cpp b/examples/common.cpp
index 7502c87ea..c531639fb 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -534,7 +534,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stdout, "  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for < 0)\n");
     fprintf(stdout, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
     fprintf(stdout, "  -ppt N, --pp-threads N\n");
-    fprintf(stdout, "                        number of threads to use during prompt processing (default is equal to --threads)\n");
+    fprintf(stdout, "                        number of threads to use during prompt processing (default: %d)\n", params.pp_threads);
     fprintf(stdout, "  -p PROMPT, --prompt PROMPT\n");
     fprintf(stdout, "                        prompt to start generation with (default: empty)\n");
     fprintf(stdout, "  -e                    process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index dba3bdab2..59fc8a295 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -133,8 +133,8 @@ int main(int argc, char ** argv) {
     // print system information
     {
         fprintf(stderr, "\n");
-        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
-                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
+        fprintf(stderr, "system_info: n_threads = %d / %d | pp_threads = %d / %d | %s\n",
+                params.n_threads, std::thread::hardware_concurrency(), params.pp_threads, std::thread::hardware_concurrency(), llama_print_system_info());
     }
 
     // determine the maximum memory usage needed to do inference for the given n_batch and n_ctx parameters
diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
index 61c71c358..4821fad5d 100644
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -56,7 +56,7 @@ int main(int argc, char ** argv) {
     }
 
     // evaluate prompt
-    llama_eval(ctx, tokens.data(), n_prompt_tokens, n_past, params.n_threads);
+    llama_eval(ctx, tokens.data(), n_prompt_tokens, n_past, params.n_threads, params.pp_threads);
 
     last_n_tokens_data.insert(last_n_tokens_data.end(), tokens.data(), tokens.data() + n_prompt_tokens);
     n_past += n_prompt_tokens;
@@ -93,7 +93,7 @@ int main(int argc, char ** argv) {
         last_n_tokens_data.push_back(next_token);
 
         printf("%s", next_token_str);
-        if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
+        if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads, params.pp_threads)) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
             llama_free(ctx);
             llama_free_model(model);
@@ -153,7 +153,7 @@ int main(int argc, char ** argv) {
         last_n_tokens_data.push_back(next_token);
 
         printf("%s", next_token_str);
-        if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
+        if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads, params.pp_threads)) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
             llama_free(ctx2);
             llama_free_model(model);
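
For downstream callers updating to the new signature, here is a minimal sketch of how the two thread counts are passed through `llama_eval` after this patch. The helper name `evaluate_prompt_and_step` is purely illustrative, and the context setup, tokenization, and sampling are assumed to happen elsewhere; only the `llama_eval` calls mirror the call sites changed in this diff.

```cpp
// Hypothetical caller sketch (not part of this patch). It assumes `ctx` is an
// already-initialized llama_context and `prompt_tokens` holds a tokenized
// prompt; initialization and sampling are elided.
#include <cstdio>
#include <vector>

#include "llama.h"

static bool evaluate_prompt_and_step(llama_context * ctx,
                                     const std::vector<llama_token> & prompt_tokens,
                                     llama_token next_token,
                                     int n_threads, int pp_threads) {
    int n_past = 0;

    // Batch prompt evaluation: pp_threads is meant for this phase,
    // per the --pp-threads usage text added in common.cpp.
    if (llama_eval(ctx, prompt_tokens.data(), (int) prompt_tokens.size(),
                   n_past, n_threads, pp_threads)) {
        fprintf(stderr, "failed to evaluate prompt\n");
        return false;
    }
    n_past += (int) prompt_tokens.size();

    // Single-token generation step: both counts are still passed, mirroring
    // the updated save-load-state.cpp call sites above.
    if (llama_eval(ctx, &next_token, 1, n_past, n_threads, pp_threads)) {
        fprintf(stderr, "failed to evaluate\n");
        return false;
    }
    return true;
}
```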