From 43f1d316f5fb89ccbee7b0a6af93cf94dc580f61 Mon Sep 17 00:00:00 2001
From: Daniel Bevenius
Date: Wed, 22 May 2024 15:11:01 +0200
Subject: [PATCH] llama : add getters for n_threads/n_threads_batch

This commit adds two new functions to the llama API. The functions can
be used to get the number of threads used for generating a single token
and the number of threads used for prompt and batch processing
(multiple tokens).

The motivation for this is that we want to be able to get the number of
threads that a context is using. The main use case is
testing/verification that the number of threads has been set correctly.

Signed-off-by: Daniel Bevenius
---
 llama.cpp | 8 ++++++++
 llama.h   | 6 ++++++
 2 files changed, 14 insertions(+)

diff --git a/llama.cpp b/llama.cpp
index 34137c7ad..bbf99b7d8 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -17183,6 +17183,14 @@ void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_
     ctx->cparams.n_threads_batch = n_threads_batch;
 }
 
+uint32_t llama_get_n_threads(struct llama_context * ctx) {
+    return ctx->cparams.n_threads;
+}
+
+uint32_t llama_get_n_threads_batch(struct llama_context * ctx) {
+    return ctx->cparams.n_threads_batch;
+}
+
 void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
     ctx->abort_callback = abort_callback;
     ctx->abort_callback_data = abort_callback_data;
diff --git a/llama.h b/llama.h
index b7bf2afcb..2d0c8b01f 100644
--- a/llama.h
+++ b/llama.h
@@ -759,6 +759,12 @@ extern "C" {
     // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
     LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
 
+    // Get the number of threads used for generation of a single token.
+    LLAMA_API uint32_t llama_get_n_threads(struct llama_context * ctx);
+
+    // Get the number of threads used for prompt and batch processing (multiple tokens).
+    LLAMA_API uint32_t llama_get_n_threads_batch(struct llama_context * ctx);
+
     // Set whether to use causal attention or not
     // If set to true, the model will only attend to the past tokens
     LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
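
For reference, a minimal sketch of how the new getters could be exercised
for the testing/verification use case described above. The model path
"model.gguf" and the thread counts 4/8 are placeholders, and the
load/teardown calls assume the standard llama.cpp C API of this era:

    #include "llama.h"
    #include <stdio.h>

    int main(void) {
        llama_backend_init();

        // Load a model ("model.gguf" is a placeholder path).
        struct llama_model_params mparams = llama_model_default_params();
        struct llama_model * model = llama_load_model_from_file("model.gguf", mparams);
        if (model == NULL) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }

        struct llama_context_params cparams = llama_context_default_params();
        struct llama_context * ctx = llama_new_context_with_model(model, cparams);

        // Set the thread counts, then verify that they round-trip
        // through the context using the new getters.
        llama_set_n_threads(ctx, 4, 8);
        printf("n_threads       = %u\n", llama_get_n_threads(ctx));
        printf("n_threads_batch = %u\n", llama_get_n_threads_batch(ctx));

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }

Since the getters only read ctx->cparams, they report whatever was last
set via llama_context_params or llama_set_n_threads, which is exactly
what a verification test needs to observe.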