threadpool: consistent use of int type for n_threads params

Max Krasnyansky, 2024-08-24 10:50:06 -07:00 (committed by fmz)
parent 2358bb364b
commit 4a4d71501b
7 changed files with 27 additions and 27 deletions

View file

@@ -68,7 +68,7 @@ enum dimre_method {
 };
 struct cpu_params {
-    int32_t n_threads = -1;
+    int n_threads = -1;
     bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
     bool mask_valid = false; // Default: any CPU
     int32_t priority = 0; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
@@ -214,7 +214,7 @@ struct gpt_params {
     int32_t port = 8080; // server listens on this network port
     int32_t timeout_read = 600; // http read timeout in seconds
     int32_t timeout_write = timeout_read; // http write timeout in seconds
-    int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
+    int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
     std::string hostname = "127.0.0.1";
     std::string public_path = "";
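
For orientation, a minimal sketch of how the retyped field is consumed downstream (hypothetical values; ggml_threadpool_params_from_cpu_params is the converter that appears in the main.cpp hunk below, here assumed to be declared in common.h):

    #include "common.h" // cpu_params (assumption: also declares the converter)

    cpu_params cp;
    cp.n_threads = 8; // plain int after this change; -1 keeps the "auto" default
    cp.priority  = 0; // normal scheduling priority

    // Derive threadpool params from the CPU params (see the main.cpp hunk below).
    struct ggml_threadpool_params tpp = ggml_threadpool_params_from_cpu_params(cp);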

View file

@@ -54,7 +54,7 @@ static void tensor_dump(const ggml_tensor * tensor, const char * name) {
 #define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor)
 struct benchmark_params_struct {
-    int32_t n_threads = 1;
+    int n_threads = 1;
     int32_t n_iterations = 10;
 };

View file

@@ -223,7 +223,7 @@ int main(int argc, char ** argv) {
     LOG("%s: llama threadpool init = n_threads = %d\n",
         __func__,
-        (int32_t) params.cpuparams.n_threads
+        (int) params.cpuparams.n_threads
     );
     struct ggml_threadpool_params tpp_batch =
         ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);

View file

@@ -629,7 +629,7 @@ extern "C" {
     struct ggml_threadpool_params {
         bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores
         bool mask_specified; // mask is non-empty
-        int32_t n_threads; // number of threads
+        int n_threads; // number of threads
         int32_t prio; // thread priority
         uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
         bool strict_cpu; // strict cpu placement
@@ -2028,7 +2028,7 @@ extern "C" {
     GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params *p0, const struct ggml_threadpool_params *p1);
     GGML_API struct ggml_compute_threadpool* ggml_create_threadpool (struct ggml_threadpool_params * params);
     GGML_API void ggml_release_threadpool (struct ggml_compute_threadpool * threadpool);
-    GGML_API int32_t ggml_threadpool_get_n_threads(struct ggml_compute_threadpool * threadpool);
+    GGML_API int ggml_threadpool_get_n_threads(struct ggml_compute_threadpool * threadpool);
     GGML_API void ggml_pause_threadpool (struct ggml_compute_threadpool * threadpool);
     GGML_API void ggml_resume_threadpool (struct ggml_compute_threadpool * threadpool);
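
A minimal usage sketch against the declarations above (an assumption-laden sketch, not the canonical flow: fields of ggml_threadpool_params not shown in this hunk are left zero-initialized, and error handling is elided):

    #include "ggml.h"

    struct ggml_threadpool_params tpp = {}; // fields beyond this hunk left zeroed (assumption)
    tpp.n_threads = 4;                      // plain int after this change
    tpp.prio      = 0;                      // normal thread priority
    tpp.poll      = 50;                     // moderate polling level

    struct ggml_compute_threadpool * tp = ggml_create_threadpool(&tpp);
    int n_threads = ggml_threadpool_get_n_threads(tp); // now returns int
    ggml_pause_threadpool(tp);
    ggml_resume_threadpool(tp);
    ggml_release_threadpool(tp);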

View file

@@ -1973,8 +1973,8 @@ struct ggml_compute_threadpool {
     atomic_bool pause; // Used for pausing the threadpool or individual threads
     struct ggml_compute_state * workers; // per thread state
-    int32_t n_threads_max; // number of threads in the pool
-    int32_t n_threads_cur; // number of threads used in the current graph
+    int n_threads_max; // number of threads in the pool
+    int n_threads_cur; // number of threads used in the current graph
     int32_t prio; // Scheduling priority
     uint32_t poll; // Polling level (0 - no polling)
@@ -18859,7 +18859,7 @@ void ggml_release_threadpool(struct ggml_compute_threadpool* threadpool) {
 #ifndef GGML_USE_OPENMP
     struct ggml_compute_state* workers = threadpool->workers;
-    const int32_t n_threads = threadpool->n_threads_max;
+    const int n_threads = threadpool->n_threads_max;
     ggml_mutex_lock(&threadpool->mutex);
@@ -18869,7 +18869,7 @@ void ggml_release_threadpool(struct ggml_compute_threadpool* threadpool) {
     ggml_cond_broadcast(&threadpool->cond);
     ggml_mutex_unlock(&threadpool->mutex);
-    for (int32_t j = 1; j < n_threads; j++) {
+    for (int j = 1; j < n_threads; j++) {
         int32_t rc = ggml_thread_join(workers[j].thrd, NULL);
         GGML_ASSERT(rc == GGML_EXIT_SUCCESS || rc == GGML_EXIT_ABORTED);
         UNUSED(rc);
@@ -18925,11 +18925,11 @@ void ggml_resume_threadpool(struct ggml_compute_threadpool * threadpool) {
 struct ggml_cplan ggml_graph_plan(
         const struct ggml_cgraph * cgraph,
-        int32_t n_threads,
+        int n_threads,
         struct ggml_compute_threadpool * threadpool) {
     if (threadpool == NULL) {
-        GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %u\n", n_threads);
+        GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
     }
     if (n_threads <= 0) {
         n_threads = threadpool ? threadpool->n_threads_max : GGML_DEFAULT_N_THREADS;
@@ -19348,13 +19348,13 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
     GGML_ASSERT(cplan->n_threads > 0);
     GGML_ASSERT(cplan->work_size == 0 || cplan->work_data != NULL);
-    int32_t n_threads = cplan->n_threads;
+    int n_threads = cplan->n_threads;
     struct ggml_compute_threadpool * threadpool = cplan->threadpool;
     bool disposable_threadpool = false;
     if (threadpool == NULL) {
-        GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %u\n", n_threads);
+        GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
         disposable_threadpool = true;
         struct ggml_threadpool_params ttp = {
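
Sketch of the caller-side flow with the retyped n_threads (assumes 'graph' is an already-built struct ggml_cgraph *; passing a NULL threadpool takes the disposable-threadpool path shown above):

    #include <cstdint>
    #include <vector>

    int n_threads = 8; // plain int end to end after this change

    struct ggml_cplan cplan = ggml_graph_plan(graph, n_threads, /*threadpool=*/nullptr);

    // Work buffer is required when work_size > 0, per the asserts above.
    std::vector<uint8_t> work_buf(cplan.work_size);
    if (cplan.work_size > 0) {
        cplan.work_data = work_buf.data();
    }

    enum ggml_status status = ggml_graph_compute(graph, &cplan);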

View file

@@ -304,8 +304,8 @@ extern "C" {
     uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode
     uint32_t n_ubatch; // physical maximum batch size
     uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)
-    uint32_t n_threads; // number of threads to use for generation
-    uint32_t n_threads_batch; // number of threads to use for batch processing
+    int n_threads; // number of threads to use for generation
+    int n_threads_batch; // number of threads to use for batch processing
     enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
     enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
@@ -851,13 +851,13 @@ extern "C" {
     // Set the number of threads used for decoding
     // n_threads is the number of threads used for generation (single token)
     // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
-    LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
+    LLAMA_API void llama_set_n_threads(struct llama_context * ctx, int n_threads, int n_threads_batch);
     // Get the number of threads used for generation of a single token.
-    LLAMA_API uint32_t llama_n_threads(struct llama_context * ctx);
+    LLAMA_API int llama_n_threads(struct llama_context * ctx);
     // Get the number of threads used for prompt and batch processing (multiple token).
-    LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
+    LLAMA_API int llama_n_threads_batch(struct llama_context * ctx);
     // Set whether the model is in embeddings mode or not
     // If true, embeddings will be returned but logits will not
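
A usage sketch for the retyped calls (assumes an initialized struct llama_context * ctx; the thread counts are illustrative):

    // Set generation and batch thread counts in one call; both are plain int now.
    llama_set_n_threads(ctx, /*n_threads=*/4, /*n_threads_batch=*/8);

    int n_gen   = llama_n_threads(ctx);       // -> 4
    int n_batch = llama_n_threads_batch(ctx); // -> 8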

View file

@@ -2373,8 +2373,8 @@ struct llama_cparams {
     uint32_t n_batch;
     uint32_t n_ubatch;
     uint32_t n_seq_max;
-    uint32_t n_threads; // number of threads to use for generation
-    uint32_t n_threads_batch; // number of threads to use for batch processing
+    int n_threads; // number of threads to use for generation
+    int n_threads_batch; // number of threads to use for batch processing
     float rope_freq_base;
     float rope_freq_scale;
@@ -15530,7 +15530,7 @@ static std::pair<int32_t, ggml_compute_threadpool_t> llama_swap_threadpools(
         int32_t n_tokens) {
     const auto & cparams = lctx.cparams;
-    int32_t n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
+    int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
     ggml_compute_threadpool_t threadpool = nullptr; // nullptr -> disposable threadpool
@@ -15665,7 +15665,7 @@ static int llama_decode_internal(
     std::pair<int32_t, ggml_compute_threadpool_t> threads =
         llama_swap_threadpools(lctx, n_tokens);
-    int32_t n_threads = threads.first;
+    int n_threads = threads.first;
     ggml_compute_threadpool_t threadpool = threads.second;
     GGML_ASSERT(n_threads > 0);
@@ -15909,7 +15909,7 @@ static int llama_encode_internal(
     std::pair<int32_t, ggml_compute_threadpool_t> threads =
         llama_swap_threadpools(lctx, n_tokens);
-    int32_t n_threads = threads.first;
+    int n_threads = threads.first;
     ggml_compute_threadpool_t threadpool = threads.second;
     GGML_ASSERT(n_threads > 0);
@@ -19448,16 +19448,16 @@ size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepa
     }
 }
-void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
+void llama_set_n_threads(struct llama_context * ctx, int n_threads, int n_threads_batch) {
     ctx->cparams.n_threads = n_threads;
     ctx->cparams.n_threads_batch = n_threads_batch;
 }
-uint32_t llama_n_threads(struct llama_context * ctx) {
+int llama_n_threads(struct llama_context * ctx) {
     return ctx->cparams.n_threads;
 }
-uint32_t llama_n_threads_batch(struct llama_context * ctx) {
+int llama_n_threads_batch(struct llama_context * ctx) {
     return ctx->cparams.n_threads_batch;
 }