diff --git a/common/common.h b/common/common.h
index c2b72f6de..a665716be 100644
--- a/common/common.h
+++ b/common/common.h
@@ -68,7 +68,7 @@ enum dimre_method {
 };
 
 struct cpu_params {
-    int32_t n_threads = -1;
+    int     n_threads = -1;
     bool    cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
     bool    mask_valid = false;                    // Default: any CPU
     int32_t priority   = 0;                        // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
@@ -214,7 +214,7 @@ struct gpt_params {
     int32_t port           = 8080;         // server listens on this network port
     int32_t timeout_read   = 600;          // http read timeout in seconds
     int32_t timeout_write  = timeout_read; // http write timeout in seconds
-    int32_t n_threads_http = -1;           // number of threads to process HTTP requests (TODO: support threadpool)
+    int     n_threads_http = -1;           // number of threads to process HTTP requests (TODO: support threadpool)
 
     std::string hostname    = "127.0.0.1";
     std::string public_path = "";
diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp
index e78f6b388..97622f4f4 100644
--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@@ -54,7 +54,7 @@ static void tensor_dump(const ggml_tensor * tensor, const char * name) {
 #define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor)
 
 struct benchmark_params_struct {
-    int32_t n_threads    = 1;
+    int     n_threads    = 1;
     int32_t n_iterations = 10;
 };
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 42f6eb5d4..a64c1bc25 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -223,7 +223,7 @@ int main(int argc, char ** argv) {
 
     LOG("%s: llama threadpool init = n_threads = %d\n",
         __func__,
-        (int32_t) params.cpuparams.n_threads
+        (int) params.cpuparams.n_threads
     );
 
     struct ggml_threadpool_params tpp_batch = ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 1afba914d..1f9e6756e 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -629,7 +629,7 @@ extern "C" {
     struct ggml_threadpool_params {
         bool     cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores
         bool     mask_specified;              // mask is non-empty
-        int32_t  n_threads;                   // number of threads
+        int      n_threads;                   // number of threads
         int32_t  prio;                        // thread priority
         uint32_t poll;                        // polling level (0 - no polling, 100 - aggressive polling)
         bool     strict_cpu;                  // strict cpu placement
@@ -2028,7 +2028,7 @@ extern "C" {
    GGML_API bool                             ggml_threadpool_params_match (const struct ggml_threadpool_params *p0, const struct ggml_threadpool_params *p1);
    GGML_API struct ggml_compute_threadpool*  ggml_create_threadpool       (struct ggml_threadpool_params * params);
    GGML_API void                             ggml_release_threadpool      (struct ggml_compute_threadpool * threadpool);
-   GGML_API int32_t                          ggml_threadpool_get_n_threads(struct ggml_compute_threadpool * threadpool);
+   GGML_API int                              ggml_threadpool_get_n_threads(struct ggml_compute_threadpool * threadpool);
    GGML_API void                             ggml_pause_threadpool        (struct ggml_compute_threadpool * threadpool);
    GGML_API void                             ggml_resume_threadpool       (struct ggml_compute_threadpool * threadpool);
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index bf9c6b20c..2c8f5a7e3 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1973,8 +1973,8 @@ struct ggml_compute_threadpool {
     atomic_bool pause;         // Used for pausing the threadpool or individual threads
 
     struct ggml_compute_state * workers; // per thread state
-    int32_t     n_threads_max; // number of threads in the pool
-    int32_t     n_threads_cur; // number of threads used in the current graph
+    int         n_threads_max; // number of threads in the pool
+    int         n_threads_cur; // number of threads used in the current graph
 
     int32_t     prio;          // Scheduling priority
     uint32_t    poll;          // Polling level (0 - no polling)
@@ -18859,7 +18859,7 @@ void ggml_release_threadpool(struct ggml_compute_threadpool* threadpool) {
 #ifndef GGML_USE_OPENMP
     struct ggml_compute_state* workers = threadpool->workers;
 
-    const int32_t n_threads = threadpool->n_threads_max;
+    const int     n_threads = threadpool->n_threads_max;
 
     ggml_mutex_lock(&threadpool->mutex);
@@ -18869,7 +18869,7 @@ void ggml_release_threadpool(struct ggml_compute_threadpool* threadpool) {
     ggml_cond_broadcast(&threadpool->cond);
     ggml_mutex_unlock(&threadpool->mutex);
 
-    for (int32_t j = 1; j < n_threads; j++) {
+    for (int j = 1; j < n_threads; j++) {
         int32_t rc = ggml_thread_join(workers[j].thrd, NULL);
         GGML_ASSERT(rc == GGML_EXIT_SUCCESS || rc == GGML_EXIT_ABORTED);
         UNUSED(rc);
@@ -18925,11 +18925,11 @@ void ggml_resume_threadpool(struct ggml_compute_threadpool * threadpool) {
 
 struct ggml_cplan ggml_graph_plan(
     const struct ggml_cgraph * cgraph,
-    int32_t n_threads,
+    int n_threads,
     struct ggml_compute_threadpool * threadpool) {
 
     if (threadpool == NULL) {
-        GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %u\n", n_threads);
+        GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
     }
     if (n_threads <= 0) {
         n_threads = threadpool ? threadpool->n_threads_max : GGML_DEFAULT_N_THREADS;
@@ -19348,13 +19348,13 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
     GGML_ASSERT(cplan->n_threads > 0);
     GGML_ASSERT(cplan->work_size == 0 || cplan->work_data != NULL);
 
-    int32_t n_threads = cplan->n_threads;
+    int n_threads = cplan->n_threads;
     struct ggml_compute_threadpool * threadpool = cplan->threadpool;
 
     bool disposable_threadpool = false;
 
     if (threadpool == NULL) {
-        GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %u\n", n_threads);
+        GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
         disposable_threadpool = true;
 
         struct ggml_threadpool_params ttp = {
diff --git a/include/llama.h b/include/llama.h
index 90b68f812..7b103261d 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -304,8 +304,8 @@ extern "C" {
         uint32_t n_batch;         // logical maximum batch size that can be submitted to llama_decode
         uint32_t n_ubatch;        // physical maximum batch size
         uint32_t n_seq_max;       // max number of sequences (i.e. distinct states for recurrent models)
-        uint32_t n_threads;       // number of threads to use for generation
-        uint32_t n_threads_batch; // number of threads to use for batch processing
+        int      n_threads;       // number of threads to use for generation
+        int      n_threads_batch; // number of threads to use for batch processing
 
         enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
         enum llama_pooling_type      pooling_type;      // whether to pool (sum) embedding results by sequence id
@@ -851,13 +851,13 @@ extern "C" {
     // Set the number of threads used for decoding
     // n_threads is the number of threads used for generation (single token)
     // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
-    LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
+    LLAMA_API void llama_set_n_threads(struct llama_context * ctx, int n_threads, int n_threads_batch);
 
     // Get the number of threads used for generation of a single token.
-    LLAMA_API uint32_t llama_n_threads(struct llama_context * ctx);
+    LLAMA_API int llama_n_threads(struct llama_context * ctx);
 
     // Get the number of threads used for prompt and batch processing (multiple token).
-    LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
+    LLAMA_API int llama_n_threads_batch(struct llama_context * ctx);
 
     // Set whether the model is in embeddings mode or not
     // If true, embeddings will be returned but logits will not
diff --git a/src/llama.cpp b/src/llama.cpp
index 8d3e6aaf4..916d0f8c1 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2373,8 +2373,8 @@ struct llama_cparams {
     uint32_t n_batch;
     uint32_t n_ubatch;
     uint32_t n_seq_max;
-    uint32_t n_threads;       // number of threads to use for generation
-    uint32_t n_threads_batch; // number of threads to use for batch processing
+    int      n_threads;       // number of threads to use for generation
+    int      n_threads_batch; // number of threads to use for batch processing
 
     float rope_freq_base;
     float rope_freq_scale;
@@ -15530,7 +15530,7 @@ static std::pair llama_swap_threadpools(
         int32_t n_tokens) {
     const auto & cparams = lctx.cparams;
 
-    int32_t n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
+    int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
 
     ggml_compute_threadpool_t threadpool = nullptr; // nullptr -> disposable threadpool
@@ -15665,7 +15665,7 @@ static int llama_decode_internal(
     std::pair threads = llama_swap_threadpools(lctx, n_tokens);
 
-    int32_t n_threads = threads.first;
+    int n_threads = threads.first;
     ggml_compute_threadpool_t threadpool = threads.second;
 
     GGML_ASSERT(n_threads > 0);
@@ -15909,7 +15909,7 @@ static int llama_encode_internal(
     std::pair threads = llama_swap_threadpools(lctx, n_tokens);
 
-    int32_t n_threads = threads.first;
+    int n_threads = threads.first;
     ggml_compute_threadpool_t threadpool = threads.second;
 
     GGML_ASSERT(n_threads > 0);
@@ -19448,16 +19448,16 @@ size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepa
     }
 }
 
-void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
+void llama_set_n_threads(struct llama_context * ctx, int n_threads, int n_threads_batch) {
     ctx->cparams.n_threads       = n_threads;
     ctx->cparams.n_threads_batch = n_threads_batch;
 }
 
-uint32_t llama_n_threads(struct llama_context * ctx) {
+int llama_n_threads(struct llama_context * ctx) {
     return ctx->cparams.n_threads;
 }
 
-uint32_t llama_n_threads_batch(struct llama_context * ctx) {
+int llama_n_threads_batch(struct llama_context * ctx) {
     return ctx->cparams.n_threads_batch;
 }