From a0aae528bbfc2144ee3d58fc81326df7a14181cf Mon Sep 17 00:00:00 2001
From: Faisal Zaghloul
Date: Wed, 31 Jul 2024 12:42:30 -0400
Subject: [PATCH] Minor fixes

---
 examples/CMakeLists.txt              | 2 +-
 examples/speculative/speculative.cpp | 7 ++++---
 ggml/src/ggml.c                      | 4 ++--
 include/llama.h                      | 2 ++
 src/llama.cpp                        | 9 +++++++++
 5 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 247d52c6d..67b3d2774 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -50,6 +50,6 @@ else()
     endif()
     add_subdirectory(save-load-state)
     add_subdirectory(simple)
-    #add_subdirectory(speculative)
+    add_subdirectory(speculative)
     add_subdirectory(tokenize)
 endif()
diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index b051a18f1..1616edecb 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -73,10 +73,11 @@ int main(int argc, char ** argv) {
     // load the draft model
     params.model = params.model_draft;
     params.n_gpu_layers = params.n_gpu_layers_draft;
-    if (params.n_threads_draft > 0) {
-        params.n_threads = params.n_threads_draft;
+    if (params.draft_cpuparams.n_threads > 0) {
+        params.cpuparams.n_threads = params.draft_cpuparams.n_threads;
     }
-    params.n_threads_batch = params.n_threads_batch_draft;
+
+    params.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads;
     llama_init_result llama_init_dft = llama_init_from_gpt_params(params);
     model_dft = llama_init_dft.model;
     ctx_dft = llama_init_dft.context;
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 7d8d17918..af62eb922 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -18737,7 +18737,7 @@ static bool __thread_affinity(const bool * mask) {
 
     for (uint32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
         if (mask[i]) {
-            printf("Thread %lx: adding %d to cpuset\n", pthread_self(), i);
+            GGML_PRINT_DEBUG("Thread %lx: adding %d to cpuset\n", pthread_self(), i);
             CPU_SET(i, &cpuset);
         }
     }
@@ -19130,7 +19130,7 @@ static bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state)
     if (threadpool->poll) {
         while (!threadpool->new_work && !threadpool->stop && !threadpool->pause) {
             // No new work. Yield and keep polling.
-            //__cpu_relax();
+            __cpu_relax();
         }
     } else {
         ggml_mutex_lock_shared(&threadpool->mutex);
diff --git a/include/llama.h b/include/llama.h
index b569c58e9..90b68f812 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -439,6 +439,8 @@ extern "C" {
 
     LLAMA_API void llama_detach_batch_threadpool(struct llama_context * ctx);
     LLAMA_API void llama_detach_threadpools(struct llama_context * ctx);
+    // Pauses all attached threadpools
+    LLAMA_API void llama_pause_threadpools(struct llama_context * ctx);
 
     // Call once at the end of the program - currently only used for MPI
     LLAMA_API void llama_backend_free(void);
diff --git a/src/llama.cpp b/src/llama.cpp
index 695b5eb00..8e0ccaac6 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -17526,6 +17526,15 @@ void llama_detach_threadpools(struct llama_context * ctx) {
     llama_detach_batch_threadpool(ctx);
 }
 
+void llama_pause_threadpools(struct llama_context * ctx) {
+    if (ctx->threadpool) {
+        ggml_pause_threadpool(ctx->threadpool);
+    }
+    if (ctx->threadpool_batch) {
+        ggml_pause_threadpool(ctx->threadpool_batch);
+    }
+}
+
 void llama_backend_free(void) {
     ggml_quantize_free();
 }
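
Note (illustrative, not part of the patch): a minimal sketch of how the new
llama_pause_threadpools() entry point might be driven from application code.
Only llama_pause_threadpools() itself comes from this patch; the context is
assumed to already have threadpools attached via the existing init path (as in
examples/speculative), and idle_until_next_request() plus the getchar() wait
are hypothetical stand-ins for the application's idle period.

    // Illustrative only: stop polling workers from spinning while the
    // application has no work for this context.
    #include <stdio.h>
    #include "llama.h"

    static void idle_until_next_request(struct llama_context * ctx) {
        // New in this patch: pauses the generation and batch threadpools,
        // if they are attached to this context.
        llama_pause_threadpools(ctx);

        // Hypothetical stand-in for the application's idle wait.
        getchar();

        // Assumption: the pools are resumed when the next graph compute is
        // scheduled (or via an explicit resume call elsewhere), so nothing
        // else is needed here.
    }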