From a0aae528bbfc2144ee3d58fc81326df7a14181cf Mon Sep 17 00:00:00 2001
From: Faisal Zaghloul
Date: Wed, 31 Jul 2024 12:42:30 -0400
Subject: [PATCH] Minor fixes

---
 examples/CMakeLists.txt              | 2 +-
 examples/speculative/speculative.cpp | 7 ++++---
 ggml/src/ggml.c                      | 4 ++--
 include/llama.h                      | 2 ++
 src/llama.cpp                        | 9 +++++++++
 5 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 247d52c6d..67b3d2774 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -50,6 +50,6 @@ else()
     endif()
     add_subdirectory(save-load-state)
     add_subdirectory(simple)
-    #add_subdirectory(speculative)
+    add_subdirectory(speculative)
     add_subdirectory(tokenize)
 endif()
diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index b051a18f1..1616edecb 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -73,10 +73,11 @@ int main(int argc, char ** argv) {
     // load the draft model
     params.model = params.model_draft;
     params.n_gpu_layers = params.n_gpu_layers_draft;
-    if (params.n_threads_draft > 0) {
-        params.n_threads = params.n_threads_draft;
+    if (params.draft_cpuparams.n_threads > 0) {
+        params.cpuparams.n_threads = params.draft_cpuparams.n_threads;
     }
-    params.n_threads_batch = params.n_threads_batch_draft;
+
+    params.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads;
     llama_init_result llama_init_dft = llama_init_from_gpt_params(params);
     model_dft = llama_init_dft.model;
     ctx_dft = llama_init_dft.context;
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 7d8d17918..af62eb922 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -18737,7 +18737,7 @@ static bool __thread_affinity(const bool * mask) {
 
     for (uint32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
         if (mask[i]) {
-            printf("Thread %lx: adding %d to cpuset\n", pthread_self(), i);
+            GGML_PRINT_DEBUG("Thread %lx: adding %d to cpuset\n", pthread_self(), i);
             CPU_SET(i, &cpuset);
         }
     }
@@ -19130,7 +19130,7 @@ static bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state)
     if (threadpool->poll) {
         while (!threadpool->new_work && !threadpool->stop && !threadpool->pause) {
             // No new work. Yield and keep polling.
-            //__cpu_relax();
+            __cpu_relax();
         }
     } else {
         ggml_mutex_lock_shared(&threadpool->mutex);
diff --git a/include/llama.h b/include/llama.h
index b569c58e9..90b68f812 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -439,6 +439,8 @@ extern "C" {
 
     LLAMA_API void llama_detach_batch_threadpool(struct llama_context * ctx);
     LLAMA_API void llama_detach_threadpools(struct llama_context * ctx);
+    // Pauses all attached threadpools
+    LLAMA_API void llama_pause_threadpools(struct llama_context * ctx);
 
     // Call once at the end of the program - currently only used for MPI
     LLAMA_API void llama_backend_free(void);
diff --git a/src/llama.cpp b/src/llama.cpp
index 695b5eb00..8e0ccaac6 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -17526,6 +17526,15 @@ void llama_detach_threadpools(struct llama_context * ctx) {
     llama_detach_batch_threadpool(ctx);
 }
 
+void llama_pause_threadpools(struct llama_context * ctx) {
+    if (ctx->threadpool) {
+        ggml_pause_threadpool(ctx->threadpool);
+    }
+    if (ctx->threadpool_batch) {
+        ggml_pause_threadpool(ctx->threadpool_batch);
+    }
+}
+
 void llama_backend_free(void) {
     ggml_quantize_free();
 }
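
Note (illustrative, not part of the patch): a minimal sketch of how the new
llama_pause_threadpools() entry point might be driven from application code.
Only llama_pause_threadpools() itself comes from this patch; the context is
assumed to already have threadpools attached via the existing init path (as in
examples/speculative), and idle_until_next_request() plus the getchar() wait
are hypothetical stand-ins for the application's idle period.

    // Illustrative only: stop polling workers from spinning while the
    // application has no work for this context.
    #include <stdio.h>
    #include "llama.h"

    static void idle_until_next_request(struct llama_context * ctx) {
        // New in this patch: pauses the generation and batch threadpools,
        // if they are attached to this context.
        llama_pause_threadpools(ctx);

        // Hypothetical stand-in for the application's idle wait.
        getchar();

        // Assumption: the pools are resumed when the next graph compute is
        // scheduled (or via an explicit resume call elsewhere), so nothing
        // else is needed here.
    }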