Minor fixes
parent 130adf8415
commit a0aae528bb

5 changed files with 18 additions and 6 deletions
@@ -50,6 +50,6 @@ else()
     endif()
     add_subdirectory(save-load-state)
     add_subdirectory(simple)
-    #add_subdirectory(speculative)
+    add_subdirectory(speculative)
     add_subdirectory(tokenize)
 endif()
@@ -73,10 +73,11 @@ int main(int argc, char ** argv) {
     // load the draft model
     params.model = params.model_draft;
     params.n_gpu_layers = params.n_gpu_layers_draft;
-    if (params.n_threads_draft > 0) {
-        params.n_threads = params.n_threads_draft;
+    if (params.draft_cpuparams.n_threads > 0) {
+        params.cpuparams.n_threads = params.draft_cpuparams.n_threads;
     }
-    params.n_threads_batch = params.n_threads_batch_draft;
+
+    params.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads;
     llama_init_result llama_init_dft = llama_init_from_gpt_params(params);
     model_dft = llama_init_dft.model;
     ctx_dft = llama_init_dft.context;
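This hunk switches the speculative example from the old n_threads_draft fields to the new draft_cpuparams structs. A minimal sketch of the override pattern as a standalone helper (the helper name is hypothetical; the gpt_params fields are the ones used above):

// Hypothetical helper capturing the override pattern above: copy the
// draft-model settings over the main ones before the second
// llama_init_from_gpt_params call.
static void apply_draft_params(gpt_params & params) {
    params.model        = params.model_draft;
    params.n_gpu_layers = params.n_gpu_layers_draft;
    // only override the generation thread count when one was given
    if (params.draft_cpuparams.n_threads > 0) {
        params.cpuparams.n_threads = params.draft_cpuparams.n_threads;
    }
    // the batch thread count is copied unconditionally, as in the diff
    params.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads;
}

Note the asymmetry preserved from the original code: the batch count is copied unconditionally, while the generation count is guarded by a > 0 check.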
@@ -18737,7 +18737,7 @@ static bool __thread_affinity(const bool * mask) {
 
     for (uint32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
         if (mask[i]) {
-            printf("Thread %lx: adding %d to cpuset\n", pthread_self(), i);
+            GGML_PRINT_DEBUG("Thread %lx: adding %d to cpuset\n", pthread_self(), i);
             CPU_SET(i, &cpuset);
         }
     }
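The printf above fired on every call that set thread affinity; GGML_PRINT_DEBUG is silent in normal builds. A sketch of the usual gating (assumption: ggml defines the macro along these lines, controlled by a compile-time GGML_DEBUG level):

// Sketch of the assumed GGML_PRINT_DEBUG gating: expands to printf only
// when a debug level is enabled at compile time, otherwise to nothing.
#if (GGML_DEBUG >= 1)
#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG(...)
#endif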
@@ -19130,7 +19130,7 @@ static bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state)
     if (threadpool->poll) {
         while (!threadpool->new_work && !threadpool->stop && !threadpool->pause) {
             // No new work. Yield and keep polling.
-            //__cpu_relax();
+            __cpu_relax();
         }
     } else {
         ggml_mutex_lock_shared(&threadpool->mutex);
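Re-enabling __cpu_relax() makes each polling iteration issue a spin-wait hint instead of hammering the core at full speed, easing hyper-thread contention and power draw without taking the mutex. A sketch of what such a helper typically expands to (assumption: the actual ggml definition may differ per platform):

// Typical shape of a spin-wait relax helper (assumed, not copied from ggml):
#if defined(__x86_64__) || defined(_M_X64)
#include <immintrin.h>
static inline void __cpu_relax(void) { _mm_pause(); }               // x86: PAUSE hint
#elif defined(__aarch64__)
static inline void __cpu_relax(void) { __asm__ volatile("yield"); } // AArch64: YIELD hint
#else
static inline void __cpu_relax(void) { /* no-op fallback */ }
#endif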
@@ -439,6 +439,8 @@ extern "C" {
     LLAMA_API void llama_detach_batch_threadpool(struct llama_context * ctx);
     LLAMA_API void llama_detach_threadpools(struct llama_context * ctx);
 
+    // Pauses all attached threadpools
+    LLAMA_API void llama_pause_threadpools(struct llama_context * ctx);
 
     // Call once at the end of the program - currently only used for MPI
     LLAMA_API void llama_backend_free(void);
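With the polling loop above, workers spin whenever the pool is live; pausing stops that between decode calls. A minimal usage sketch (assumes ctx already has threadpools attached; a resume counterpart is not part of this diff):

// Sketch: stop both attached threadpools from spinning while the
// context sits idle between requests.
llama_pause_threadpools(ctx);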
@@ -17526,6 +17526,15 @@ void llama_detach_threadpools(struct llama_context * ctx) {
     llama_detach_batch_threadpool(ctx);
 }
 
+void llama_pause_threadpools(struct llama_context * ctx) {
+    if (ctx->threadpool) {
+        ggml_pause_threadpool(ctx->threadpool);
+    }
+    if (ctx->threadpool_batch) {
+        ggml_pause_threadpool(ctx->threadpool_batch);
+    }
+}
+
 void llama_backend_free(void) {
     ggml_quantize_free();
 }
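The implementation just forwards to ggml_pause_threadpool for whichever pools are attached, so the call is a safe no-op on a context without threadpools. A hypothetical sketch of the ggml side, inferred only from the pause flag tested in the poll loop above (the field and locking details are assumptions):

// Hypothetical sketch: raise the pause flag that the polling loop tests,
// so workers fall out of their spin loop. Not the actual ggml code.
void ggml_pause_threadpool(struct ggml_threadpool * threadpool) {
    ggml_mutex_lock(&threadpool->mutex);
    threadpool->pause = true;   // observed as "!threadpool->pause" by pollers
    ggml_mutex_unlock(&threadpool->mutex);
}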