Use abort_callback from ggml to stop llama computation

2024-02-08 11:37:30 +01:00 · 2024-02-08 11:37:30 +01:00 · e3ac833d3f
commit e3ac833d3f
parent 67fd33132f
3 changed files with 22 additions and 3 deletions
--- a/ggml-backend.h
+++ b/ggml-backend.h
@ -80,7 +80,6 @@ extern "C" {
    //
    // CPU backend
    //
-
    GGML_API ggml_backend_t ggml_backend_cpu_init(void);

    GGML_API GGML_CALL bool ggml_backend_is_cpu                (ggml_backend_t backend);
--- a/llama.cpp
+++ b/llama.cpp
@ -1948,6 +1948,9 @@ struct llama_context {
    std::vector<uint8_t> buf_compute_meta;
    ggml_backend_sched_t sched = nullptr;

+    ggml_abort_callback abort_callback = nullptr;
+    void *              abort_callback_data = nullptr;
+
    // input tensors
    ggml_backend_buffer_t buf_input = nullptr;
    ggml_context * ctx_input = nullptr;
@ -7847,6 +7850,7 @@ static void llama_graph_compute(

    if (lctx.backend_cpu != nullptr) {
        ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
+        ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
    }

    ggml_backend_sched_graph_compute(lctx.sched, gf);
@ -11644,6 +11648,8 @@ struct llama_context_params llama_context_default_params() {
        /*.embedding                   =*/ false,
        /*.offload_kqv                 =*/ true,
        /*.do_pooling                  =*/ true,
+        /*.abort_callback              =*/ nullptr,
+        /*.abort_callback_data         =*/ nullptr,
    };

    return result;
@ -11835,8 +11841,11 @@ struct llama_context * llama_new_context_with_model(
    LLAMA_LOG_INFO("%s: freq_base  = %.1f\n",   __func__, cparams.rope_freq_base);
    LLAMA_LOG_INFO("%s: freq_scale = %g\n",     __func__, cparams.rope_freq_scale);

-    ctx->rng = std::mt19937(params.seed);
-    ctx->logits_all = params.logits_all;
+    ctx->abort_callback      = params.abort_callback;
+    ctx->abort_callback_data = params.abort_callback_data;
+
+    ctx->rng                 = std::mt19937(params.seed);
+    ctx->logits_all          = params.logits_all;

    const ggml_type type_k = params.type_k;
    const ggml_type type_v = params.type_v;
@ -12809,6 +12818,11 @@ void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_
    ctx->cparams.n_threads_batch = n_threads_batch;
 }

+void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data) {
+    ctx->abort_callback      = abort_callback;      // replaces the value copied from llama_context_params; llama_graph_compute passes it to the CPU backend
+    ctx->abort_callback_data = abort_callback_data; // user-data pointer passed to the CPU backend together with the callback
+}
+
 struct llama_batch llama_batch_get_one(
             llama_token * tokens,
                 int32_t   n_tokens,
--- a/llama.h
+++ b/llama.h
@ -256,6 +256,9 @@ extern "C" {
        bool embedding;   // embedding mode only
        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
        bool do_pooling;  // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
+
+        ggml_abort_callback abort_callback;
+        void *              abort_callback_data;
    };

    // model quantization parameters
@ -661,6 +664,9 @@ extern "C" {
    // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
    LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);

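+    // Set the abort callback and its user-data pointer; both are handed to the ggml CPU backend (when one is in use) before each graph computation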
+    LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
+
    // Token logits obtained from the last call to llama_eval()
    // The logits for the last token are stored in the last row
    // Logits for which llama_batch.logits[i] == 0 are undefined