backend : add eval callback (#4935)

* backend : add eval callback (ggml-ci)
* backend : group nodes in a single compute when the user doesn't need them
* backend : clean-up the implementation (ggml-ci)
* simple : do not perform tensor data copy if not needed
* simple : fix
* simple : no need for ggml_is_contiguous + fix bool parse
* llama : fix callback placement in llama_context_params
* backend : avoid double-ask callback calls
* simple : restore examples, imatrix will serve as a demo
2024-01-17 18:39:41 +02:00 · 2024-01-17 18:39:41 +02:00 · 44a1a4a41a
commit 44a1a4a41a
parent c918fe8dca
4 changed files with 64 additions and 2 deletions
--- a/llama.h
+++ b/llama.h
@@ -2,6 +2,7 @@
 #define LLAMA_H

 #include "ggml.h"
+#include "ggml-backend.h"
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
 #define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
@@ -231,6 +232,9 @@ extern "C" {
        float    yarn_beta_slow;   // YaRN high correction dim
        uint32_t yarn_orig_ctx;    // YaRN original context size

+        ggml_backend_sched_eval_callback cb_eval;
+        void * cb_eval_user_data;
+
        enum ggml_type type_k; // data type for K cache
        enum ggml_type type_v; // data type for V cache