diff --git a/llama.cpp b/llama.cpp index 5c1b21170..3af050c25 100644 --- a/llama.cpp +++ b/llama.cpp @@ -9265,14 +9265,14 @@ struct llama_context_params llama_context_default_params() { /*.yarn_beta_fast =*/ 32.0f, /*.yarn_beta_slow =*/ 1.0f, /*.yarn_orig_ctx =*/ 0, + /*.cb_eval =*/ nullptr, + /*.cb_eval_user_data =*/ nullptr, /*.type_k =*/ GGML_TYPE_F16, /*.type_v =*/ GGML_TYPE_F16, /*.mul_mat_q =*/ true, /*.logits_all =*/ false, /*.embedding =*/ false, /*.offload_kqv =*/ true, - /*.cb_eval =*/ nullptr, - /*.cb_eval_user_data =*/ nullptr, }; return result; diff --git a/llama.h b/llama.h index 9f7a51a0f..e268d7a1d 100644 --- a/llama.h +++ b/llama.h @@ -232,6 +232,9 @@ extern "C" { float yarn_beta_slow; // YaRN high correction dim uint32_t yarn_orig_ctx; // YaRN original context size + ggml_backend_sched_eval_callback cb_eval; + void * cb_eval_user_data; + enum ggml_type type_k; // data type for K cache enum ggml_type type_v; // data type for V cache @@ -240,9 +243,6 @@ extern "C" { bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) bool embedding; // embedding mode only bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU - - ggml_backend_sched_eval_callback cb_eval; - void * cb_eval_user_data; }; // model quantization parameters