llama : llama_perf + option to disable timings during decode (#9355)

* llama : llama_perf + option to disable timings during decode ggml-ci * common : add llama_arg * Update src/llama.cpp Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com> * perf : separate functions in the API ggml-ci * perf : safer pointer handling + naming update ggml-ci * minor : better local var name * perf : abort on invalid sampler pointer ggml-ci --------- Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>
2024-09-13 09:53:38 +03:00 · 2024-09-13 09:53:38 +03:00 · 0abc6a2c25
commit 0abc6a2c25
parent bd35cb0ae3
23 changed files with 135 additions and 91 deletions
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@ -142,7 +142,7 @@ std::string gpt_sampler_params::print() const {
 struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
    llama_sampler_chain_params lparams = llama_sampler_chain_default_params();

-    lparams.no_perf = false; // TODO: control via params
+    lparams.no_perf = params.no_perf;

    auto * result = new gpt_sampler {
        /* .params = */ params,
@ -257,10 +257,10 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
    // TODO: measure grammar performance

    if (gsmpl) {
-        llama_perf_print(gsmpl->chain, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
+        llama_perf_sampler_print(gsmpl->chain);
    }
    if (ctx) {
-        llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+        llama_perf_context_print(ctx);
    }
 }