llama : llama_perf + option to disable timings during decode (#9355)

* llama : llama_perf + option to disable timings during decode

ggml-ci

* common : add llama_arg

* Update src/llama.cpp

Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>

* perf : separate functions in the API

ggml-ci

* perf : safer pointer handling + naming update

ggml-ci

* minor : better local var name

* perf : abort on invalid sampler pointer

ggml-ci

---------

Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>
Georgi Gerganov 2024-09-13 09:53:38 +03:00 committed by GitHub
parent bd35cb0ae3
commit 0abc6a2c25
23 changed files with 135 additions and 91 deletions
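
For orientation, a minimal sketch of how the split perf API reads after this change. The function and field names (llama_perf_context_print, llama_perf_sampler_print, the no_perf flags) follow this PR's llama.h additions; treat the exact signatures as assumptions if you are on another version.

    #include "llama.h"

    int main() {
        llama_backend_init();

        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_load_model_from_file("model.gguf", mparams);

        llama_context_params cparams = llama_context_default_params();
        cparams.no_perf = false; // true disables timing collection during decode
        llama_context * ctx = llama_new_context_with_model(model, cparams);

        llama_sampler_chain_params sparams = llama_sampler_chain_default_params();
        sparams.no_perf = false; // same switch for sampling timings
        llama_sampler * smpl = llama_sampler_chain_init(sparams);
        llama_sampler_chain_add(smpl, llama_sampler_init_greedy());

        // ... tokenize, llama_decode(), llama_sampler_sample() ...

        // perf is now reported per object via separate functions:
        llama_perf_sampler_print(smpl); // aborts on a non-chain sampler pointer
        llama_perf_context_print(ctx);  // load / prompt-eval / eval timings

        llama_sampler_free(smpl);
        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }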


@@ -124,6 +124,7 @@ struct gpt_sampler_params {
     float   mirostat_eta = 0.10f; // learning rate
     bool    penalize_nl  = false; // consider newlines as a repeatable token
     bool    ignore_eos   = false;
+    bool    no_perf      = false; // disable performance metrics

     std::vector<enum gpt_sampler_type> samplers = {
         GPT_SAMPLER_TYPE_TOP_K,
@@ -246,6 +247,7 @@ struct gpt_params {
     bool simple_io        = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching    = true;  // insert new sequences for decoding on-the-fly
     bool flash_attn       = false; // flash attention
+    bool no_perf          = false; // disable performance metrics
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
     bool logits_all       = false; // return logits for all tokens in the batch
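
A sketch of how these two common-layer flags plausibly flow into the core library; the helper names here are hypothetical, and the llama_* param fields are assumed from this PR's llama.h changes:

    #include "common.h"
    #include "sampling.h"
    #include "llama.h"

    // Hypothetical wiring from the common-layer params into the core params.
    static llama_context_params ctx_params_from(const gpt_params & params) {
        llama_context_params cparams = llama_context_default_params();
        cparams.no_perf = params.no_perf; // skip per-decode timing bookkeeping
        // ... remaining fields (n_ctx, flash_attn, ...) omitted ...
        return cparams;
    }

    static llama_sampler_chain_params chain_params_from(const gpt_sampler_params & sparams) {
        llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
        lparams.no_perf = sparams.no_perf; // skip sampling timing bookkeeping
        return lparams;
    }

On the CLI side, the "common : add llama_arg" step presumably registers a flag along the lines of --no-perf that sets these fields; the exact flag name is not shown in this excerpt.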