llama : add new llama_perf API

ggml-ci
2024-09-06 13:47:27 +03:00 · 2024-09-06 13:47:27 +03:00 · 757a9bf868
commit 757a9bf868
parent 5ab52c1f64
25 changed files with 149 additions and 101 deletions
--- a/common/common.cpp
+++ b/common/common.cpp
@ -2533,7 +2533,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
        }
        llama_kv_cache_clear(lctx);
        llama_synchronize(lctx);
-        llama_reset_timings(lctx, nullptr);
+        llama_perf_reset(lctx, LLAMA_PERF_TYPE_CONTEXT);
    }

    iparams.model   = model;
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@ -153,7 +153,7 @@ std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
 struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
    llama_sampler_chain_params lparams = llama_sampler_chain_default_params();

-    lparams.no_timing = false;
+    lparams.no_perf = false; // TODO: control via params

    auto * result = new gpt_sampler {
        /* .params = */ params,
@ -270,8 +270,15 @@ llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl) {
    return gsmpl->prev.rat(0);
 }

-void gpt_print_timings(const struct llama_context * ctx, const struct gpt_sampler * gsmpl) {
-    llama_print_timings(ctx, gsmpl ? gsmpl->chain : nullptr);
+void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl) { // prints perf stats for the sampler chain and the context; a null argument is simply skipped
+    // TODO: measure grammar performance
+
+    if (gsmpl) { // sampler-chain stats are emitted first, ahead of the context stats
+        llama_perf_print(gsmpl->chain, LLAMA_PERF_TYPE_SAMPLER_CHAIN); // only the llama_sampler chain held by gsmpl is reported
+    }
+    if (ctx) {
+        llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    }
 }

 llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
--- a/common/sampling.h
+++ b/common/sampling.h
@ -62,6 +62,8 @@ struct gpt_sampler_params {
 //  - grammar support
 //  - custom sampler logic based on the parameters
 //
+// TODO: measure grammar performance
+//
 struct gpt_sampler;

 struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params);
@ -75,11 +77,9 @@ void gpt_sampler_reset (struct gpt_sampler * gsmpl);

 llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl);

-//llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_token_data_array * cur_p);
-
 llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl);

-void gpt_print_timings(const struct llama_context * ctx, const struct gpt_sampler * gsmpl);
+void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl);

 // extended sampling implementation:
 //
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@ -210,7 +210,8 @@ int main(int argc, char ** argv) {
        }
    }

-    llama_print_timings(ctx, nullptr);
+    LOG_TEE("\n");
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);

    llama_batch_free(batch);

--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@ -231,7 +231,9 @@ int main(int argc, char ** argv) {
    LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
            __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

-    llama_print_timings(ctx, smpl);
+    LOG_TEE("\n");
+    llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
+    llama_perf_print(ctx,  LLAMA_PERF_TYPE_CONTEXT);

    fprintf(stderr, "\n");

--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@ -307,8 +307,10 @@ int main(int argc, char ** argv) {
        if (notArray) fprintf(stdout, "\n}\n");
    }

+    LOG_TEE("\n");
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+
    // clean up
-    llama_print_timings(ctx, nullptr);
    llama_batch_free(batch);
    llama_free(ctx);
    llama_free_model(model);
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@ -181,7 +181,8 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    llama_print_timings(ctx, nullptr);
+    LOG_TEE("\n");
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);

    llama_free(ctx);
    llama_free_model(model);
--- a/examples/gritlm/gritlm.cpp
+++ b/examples/gritlm/gritlm.cpp
@ -171,8 +171,12 @@ int main(int argc, char * argv[]) {

    auto sparams = llama_sampler_chain_default_params();

+    sparams.no_perf = false;
+
    llama_sampler * smpl = llama_sampler_chain_init(sparams);

+    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
+
    // ### Embedding/Representation ###
    // samples taken from: https://github.com/ContextualAI/gritlm#basic
    {
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@ -638,7 +638,8 @@ int main(int argc, char ** argv) {

    g_collector.save_imatrix();

-    llama_print_timings(ctx, nullptr);
+    LOG_TEE("\n");
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);

    llama_free(ctx);
    llama_free_model(model);
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@ -81,7 +81,7 @@ static void write_logfile(
    yaml_dump_string_multiline(logfile, "output", output.c_str());
    yaml_dump_vector_int(logfile, "output_tokens", output_tokens);

-    llama_dump_timing_info_yaml(logfile, ctx);
+    llama_perf_dump_yaml(logfile, ctx);
    fclose(logfile);
 }

@ -93,7 +93,7 @@ static void sigint_handler(int signo) {
        } else {
            console::cleanup();
            printf("\n");
-            gpt_print_timings(*g_ctx, *g_smpl);
+            gpt_perf_print(*g_ctx, *g_smpl);
            write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
            _exit(130);
        }
@ -634,7 +634,8 @@ int main(int argc, char ** argv) {
        fflush(stdout);
    }

-    gpt_print_timings(ctx, smpl);
+    LOG_TEE("\n");
+    gpt_perf_print(ctx, smpl);
    write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);

    llama_free(ctx);
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@ -1630,7 +1630,7 @@ int main(int argc, char ** argv) {
            fflush(p_err->fout);
        }

-        llama_print_timings(ctx, nullptr);
+        llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);

        llama_free(ctx);

--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@ -310,7 +310,7 @@ int main(int argc, char ** argv) {
        // process the prompt
        process_prompt(ctx_llava, image_embed, &params, params.prompt);

-        llama_print_timings(ctx_llava->ctx_llama, nullptr);
+        llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
        llava_image_embed_free(image_embed);
        ctx_llava->model = NULL;
        llava_free(ctx_llava);
@ -327,7 +327,7 @@ int main(int argc, char ** argv) {
            // process the prompt
            process_prompt(ctx_llava, image_embed, &params, params.prompt);

-            llama_print_timings(ctx_llava->ctx_llama, nullptr);
+            llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
            llava_image_embed_free(image_embed);
            ctx_llava->model = NULL;
            llava_free(ctx_llava);
--- a/examples/llava/minicpmv-cli.cpp
+++ b/examples/llava/minicpmv-cli.cpp
@ -319,7 +319,7 @@ int main(int argc, char ** argv) {
            }
        }
        printf("\n");
-        llama_print_timings(ctx_llava->ctx_llama, nullptr);
+        llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);

        ctx_llava->model = NULL;
        llava_free(ctx_llava);
--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@ -467,7 +467,8 @@ int main(int argc, char ** argv) {
    LOG_TEE("n_predict = %d\n", n_predict);
    LOG_TEE("n_accept  = %d\n", n_accept);

-    gpt_print_timings(ctx, smpl);
+    LOG_TEE("\n");
+    gpt_perf_print(ctx, smpl);

    gpt_sampler_free(smpl);

--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@ -238,8 +238,9 @@ int main(int argc, char ** argv){
    LOG_TEE("n_accept     = %d\n", n_accept);
    LOG_TEE("accept       = %.3f%%\n", 100.0f * n_accept / n_drafted);

-    LOG_TEE("\ntarget:\n");
-    gpt_print_timings(ctx, smpl);
+    LOG_TEE("\ntarget:\n\n");
+    llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
+    llama_perf_print(ctx,  LLAMA_PERF_TYPE_CONTEXT);

    gpt_sampler_free(smpl);

--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -93,7 +93,7 @@ static void write_logfile(
    yaml_dump_string_multiline(logfile, "output", output.c_str());
    yaml_dump_vector_int(logfile, "output_tokens", output_tokens);

-    llama_dump_timing_info_yaml(logfile, ctx);
+    llama_perf_dump_yaml(logfile, ctx);
    fclose(logfile);
 }

@ -106,7 +106,7 @@ static void sigint_handler(int signo) {
        } else {
            console::cleanup();
            printf("\n");
-            gpt_print_timings(*g_ctx, *g_smpl);
+            gpt_perf_print(*g_ctx, *g_smpl);
            write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
            _exit(130);
        }
@ -929,7 +929,8 @@ int main(int argc, char ** argv) {
        llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
    }

-    gpt_print_timings(ctx, smpl);
+    LOG_TEE("\n");
+    gpt_perf_print(ctx, smpl);
    write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);

    gpt_sampler_free(smpl);
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@ -414,7 +414,7 @@ int main(int argc, char ** argv) {
    LOG_TEE("\n");

    // TODO: print sampling/grammar timings for all clients
-    llama_print_timings(ctx, nullptr);
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);

    llama_batch_free(batch);

--- a/examples/passkey/passkey.cpp
+++ b/examples/passkey/passkey.cpp
@ -259,7 +259,8 @@ int main(int argc, char ** argv) {
    LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
            __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

-    llama_print_timings(ctx, nullptr);
+    LOG_TEE("\n");
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);

    fprintf(stderr, "\n");

--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@ -76,7 +76,7 @@ static void write_logfile(
    fprintf(logfile, "ppl_value: %f\n", results.ppl_value);
    yaml_dump_vector_float(logfile, "probs", results.probs);

-    llama_dump_timing_info_yaml(logfile, ctx);
+    llama_perf_dump_yaml(logfile, ctx);
    fclose(logfile);
 }

@ -2048,7 +2048,8 @@ int main(int argc, char ** argv) {
        results = perplexity(ctx, params, n_ctx);
    }

-    llama_print_timings(ctx, nullptr);
+    LOG_TEE("\n");
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
    write_logfile(ctx, params, model, results);

    llama_free(ctx);
--- a/examples/retrieval/retrieval.cpp
+++ b/examples/retrieval/retrieval.cpp
@ -293,8 +293,10 @@ int main(int argc, char ** argv) {
        }
    }

+    LOG_TEE("\n");
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+
    // clean up
-    llama_print_timings(ctx, nullptr);
    llama_batch_free(query_batch);
    llama_free(ctx);
    llama_free_model(model);
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@ -57,8 +57,12 @@ int main(int argc, char ** argv) {

    auto sparams = llama_sampler_chain_default_params();

+    sparams.no_perf = false;
+
    llama_sampler * smpl = llama_sampler_chain_init(sparams);

+    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
+
    // tokenize the prompt

    std::vector<llama_token> tokens_list;
@ -153,7 +157,9 @@ int main(int argc, char ** argv) {
    LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
            __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

-    llama_print_timings(ctx, nullptr);
+    LOG_TEE("\n");
+    llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
+    llama_perf_print(ctx,  LLAMA_PERF_TYPE_CONTEXT);

    fprintf(stderr, "\n");

--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@ -613,12 +613,12 @@ int main(int argc, char ** argv) {
    LOG_TEE("n_accept  = %d\n", n_accept);
    LOG_TEE("accept    = %.3f%%\n", 100.0f * n_accept / n_drafted);

-    LOG_TEE("\ndraft:\n");
+    LOG_TEE("\ndraft:\n\n");
    // TODO: print sampling/grammar timings for all drafts
-    gpt_print_timings(ctx_dft, nullptr);
+    llama_perf_print(ctx_dft, LLAMA_PERF_TYPE_CONTEXT);

-    LOG_TEE("\ntarget:\n");
-    gpt_print_timings(ctx_tgt, smpl);
+    LOG_TEE("\ntarget:\n\n");
+    gpt_perf_print(ctx_tgt, smpl);

    gpt_sampler_free(smpl);
    for (int s = 0; s < n_seq_dft; ++s) {
--- a/include/llama.h
+++ b/include/llama.h
@ -342,6 +342,7 @@ extern "C" {
        bool embeddings;  // if true, extract embeddings (together with logits)
        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
        bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
+      //bool no_perf;     // whether to skip measuring performance timings, TODO: implement

        // Abort callback
        // if it returns true, execution of llama_decode() will be aborted
@ -371,23 +372,9 @@ extern "C" {
    } llama_logit_bias;

    typedef struct llama_sampler_chain_params {
-        bool no_timing; // whether to measure performance timings
+        bool no_perf; // set to true to skip performance timing measurements (false = measure)
    } llama_sampler_chain_params;

-    // performance timing information
-    struct llama_timings {
-        double t_start_ms;
-        double t_end_ms;
-        double t_load_ms;
-        double t_sampler_ms;
-        double t_p_eval_ms;
-        double t_eval_ms;
-
-        int32_t n_sampler;
-        int32_t n_p_eval;
-        int32_t n_eval;
-    };
-
    // used in chat template
    typedef struct llama_chat_message {
        const char * role;
@ -1121,13 +1108,6 @@ extern "C" {
    //  Returns the split_prefix length.
    LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);

-    // Performance information
-    LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
-
-    // note: requires llama_sampler_chain. how to prevent misuse?
-    LLAMA_API void llama_print_timings(const struct llama_context * ctx, const struct llama_sampler * chain);
-    LLAMA_API void llama_reset_timings(      struct llama_context * ctx,       struct llama_sampler * chain);
-
    // Print system information
    LLAMA_API const char * llama_print_system_info(void);

@ -1135,7 +1115,21 @@ extern "C" {
    // If this is not called, or NULL is supplied, everything is output on stderr.
    LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);

-    LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
+    //
+    // Performance utils
+    //
+    // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
+    //
+
+    enum llama_perf_type { // tells llama_perf_print/llama_perf_reset how to interpret their untyped pointer argument
+        LLAMA_PERF_TYPE_CONTEXT       = 0, // pointer is a struct llama_context *
+        LLAMA_PERF_TYPE_SAMPLER_CHAIN = 1, // pointer is a struct llama_sampler * whose ctx is a llama_sampler_chain
+    };
+
+    LLAMA_API void llama_perf_print(const void * ctx, enum llama_perf_type type);
+    LLAMA_API void llama_perf_reset(      void * ctx, enum llama_perf_type type);
+
+    LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);

 #ifdef __cplusplus
 }
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@ -511,7 +511,7 @@ static struct llama_sampler_i llama_sampler_chain_i = {
    /* .accept = */ [](struct llama_sampler * smpl, llama_token token) {
        auto * chain = (llama_sampler_chain *) smpl->ctx;

-        time_meas tm(chain->t_sample_us, chain->params.no_timing);
+        time_meas tm(chain->t_sample_us, chain->params.no_perf);

        for (auto * smpl : chain->samplers) {
            llama_sampler_accept(smpl, token);
@ -522,7 +522,7 @@ static struct llama_sampler_i llama_sampler_chain_i = {
    /* .apply  = */ [](struct llama_sampler * smpl, llama_token_data_array * cur_p) {
        auto * chain = (llama_sampler_chain *) smpl->ctx;

-        time_meas tm(chain->t_sample_us, chain->params.no_timing);
+        time_meas tm(chain->t_sample_us, chain->params.no_perf);

        for (auto * smpl : chain->samplers) {
            llama_sampler_apply(smpl, cur_p);
--- a/src/llama.cpp
+++ b/src/llama.cpp
@ -17924,7 +17924,7 @@ struct llama_context_params llama_context_default_params() {

 struct llama_sampler_chain_params llama_sampler_chain_default_params() {
    struct llama_sampler_chain_params result = {
-        /*.no_timing                   =*/ false, // TODO: change to true and set explicitly in examples
+        /*.no_perf                     =*/ true,
    };

    return result;
@ -20650,45 +20650,6 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int
    return 0;
 }

-void llama_print_timings(const struct llama_context * ctx, const struct llama_sampler * chain) {
-    auto * smpl = chain ? (const struct llama_sampler_chain *) chain->ctx : nullptr;
-
-    const llama_timings timings = {
-        /*.t_start_ms   =*/ 1e-3 * ctx->t_start_us,
-        /*.t_end_ms     =*/ 1.00 * ggml_time_ms(),
-        /*.t_load_ms    =*/ 1e-3 * ctx->t_load_us,
-        /*.t_sampler_ms =*/ 1e-3 * (smpl ? smpl->t_sample_us  : 0.0),
-        /*.t_p_eval_ms  =*/ 1e-3 * ctx->t_p_eval_us,
-        /*.t_eval_ms    =*/ 1e-3 * ctx->t_eval_us,
-
-        /*.n_sampler =*/ std::max(0, smpl ? smpl->n_sample  : 0),
-        /*.n_p_eval  =*/ std::max(0, ctx->n_p_eval),
-        /*.n_eval    =*/ std::max(1, ctx->n_eval),
-    };
-
-    LLAMA_LOG_INFO("\n");
-    LLAMA_LOG_INFO("%s:        load time = %10.2f ms\n", __func__, timings.t_load_ms);
-    LLAMA_LOG_INFO("%s:    sampling time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, timings.t_sampler_ms, timings.n_sampler, timings.t_sampler_ms / timings.n_sampler, 1e3 / timings.t_sampler_ms * timings.n_sampler);
-    LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
-    LLAMA_LOG_INFO("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
-    LLAMA_LOG_INFO("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval));
-}
-
-void llama_reset_timings(struct llama_context * ctx, struct llama_sampler * chain) {
-    ctx->t_start_us  = ggml_time_us();
-    ctx->t_eval_us   = ctx->n_eval   = 0;
-    ctx->t_p_eval_us = ctx->n_p_eval = 0;
-
-    if (chain) {
-        auto * smpl = (struct llama_sampler_chain *) chain->ctx;
-
-        smpl->t_sample_us = smpl->n_sample = 0;
-    }
-}
-
 const char * llama_print_system_info(void) {
    static std::string s;

@ -20717,7 +20678,68 @@ const char * llama_print_system_info(void) {
    return s.c_str();
 }

-void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
+void llama_perf_print(const void * ctx, enum llama_perf_type type) { // logs the perf counters of the object behind ctx; type selects what ctx actually points to
+    switch (type) {
+        case LLAMA_PERF_TYPE_CONTEXT:
+            {
+                const auto * p = (const struct llama_context *) ctx; // unchecked cast: the caller must really pass a llama_context for this type
+
+                const double t_start_ms   = 1e-3 * p->t_start_us; // the *_us counters are scaled by 1e-3 to get milliseconds
+                const double t_end_ms     = 1.00 * ggml_time_ms(); // "end" is the current time at the moment of printing
+                const double t_load_ms    = 1e-3 * p->t_load_us;
+                const double t_p_eval_ms  = 1e-3 * p->t_p_eval_us;
+                const double t_eval_ms    = 1e-3 * p->t_eval_us;
+
+                const int32_t n_p_eval  = std::max(0, p->n_p_eval); // clamped to >= 0 only, so it can still be 0 as a divisor below
+                const int32_t n_eval    = std::max(1, p->n_eval); // clamped to >= 1 (presumably to keep the per-token division well defined); this also means the total below reports at least 1 token
+
+                LLAMA_LOG_INFO("%s:        load time = %10.2f ms\n", __func__, t_load_ms);
+                LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+                        __func__, t_p_eval_ms, n_p_eval, t_p_eval_ms / n_p_eval, 1e3 / t_p_eval_ms * n_p_eval); // NOTE(review): zero count or zero time divides by zero in floating point here - confirm inf/nan output is acceptable
+                LLAMA_LOG_INFO("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+                        __func__, t_eval_ms, n_eval, t_eval_ms / n_eval, 1e3 / t_eval_ms * n_eval);
+                LLAMA_LOG_INFO("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - t_start_ms), (n_p_eval + n_eval)); // elapsed time since t_start_us, over prompt + eval counts
+            } break;
+        case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
+            {
+                const auto * smpl = (const struct llama_sampler *) ctx; // unchecked cast: the caller must pass a llama_sampler for this type
+                const auto * p = (const struct llama_sampler_chain *) smpl->ctx; // the chain's counters live in the sampler's ctx; a non-chain sampler would be misread here
+
+                const double t_sampler_ms = 1e-3 * p->t_sample_us;
+
+                const int32_t n_sampler = std::max(0, p->n_sample); // may be 0, in which case the ratios below divide by zero (floating point)
+
+                LLAMA_LOG_INFO("%s:    sampling time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+                        __func__, t_sampler_ms, n_sampler, t_sampler_ms / n_sampler, 1e3 / t_sampler_ms * n_sampler);
+            } break;
+        default:
+            GGML_ABORT("invalid perf type"); // any enumerator value other than the two handled above aborts
+    }
+}
+
+void llama_perf_reset(void * ctx, enum llama_perf_type type) { // clears the perf counters of the object behind ctx; type selects what ctx actually points to
+    switch (type) {
+        case LLAMA_PERF_TYPE_CONTEXT:
+            {
+                auto * p = (struct llama_context *) ctx; // unchecked cast: the caller must really pass a llama_context for this type
+
+                p->t_start_us  = ggml_time_us(); // restart the time reference; note that t_load_us is not touched by this reset
+                p->t_eval_us   = p->n_eval = 0; // chained assignment zeroes both the count and the accumulated time
+                p->t_p_eval_us = p->n_p_eval = 0;
+            } break;
+        case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
+            {
+                auto * smpl = (struct llama_sampler *) ctx; // unchecked cast: the caller must pass a llama_sampler for this type
+                auto * p = (struct llama_sampler_chain *) smpl->ctx; // the chain's counters live in the sampler's ctx
+
+                p->t_sample_us = p->n_sample = 0; // zero both the sample count and the accumulated sampling time
+            } break;
+        default:
+            GGML_ABORT("invalid perf type"); // any enumerator value other than the two handled above aborts
+    }
+}
+
+void llama_perf_dump_yaml(FILE * stream, const llama_context * ctx) {
    fprintf(stream, "\n");
    fprintf(stream, "###########\n");
    fprintf(stream, "# Timings #\n");