diff --git a/common/common.cpp b/common/common.cpp
index 2a51649a5..639430131 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2533,7 +2533,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         }
         llama_kv_cache_clear(lctx);
         llama_synchronize(lctx);
-        llama_reset_timings(lctx, nullptr);
+        llama_perf_reset(lctx, LLAMA_PERF_TYPE_CONTEXT);
     }
 
     iparams.model   = model;
diff --git a/common/sampling.cpp b/common/sampling.cpp
index 9964501da..553aefbf4 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -153,7 +153,7 @@ std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
 struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
     llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
 
-    lparams.no_timing = false;
+    lparams.no_perf = false; // TODO: control via params
 
     auto * result = new gpt_sampler {
         /* .params = */ params,
@@ -270,8 +270,15 @@ llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl) {
     return gsmpl->prev.rat(0);
 }
 
-void gpt_print_timings(const struct llama_context * ctx, const struct gpt_sampler * gsmpl) {
-    llama_print_timings(ctx, gsmpl ? gsmpl->chain : nullptr);
+void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl) {
+    // TODO: measure grammar performance
+
+    if (gsmpl) {
+        llama_perf_print(gsmpl->chain, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
+    }
+    if (ctx) {
+        llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    }
 }
 
 llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
diff --git a/common/sampling.h b/common/sampling.h
index d88038204..fa691cda2 100644
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -62,6 +62,8 @@ struct gpt_sampler_params {
 // - grammar support
 // - custom sampler logic based on the parameters
 //
+// TODO: measure grammar performance
+//
 struct gpt_sampler;
 
 struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params);
@@ -75,11 +77,9 @@ void gpt_sampler_reset  (struct gpt_sampler * gsmpl);
 
 llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl);
 
-//llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_token_data_array * cur_p);
-
 llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl);
 
-void gpt_print_timings(const struct llama_context * ctx, const struct gpt_sampler * gsmpl);
+void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl);
 
 // extended sampling implementation:
 //
diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp
index b02ef74cf..b043c74cc 100644
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -210,7 +210,8 @@ int main(int argc, char ** argv) {
         }
     }
 
-    llama_print_timings(ctx, nullptr);
+    LOG_TEE("\n");
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
 
     llama_batch_free(batch);
diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp
index b6e98fcc3..7ab7eed79 100644
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -231,7 +231,9 @@ int main(int argc, char ** argv) {
     LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
 
-    llama_print_timings(ctx, smpl);
+    LOG_TEE("\n");
+    llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
+    llama_perf_print(ctx,  LLAMA_PERF_TYPE_CONTEXT);
 
     fprintf(stderr, "\n");
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index 4b288b460..e5e0872b1 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -307,8 +307,10 @@ int main(int argc, char ** argv) {
         if (notArray) fprintf(stdout, "\n}\n");
     }
 
+    LOG_TEE("\n");
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+
     // clean up
-    llama_print_timings(ctx, nullptr);
     llama_batch_free(batch);
     llama_free(ctx);
     llama_free_model(model);
diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp
index 166ca4b7d..aea15c864 100644
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -181,7 +181,8 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    llama_print_timings(ctx, nullptr);
+    LOG_TEE("\n");
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
 
     llama_free(ctx);
     llama_free_model(model);
diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp
index b402abbb8..4e801c69d 100644
--- a/examples/gritlm/gritlm.cpp
+++ b/examples/gritlm/gritlm.cpp
@@ -171,8 +171,12 @@ int main(int argc, char * argv[]) {
 
     auto sparams = llama_sampler_chain_default_params();
 
+    sparams.no_perf = false;
+
     llama_sampler * smpl = llama_sampler_chain_init(sparams);
 
+    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
+
     // ### Embedding/Representation ###
     // samples taken from: https://github.com/ContextualAI/gritlm#basic
     {
diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
index 1c7f53505..107f8c885 100644
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -638,7 +638,8 @@ int main(int argc, char ** argv) {
 
     g_collector.save_imatrix();
 
-    llama_print_timings(ctx, nullptr);
+    LOG_TEE("\n");
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
 
     llama_free(ctx);
     llama_free_model(model);
diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp
index 3895b586e..1ebc0b324 100644
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -81,7 +81,7 @@ static void write_logfile(
     yaml_dump_string_multiline(logfile, "output", output.c_str());
     yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
 
-    llama_dump_timing_info_yaml(logfile, ctx);
+    llama_perf_dump_yaml(logfile, ctx);
 
     fclose(logfile);
 }
@@ -93,7 +93,7 @@ static void sigint_handler(int signo) {
         } else {
             console::cleanup();
             printf("\n");
-            gpt_print_timings(*g_ctx, *g_smpl);
+            gpt_perf_print(*g_ctx, *g_smpl);
             write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
             _exit(130);
         }
@@ -634,7 +634,8 @@ int main(int argc, char ** argv) {
         fflush(stdout);
     }
 
-    gpt_print_timings(ctx, smpl);
+    LOG_TEE("\n");
+    gpt_perf_print(ctx, smpl);
     write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
 
     llama_free(ctx);
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index 1385634dd..d7db5af72 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -1630,7 +1630,7 @@ int main(int argc, char ** argv) {
             fflush(p_err->fout);
         }
 
-        llama_print_timings(ctx, nullptr);
+        llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
 
         llama_free(ctx);
diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp
index 63a75c4a3..4d7ccc91f 100644
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -310,7 +310,7 @@ int main(int argc, char ** argv) {
             // process the prompt
             process_prompt(ctx_llava, image_embed, &params, params.prompt);
 
-            llama_print_timings(ctx_llava->ctx_llama, nullptr);
+            llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
             llava_image_embed_free(image_embed);
             ctx_llava->model = NULL;
             llava_free(ctx_llava);
@@ -327,7 +327,7 @@ int main(int argc, char ** argv) {
             // process the prompt
             process_prompt(ctx_llava, image_embed, &params, params.prompt);
 
-            llama_print_timings(ctx_llava->ctx_llama, nullptr);
+            llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
             llava_image_embed_free(image_embed);
             ctx_llava->model = NULL;
             llava_free(ctx_llava);
diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp
index 15f258b91..237da9429 100644
--- a/examples/llava/minicpmv-cli.cpp
+++ b/examples/llava/minicpmv-cli.cpp
@@ -319,7 +319,7 @@ int main(int argc, char ** argv) {
             }
         }
         printf("\n");
-        llama_print_timings(ctx_llava->ctx_llama, nullptr);
+        llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
 
         ctx_llava->model = NULL;
         llava_free(ctx_llava);
diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp
index 8b461555b..c2e931c65 100644
--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@@ -467,7 +467,8 @@ int main(int argc, char ** argv) {
     LOG_TEE("n_predict = %d\n", n_predict);
     LOG_TEE("n_accept  = %d\n", n_accept);
 
-    gpt_print_timings(ctx, smpl);
+    LOG_TEE("\n");
+    gpt_perf_print(ctx, smpl);
 
     gpt_sampler_free(smpl);
diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp
index da3583f3c..071400b7e 100644
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -238,8 +238,9 @@ int main(int argc, char ** argv){
     LOG_TEE("n_accept  = %d\n", n_accept);
     LOG_TEE("accept    = %.3f%%\n", 100.0f * n_accept / n_drafted);
 
-    LOG_TEE("\ntarget:\n");
-    gpt_print_timings(ctx, smpl);
+    LOG_TEE("\ntarget:\n\n");
+    llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
 
     gpt_sampler_free(smpl);
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 85dea9782..42058d41d 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -93,7 +93,7 @@ static void write_logfile(
     yaml_dump_string_multiline(logfile, "output", output.c_str());
     yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
 
-    llama_dump_timing_info_yaml(logfile, ctx);
+    llama_perf_dump_yaml(logfile, ctx);
     fclose(logfile);
 }
@@ -106,7 +106,7 @@ static void sigint_handler(int signo) {
         } else {
             console::cleanup();
             printf("\n");
-            gpt_print_timings(*g_ctx, *g_smpl);
+            gpt_perf_print(*g_ctx, *g_smpl);
             write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
             _exit(130);
         }
@@ -929,7 +929,8 @@ int main(int argc, char ** argv) {
         llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
     }
 
-    gpt_print_timings(ctx, smpl);
+    LOG_TEE("\n");
+    gpt_perf_print(ctx, smpl);
     write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
 
     gpt_sampler_free(smpl);
diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp
index 7422042db..c331c0f28 100644
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -414,7 +414,7 @@ int main(int argc, char ** argv) {
     LOG_TEE("\n");
 
     // TODO: print sampling/grammar timings for all clients
-    llama_print_timings(ctx, nullptr);
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
 
     llama_batch_free(batch);
diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp
index 92c71c5a1..ff8d0302f 100644
--- a/examples/passkey/passkey.cpp
+++ b/examples/passkey/passkey.cpp
@@ -259,7 +259,8 @@ int main(int argc, char ** argv) {
     LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
 
-    llama_print_timings(ctx, nullptr);
+    LOG_TEE("\n");
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
 
     fprintf(stderr, "\n");
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index 987236ab6..2ca43f125 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -76,7 +76,7 @@ static void write_logfile(
     fprintf(logfile, "ppl_value: %f\n", results.ppl_value);
     yaml_dump_vector_float(logfile, "probs", results.probs);
 
-    llama_dump_timing_info_yaml(logfile, ctx);
+    llama_perf_dump_yaml(logfile, ctx);
     fclose(logfile);
 }
@@ -2048,7 +2048,8 @@ int main(int argc, char ** argv) {
         results = perplexity(ctx, params, n_ctx);
     }
 
-    llama_print_timings(ctx, nullptr);
+    LOG_TEE("\n");
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
     write_logfile(ctx, params, model, results);
 
     llama_free(ctx);
diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp
index 6089344a0..7eb947650 100644
--- a/examples/retrieval/retrieval.cpp
+++ b/examples/retrieval/retrieval.cpp
@@ -293,8 +293,10 @@ int main(int argc, char ** argv) {
         }
     }
 
+    LOG_TEE("\n");
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+
     // clean up
-    llama_print_timings(ctx, nullptr);
     llama_batch_free(query_batch);
     llama_free(ctx);
     llama_free_model(model);
diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp
index e5dfeb2f4..8a0ad43ad 100644
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -57,8 +57,12 @@ int main(int argc, char ** argv) {
 
     auto sparams = llama_sampler_chain_default_params();
 
+    sparams.no_perf = false;
+
     llama_sampler * smpl = llama_sampler_chain_init(sparams);
 
+    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
+
     // tokenize the prompt
 
     std::vector<llama_token> tokens_list;
@@ -153,7 +157,9 @@ int main(int argc, char ** argv) {
     LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
 
-    llama_print_timings(ctx, nullptr);
+    LOG_TEE("\n");
+    llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
 
     fprintf(stderr, "\n");
diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index 037d5d34b..55c6bda70 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -613,12 +613,12 @@ int main(int argc, char ** argv) {
     LOG_TEE("n_accept  = %d\n", n_accept);
     LOG_TEE("accept    = %.3f%%\n", 100.0f * n_accept / n_drafted);
 
-    LOG_TEE("\ndraft:\n");
+    LOG_TEE("\ndraft:\n\n");
     // TODO: print sampling/grammar timings for all drafts
-    gpt_print_timings(ctx_dft, nullptr);
+    llama_perf_print(ctx_dft, LLAMA_PERF_TYPE_CONTEXT);
 
-    LOG_TEE("\ntarget:\n");
-    gpt_print_timings(ctx_tgt, smpl);
+    LOG_TEE("\ntarget:\n\n");
+    gpt_perf_print(ctx_tgt, smpl);
 
     gpt_sampler_free(smpl);
     for (int s = 0; s < n_seq_dft; ++s) {
diff --git a/include/llama.h b/include/llama.h
index 29c216f2d..be01dadfb 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -342,6 +342,7 @@ extern "C" {
         bool embeddings;  // if true, extract embeddings (together with logits)
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
         bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
+        //bool no_perf;   // whether to measure performance timings, TODO: implement
 
         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
@@ -371,23 +372,9 @@ extern "C" {
     } llama_logit_bias;
 
     typedef struct llama_sampler_chain_params {
-        bool no_timing; // whether to measure performance timings
+        bool no_perf; // whether to measure performance timings
     } llama_sampler_chain_params;
 
-    // performance timing information
-    struct llama_timings {
-        double t_start_ms;
-        double t_end_ms;
-        double t_load_ms;
-        double t_sampler_ms;
-        double t_p_eval_ms;
-        double t_eval_ms;
-
-        int32_t n_sampler;
-        int32_t n_p_eval;
-        int32_t n_eval;
-    };
-
     // used in chat template
     typedef struct llama_chat_message {
         const char * role;
@@ -1121,13 +1108,6 @@ extern "C" {
     // Returns the split_prefix length.
     LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
 
-    // Performance information
-    LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
-
-    // note: requires llama_sampler_chain. how to prevent misuse?
-    LLAMA_API void llama_print_timings(const struct llama_context * ctx, const struct llama_sampler * chain);
-    LLAMA_API void llama_reset_timings(      struct llama_context * ctx,       struct llama_sampler * chain);
-
     // Print system information
     LLAMA_API const char * llama_print_system_info(void);
 
@@ -1135,7 +1115,21 @@ extern "C" {
     // If this is not called, or NULL is supplied, everything is output on stderr.
     LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
 
-    LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
+    //
+    // Performance utils
+    //
+    // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
+    //
+
+    enum llama_perf_type {
+        LLAMA_PERF_TYPE_CONTEXT       = 0,
+        LLAMA_PERF_TYPE_SAMPLER_CHAIN = 1,
+    };
+
+    LLAMA_API void llama_perf_print(const void * ctx, enum llama_perf_type type);
+    LLAMA_API void llama_perf_reset(      void * ctx, enum llama_perf_type type);
+
+    LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);
 
 #ifdef __cplusplus
 }
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index 2aa4981ce..15d5b5f8a 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -511,7 +511,7 @@ static struct llama_sampler_i llama_sampler_chain_i = {
     /* .accept = */ [](struct llama_sampler * smpl, llama_token token) {
         auto * chain = (llama_sampler_chain *) smpl->ctx;
 
-        time_meas tm(chain->t_sample_us, chain->params.no_timing);
+        time_meas tm(chain->t_sample_us, chain->params.no_perf);
 
         for (auto * smpl : chain->samplers) {
             llama_sampler_accept(smpl, token);
@@ -522,7 +522,7 @@ static struct llama_sampler_i llama_sampler_chain_i = {
     /* .apply = */ [](struct llama_sampler * smpl, llama_token_data_array * cur_p) {
         auto * chain = (llama_sampler_chain *) smpl->ctx;
 
-        time_meas tm(chain->t_sample_us, chain->params.no_timing);
+        time_meas tm(chain->t_sample_us, chain->params.no_perf);
 
         for (auto * smpl : chain->samplers) {
             llama_sampler_apply(smpl, cur_p);
diff --git a/src/llama.cpp b/src/llama.cpp
index f5e01004f..c67f3638d 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -17924,7 +17924,7 @@ struct llama_context_params llama_context_default_params() {
 
 struct llama_sampler_chain_params llama_sampler_chain_default_params() {
     struct llama_sampler_chain_params result = {
-        /*.no_timing =*/ false, // TODO: change to true and set explicitly in examples
+        /*.no_perf =*/ true,
     };
 
     return result;
@@ -20650,45 +20650,6 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int split_no, int split_count) {
     return 0;
 }
 
-void llama_print_timings(const struct llama_context * ctx, const struct llama_sampler * chain) {
-    auto * smpl = chain ? (const struct llama_sampler_chain *) chain->ctx : nullptr;
-
-    const llama_timings timings = {
-        /*.t_start_ms   =*/ 1e-3 * ctx->t_start_us,
-        /*.t_end_ms     =*/ 1.00 * ggml_time_ms(),
-        /*.t_load_ms    =*/ 1e-3 * ctx->t_load_us,
-        /*.t_sampler_ms =*/ 1e-3 * (smpl ? smpl->t_sample_us : 0.0),
-        /*.t_p_eval_ms  =*/ 1e-3 * ctx->t_p_eval_us,
-        /*.t_eval_ms    =*/ 1e-3 * ctx->t_eval_us,
-
-        /*.n_sampler    =*/ std::max(0, smpl ? smpl->n_sample : 0),
-        /*.n_p_eval     =*/ std::max(0, ctx->n_p_eval),
-        /*.n_eval       =*/ std::max(1, ctx->n_eval),
-    };
-
-    LLAMA_LOG_INFO("\n");
-    LLAMA_LOG_INFO("%s:        load time = %10.2f ms\n", __func__, timings.t_load_ms);
-    LLAMA_LOG_INFO("%s:    sampling time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, timings.t_sampler_ms, timings.n_sampler, timings.t_sampler_ms / timings.n_sampler, 1e3 / timings.t_sampler_ms * timings.n_sampler);
-    LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
-    LLAMA_LOG_INFO("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
-    LLAMA_LOG_INFO("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval));
-}
-
-void llama_reset_timings(struct llama_context * ctx, struct llama_sampler * chain) {
-    ctx->t_start_us  = ggml_time_us();
-    ctx->t_eval_us   = ctx->n_eval   = 0;
-    ctx->t_p_eval_us = ctx->n_p_eval = 0;
-
-    if (chain) {
-        auto * smpl = (struct llama_sampler_chain *) chain->ctx;
-
-        smpl->t_sample_us = smpl->n_sample = 0;
-    }
-}
-
 const char * llama_print_system_info(void) {
     static std::string s;
 
@@ -20717,7 +20678,68 @@ const char * llama_print_system_info(void) {
     return s.c_str();
 }
 
-void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
+void llama_perf_print(const void * ctx, enum llama_perf_type type) {
+    switch (type) {
+        case LLAMA_PERF_TYPE_CONTEXT:
+            {
+                const auto * p = (const struct llama_context *) ctx;
+
+                const double t_start_ms  = 1e-3 * p->t_start_us;
+                const double t_end_ms    = 1.00 * ggml_time_ms();
+                const double t_load_ms   = 1e-3 * p->t_load_us;
+                const double t_p_eval_ms = 1e-3 * p->t_p_eval_us;
+                const double t_eval_ms   = 1e-3 * p->t_eval_us;
+
+                const int32_t n_p_eval = std::max(0, p->n_p_eval);
+                const int32_t n_eval   = std::max(1, p->n_eval);
+
+                LLAMA_LOG_INFO("%s:        load time = %10.2f ms\n", __func__, t_load_ms);
+                LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+                        __func__, t_p_eval_ms, n_p_eval, t_p_eval_ms / n_p_eval, 1e3 / t_p_eval_ms * n_p_eval);
+                LLAMA_LOG_INFO("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+                        __func__, t_eval_ms, n_eval, t_eval_ms / n_eval, 1e3 / t_eval_ms * n_eval);
+                LLAMA_LOG_INFO("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - t_start_ms), (n_p_eval + n_eval));
+            } break;
+        case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
+            {
+                const auto * smpl = (const struct llama_sampler *) ctx;
+                const auto * p = (const struct llama_sampler_chain *) smpl->ctx;
+
+                const double t_sampler_ms = 1e-3 * p->t_sample_us;
+
+                const int32_t n_sampler = std::max(0, p->n_sample);
+
+                LLAMA_LOG_INFO("%s:    sampling time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+                        __func__, t_sampler_ms, n_sampler, t_sampler_ms / n_sampler, 1e3 / t_sampler_ms * n_sampler);
+            } break;
+        default:
+            GGML_ABORT("invalid perf type");
+    }
+}
+
+void llama_perf_reset(void * ctx, enum llama_perf_type type) {
+    switch (type) {
+        case LLAMA_PERF_TYPE_CONTEXT:
+            {
+                auto * p = (struct llama_context *) ctx;
+
+                p->t_start_us  = ggml_time_us();
+                p->t_eval_us   = p->n_eval   = 0;
+                p->t_p_eval_us = p->n_p_eval = 0;
+            } break;
+        case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
+            {
+                auto * smpl = (struct llama_sampler *) ctx;
+                auto * p = (struct llama_sampler_chain *) smpl->ctx;
+
+                p->t_sample_us = p->n_sample = 0;
+            } break;
+        default:
+            GGML_ABORT("invalid perf type");
+    }
+}
+
+void llama_perf_dump_yaml(FILE * stream, const llama_context * ctx) {
     fprintf(stream, "\n");
     fprintf(stream, "###########\n");
     fprintf(stream, "# Timings #\n");
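Usage sketch (editor's addition, not part of the patch): the snippet below illustrates how the llama_perf_* API introduced above might be driven from application code. It only uses calls that appear in this diff (llama_sampler_chain_default_params, llama_sampler_chain_init, llama_sampler_chain_add, llama_sampler_init_greedy, llama_perf_print, llama_perf_reset) plus llama_sampler_free; the function name report_perf is hypothetical, and an already-initialized llama_context is assumed (model loading and decoding are elided).

    // sketch only: assumes "llama.h" from this change set and an initialized llama_context
    #include "llama.h"

    static void report_perf(struct llama_context * ctx) {
        // build a sampler chain with perf measurement enabled
        // (per this diff, llama_sampler_chain_default_params() now defaults no_perf to true)
        llama_sampler_chain_params sparams = llama_sampler_chain_default_params();
        sparams.no_perf = false;

        llama_sampler * smpl = llama_sampler_chain_init(sparams);
        llama_sampler_chain_add(smpl, llama_sampler_init_greedy());

        // ... decode tokens with ctx and sample with smpl ...

        // print accumulated timings: sampler chain first, then the context
        llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
        llama_perf_print(ctx,  LLAMA_PERF_TYPE_CONTEXT);

        // reset the counters, e.g. between benchmark passes
        llama_perf_reset(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
        llama_perf_reset(ctx,  LLAMA_PERF_TYPE_CONTEXT);

        llama_sampler_free(smpl);
    }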