diff --git a/common/common.cpp b/common/common.cpp
index 2a51649a5..639430131 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2533,7 +2533,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         }
         llama_kv_cache_clear(lctx);
         llama_synchronize(lctx);
-        llama_reset_timings(lctx, nullptr);
+        llama_perf_reset(lctx, LLAMA_PERF_TYPE_CONTEXT);
     }
 
     iparams.model   = model;
diff --git a/common/sampling.cpp b/common/sampling.cpp
index 9964501da..553aefbf4 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -153,7 +153,7 @@ std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
 struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
     llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
 
-    lparams.no_timing = false;
+    lparams.no_perf = false; // TODO: control via params
 
     auto * result = new gpt_sampler {
         /* .params = */ params,
@@ -270,8 +270,15 @@ llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl) {
     return gsmpl->prev.rat(0);
 }
 
-void gpt_print_timings(const struct llama_context * ctx, const struct gpt_sampler * gsmpl) {
-    llama_print_timings(ctx, gsmpl ? gsmpl->chain : nullptr);
+void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl) {
+    // TODO: measure grammar performance
+
+    if (gsmpl) {
+        llama_perf_print(gsmpl->chain, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
+    }
+    if (ctx) {
+        llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    }
 }
 
 llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
diff --git a/common/sampling.h b/common/sampling.h
index d88038204..fa691cda2 100644
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -62,6 +62,8 @@ struct gpt_sampler_params {
 // - grammar support
 // - custom sampler logic based on the parameters
 //
+// TODO: measure grammar performance
+//
 struct gpt_sampler;
 
 struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params);
@@ -75,11 +77,9 @@ void gpt_sampler_reset  (struct gpt_sampler * gsmpl);
 
 llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl);
 
-//llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_token_data_array * cur_p);
-
 llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl);
 
-void gpt_print_timings(const struct llama_context * ctx, const struct gpt_sampler * gsmpl);
+void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl);
 
 // extended sampling implementation:
 //
diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp
index b02ef74cf..b043c74cc 100644
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -210,7 +210,8 @@ int main(int argc, char ** argv) {
         }
     }
 
-    llama_print_timings(ctx, nullptr);
+    LOG_TEE("\n");
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
 
     llama_batch_free(batch);
diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp
index b6e98fcc3..7ab7eed79 100644
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -231,7 +231,9 @@ int main(int argc, char ** argv) {
     LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
 
-    llama_print_timings(ctx, smpl);
+    LOG_TEE("\n");
+    llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
+    llama_perf_print(ctx,  LLAMA_PERF_TYPE_CONTEXT);
 
     fprintf(stderr, "\n");
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index 4b288b460..e5e0872b1 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -307,8 +307,10 @@ int main(int argc, char ** argv) {
         if (notArray) fprintf(stdout, "\n}\n");
     }
 
+    LOG_TEE("\n");
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+
     // clean up
-    llama_print_timings(ctx, nullptr);
     llama_batch_free(batch);
     llama_free(ctx);
     llama_free_model(model);
diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp
index 166ca4b7d..aea15c864 100644
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -181,7 +181,8 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    llama_print_timings(ctx, nullptr);
+    LOG_TEE("\n");
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
 
     llama_free(ctx);
     llama_free_model(model);
diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp
index b402abbb8..4e801c69d 100644
--- a/examples/gritlm/gritlm.cpp
+++ b/examples/gritlm/gritlm.cpp
@@ -171,8 +171,12 @@ int main(int argc, char * argv[]) {
 
     auto sparams = llama_sampler_chain_default_params();
 
+    sparams.no_perf = false;
+
     llama_sampler * smpl = llama_sampler_chain_init(sparams);
 
+    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
+
     // ### Embedding/Representation ###
     // samples taken from: https://github.com/ContextualAI/gritlm#basic
     {
diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
index 1c7f53505..107f8c885 100644
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -638,7 +638,8 @@ int main(int argc, char ** argv) {
 
     g_collector.save_imatrix();
 
-    llama_print_timings(ctx, nullptr);
+    LOG_TEE("\n");
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
 
     llama_free(ctx);
     llama_free_model(model);
diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp
index 3895b586e..1ebc0b324 100644
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -81,7 +81,7 @@ static void write_logfile(
     yaml_dump_string_multiline(logfile, "output", output.c_str());
     yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
 
-    llama_dump_timing_info_yaml(logfile, ctx);
+    llama_perf_dump_yaml(logfile, ctx);
 
     fclose(logfile);
 }
@@ -93,7 +93,7 @@ static void sigint_handler(int signo) {
         } else {
             console::cleanup();
             printf("\n");
-            gpt_print_timings(*g_ctx, *g_smpl);
+            gpt_perf_print(*g_ctx, *g_smpl);
             write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
             _exit(130);
         }
@@ -634,7 +634,8 @@ int main(int argc, char ** argv) {
         fflush(stdout);
     }
 
-    gpt_print_timings(ctx, smpl);
+    LOG_TEE("\n");
+    gpt_perf_print(ctx, smpl);
     write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
 
     llama_free(ctx);
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index 1385634dd..d7db5af72 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -1630,7 +1630,7 @@ int main(int argc, char ** argv) {
             fflush(p_err->fout);
         }
 
-        llama_print_timings(ctx, nullptr);
+        llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
 
         llama_free(ctx);
diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp
index 63a75c4a3..4d7ccc91f 100644
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -310,7 +310,7 @@ int main(int argc, char ** argv) {
             // process the prompt
             process_prompt(ctx_llava, image_embed, &params, params.prompt);
 
-            llama_print_timings(ctx_llava->ctx_llama, nullptr);
+            llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
             llava_image_embed_free(image_embed);
             ctx_llava->model = NULL;
             llava_free(ctx_llava);
@@ -327,7 +327,7 @@ int main(int argc, char ** argv) {
             // process the prompt
             process_prompt(ctx_llava, image_embed, &params, params.prompt);
 
-            llama_print_timings(ctx_llava->ctx_llama, nullptr);
+            llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
             llava_image_embed_free(image_embed);
             ctx_llava->model = NULL;
             llava_free(ctx_llava);
diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp
index 15f258b91..237da9429 100644
--- a/examples/llava/minicpmv-cli.cpp
+++ b/examples/llava/minicpmv-cli.cpp
@@ -319,7 +319,7 @@ int main(int argc, char ** argv) {
             }
         }
         printf("\n");
-        llama_print_timings(ctx_llava->ctx_llama, nullptr);
+        llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
 
         ctx_llava->model = NULL;
         llava_free(ctx_llava);
diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp
index 8b461555b..c2e931c65 100644
--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@@ -467,7 +467,8 @@ int main(int argc, char ** argv) {
     LOG_TEE("n_predict = %d\n", n_predict);
     LOG_TEE("n_accept  = %d\n", n_accept);
 
-    gpt_print_timings(ctx, smpl);
+    LOG_TEE("\n");
+    gpt_perf_print(ctx, smpl);
 
     gpt_sampler_free(smpl);
diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp
index da3583f3c..071400b7e 100644
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -238,8 +238,9 @@ int main(int argc, char ** argv){
     LOG_TEE("n_accept  = %d\n", n_accept);
     LOG_TEE("accept    = %.3f%%\n", 100.0f * n_accept / n_drafted);
 
-    LOG_TEE("\ntarget:\n");
-    gpt_print_timings(ctx, smpl);
+    LOG_TEE("\ntarget:\n\n");
+    llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
 
     gpt_sampler_free(smpl);
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 85dea9782..42058d41d 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -93,7 +93,7 @@ static void write_logfile(
     yaml_dump_string_multiline(logfile, "output", output.c_str());
     yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
 
-    llama_dump_timing_info_yaml(logfile, ctx);
+    llama_perf_dump_yaml(logfile, ctx);
     fclose(logfile);
 }
@@ -106,7 +106,7 @@ static void sigint_handler(int signo) {
         } else {
             console::cleanup();
             printf("\n");
-            gpt_print_timings(*g_ctx, *g_smpl);
+            gpt_perf_print(*g_ctx, *g_smpl);
             write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
             _exit(130);
         }
@@ -929,7 +929,8 @@ int main(int argc, char ** argv) {
         llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
     }
 
-    gpt_print_timings(ctx, smpl);
+    LOG_TEE("\n");
+    gpt_perf_print(ctx, smpl);
     write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
 
     gpt_sampler_free(smpl);
diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp
index 7422042db..c331c0f28 100644
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -414,7 +414,7 @@ int main(int argc, char ** argv) {
     LOG_TEE("\n");
 
     // TODO: print sampling/grammar timings for all clients
-    llama_print_timings(ctx, nullptr);
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
 
     llama_batch_free(batch);
diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp
index 92c71c5a1..ff8d0302f 100644
--- a/examples/passkey/passkey.cpp
+++ b/examples/passkey/passkey.cpp
@@ -259,7 +259,8 @@ int main(int argc, char ** argv) {
     LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
 
-    llama_print_timings(ctx, nullptr);
+    LOG_TEE("\n");
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
 
     fprintf(stderr, "\n");
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index 987236ab6..2ca43f125 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -76,7 +76,7 @@ static void write_logfile(
     fprintf(logfile, "ppl_value: %f\n", results.ppl_value);
     yaml_dump_vector_float(logfile, "probs", results.probs);
 
-    llama_dump_timing_info_yaml(logfile, ctx);
+    llama_perf_dump_yaml(logfile, ctx);
     fclose(logfile);
 }
@@ -2048,7 +2048,8 @@ int main(int argc, char ** argv) {
         results = perplexity(ctx, params, n_ctx);
     }
 
-    llama_print_timings(ctx, nullptr);
+    LOG_TEE("\n");
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
     write_logfile(ctx, params, model, results);
 
     llama_free(ctx);
diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp
index 6089344a0..7eb947650 100644
--- a/examples/retrieval/retrieval.cpp
+++ b/examples/retrieval/retrieval.cpp
@@ -293,8 +293,10 @@ int main(int argc, char ** argv) {
         }
     }
 
+    LOG_TEE("\n");
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+
     // clean up
-    llama_print_timings(ctx, nullptr);
     llama_batch_free(query_batch);
     llama_free(ctx);
     llama_free_model(model);
diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp
index e5dfeb2f4..8a0ad43ad 100644
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -57,8 +57,12 @@ int main(int argc, char ** argv) {
 
     auto sparams = llama_sampler_chain_default_params();
 
+    sparams.no_perf = false;
+
     llama_sampler * smpl = llama_sampler_chain_init(sparams);
 
+    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
+
     // tokenize the prompt
 
     std::vector<llama_token> tokens_list;
@@ -153,7 +157,9 @@ int main(int argc, char ** argv) {
     LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
 
-    llama_print_timings(ctx, nullptr);
+    LOG_TEE("\n");
+    llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
 
     fprintf(stderr, "\n");
diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index 037d5d34b..55c6bda70 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -613,12 +613,12 @@ int main(int argc, char ** argv) {
     LOG_TEE("n_accept  = %d\n", n_accept);
     LOG_TEE("accept    = %.3f%%\n", 100.0f * n_accept / n_drafted);
 
-    LOG_TEE("\ndraft:\n");
+    LOG_TEE("\ndraft:\n\n");
     // TODO: print sampling/grammar timings for all drafts
-    gpt_print_timings(ctx_dft, nullptr);
+    llama_perf_print(ctx_dft, LLAMA_PERF_TYPE_CONTEXT);
 
-    LOG_TEE("\ntarget:\n");
-    gpt_print_timings(ctx_tgt, smpl);
+    LOG_TEE("\ntarget:\n\n");
+    gpt_perf_print(ctx_tgt, smpl);
 
     gpt_sampler_free(smpl);
     for (int s = 0; s < n_seq_dft; ++s) {
diff --git a/include/llama.h b/include/llama.h
index 29c216f2d..be01dadfb 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -342,6 +342,7 @@ extern "C" {
         bool embeddings;  // if true, extract embeddings (together with logits)
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
         bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
+        //bool no_perf;   // whether to measure performance timings, TODO: implement
 
         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
@@ -371,23 +372,9 @@ extern "C" {
     } llama_logit_bias;
 
     typedef struct llama_sampler_chain_params {
-        bool no_timing; // whether to measure performance timings
+        bool no_perf; // whether to measure performance timings
     } llama_sampler_chain_params;
 
-    // performance timing information
-    struct llama_timings {
-        double t_start_ms;
-        double t_end_ms;
-        double t_load_ms;
-        double t_sampler_ms;
-        double t_p_eval_ms;
-        double t_eval_ms;
-
-        int32_t n_sampler;
-        int32_t n_p_eval;
-        int32_t n_eval;
-    };
-
     // used in chat template
     typedef struct llama_chat_message {
         const char * role;
@@ -1121,13 +1108,6 @@ extern "C" {
     // Returns the split_prefix length.
     LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
 
-    // Performance information
-    LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
-
-    // note: requires llama_sampler_chain. how to prevent misuse?
-    LLAMA_API void llama_print_timings(const struct llama_context * ctx, const struct llama_sampler * chain);
-    LLAMA_API void llama_reset_timings(      struct llama_context * ctx,       struct llama_sampler * chain);
-
     // Print system information
     LLAMA_API const char * llama_print_system_info(void);
 
@@ -1135,7 +1115,21 @@ extern "C" {
     // If this is not called, or NULL is supplied, everything is output on stderr.
     LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
 
-    LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
+    //
+    // Performance utils
+    //
+    // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
+    //
+
+    enum llama_perf_type {
+        LLAMA_PERF_TYPE_CONTEXT       = 0,
+        LLAMA_PERF_TYPE_SAMPLER_CHAIN = 1,
+    };
+
+    LLAMA_API void llama_perf_print(const void * ctx, enum llama_perf_type type);
+    LLAMA_API void llama_perf_reset(      void * ctx, enum llama_perf_type type);
+
+    LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);
 
 #ifdef __cplusplus
 }
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index 2aa4981ce..15d5b5f8a 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -511,7 +511,7 @@ static struct llama_sampler_i llama_sampler_chain_i = {
     /* .accept = */ [](struct llama_sampler * smpl, llama_token token) {
         auto * chain = (llama_sampler_chain *) smpl->ctx;
 
-        time_meas tm(chain->t_sample_us, chain->params.no_timing);
+        time_meas tm(chain->t_sample_us, chain->params.no_perf);
 
         for (auto * smpl : chain->samplers) {
             llama_sampler_accept(smpl, token);
@@ -522,7 +522,7 @@ static struct llama_sampler_i llama_sampler_chain_i = {
     /* .apply = */ [](struct llama_sampler * smpl, llama_token_data_array * cur_p) {
         auto * chain = (llama_sampler_chain *) smpl->ctx;
 
-        time_meas tm(chain->t_sample_us, chain->params.no_timing);
+        time_meas tm(chain->t_sample_us, chain->params.no_perf);
 
         for (auto * smpl : chain->samplers) {
             llama_sampler_apply(smpl, cur_p);
diff --git a/src/llama.cpp b/src/llama.cpp
index f5e01004f..c67f3638d 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -17924,7 +17924,7 @@ struct llama_context_params llama_context_default_params() {
 
 struct llama_sampler_chain_params llama_sampler_chain_default_params() {
     struct llama_sampler_chain_params result = {
-        /*.no_timing =*/ false, // TODO: change to true and set explicitly in examples
+        /*.no_perf =*/ true,
     };
 
     return result;
@@ -20650,45 +20650,6 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int split_no, int split_count) {
     return 0;
 }
 
-void llama_print_timings(const struct llama_context * ctx, const struct llama_sampler * chain) {
-    auto * smpl = chain ? (const struct llama_sampler_chain *) chain->ctx : nullptr;
-
-    const llama_timings timings = {
-        /*.t_start_ms   =*/ 1e-3 * ctx->t_start_us,
-        /*.t_end_ms     =*/ 1.00 * ggml_time_ms(),
-        /*.t_load_ms    =*/ 1e-3 * ctx->t_load_us,
-        /*.t_sampler_ms =*/ 1e-3 * (smpl ? smpl->t_sample_us : 0.0),
-        /*.t_p_eval_ms  =*/ 1e-3 * ctx->t_p_eval_us,
-        /*.t_eval_ms    =*/ 1e-3 * ctx->t_eval_us,
-
-        /*.n_sampler    =*/ std::max(0, smpl ? smpl->n_sample : 0),
-        /*.n_p_eval     =*/ std::max(0, ctx->n_p_eval),
-        /*.n_eval       =*/ std::max(1, ctx->n_eval),
-    };
-
-    LLAMA_LOG_INFO("\n");
-    LLAMA_LOG_INFO("%s:        load time = %10.2f ms\n", __func__, timings.t_load_ms);
-    LLAMA_LOG_INFO("%s:    sampling time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, timings.t_sampler_ms, timings.n_sampler, timings.t_sampler_ms / timings.n_sampler, 1e3 / timings.t_sampler_ms * timings.n_sampler);
-    LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
-    LLAMA_LOG_INFO("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
-    LLAMA_LOG_INFO("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval));
-}
-
-void llama_reset_timings(struct llama_context * ctx, struct llama_sampler * chain) {
-    ctx->t_start_us  = ggml_time_us();
-    ctx->t_eval_us   = ctx->n_eval   = 0;
-    ctx->t_p_eval_us = ctx->n_p_eval = 0;
-
-    if (chain) {
-        auto * smpl = (struct llama_sampler_chain *) chain->ctx;
-
-        smpl->t_sample_us = smpl->n_sample = 0;
-    }
-}
-
 const char * llama_print_system_info(void) {
     static std::string s;
 
@@ -20717,7 +20678,68 @@ const char * llama_print_system_info(void) {
     return s.c_str();
 }
 
-void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
+void llama_perf_print(const void * ctx, enum llama_perf_type type) {
+    switch (type) {
+        case LLAMA_PERF_TYPE_CONTEXT:
+            {
+                const auto * p = (const struct llama_context *) ctx;
+
+                const double t_start_ms  = 1e-3 * p->t_start_us;
+                const double t_end_ms    = 1.00 * ggml_time_ms();
+                const double t_load_ms   = 1e-3 * p->t_load_us;
+                const double t_p_eval_ms = 1e-3 * p->t_p_eval_us;
+                const double t_eval_ms   = 1e-3 * p->t_eval_us;
+
+                const int32_t n_p_eval = std::max(0, p->n_p_eval);
+                const int32_t n_eval   = std::max(1, p->n_eval);
+
+                LLAMA_LOG_INFO("%s:        load time = %10.2f ms\n", __func__, t_load_ms);
+                LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+                        __func__, t_p_eval_ms, n_p_eval, t_p_eval_ms / n_p_eval, 1e3 / t_p_eval_ms * n_p_eval);
+                LLAMA_LOG_INFO("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+                        __func__, t_eval_ms, n_eval, t_eval_ms / n_eval, 1e3 / t_eval_ms * n_eval);
+                LLAMA_LOG_INFO("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - t_start_ms), (n_p_eval + n_eval));
+            } break;
+        case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
+            {
+                const auto * smpl = (const struct llama_sampler *) ctx;
+                const auto * p = (const struct llama_sampler_chain *) smpl->ctx;
+
+                const double t_sampler_ms = 1e-3 * p->t_sample_us;
+
+                const int32_t n_sampler = std::max(0, p->n_sample);
+
+                LLAMA_LOG_INFO("%s:    sampling time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+                        __func__, t_sampler_ms, n_sampler, t_sampler_ms / n_sampler, 1e3 / t_sampler_ms * n_sampler);
+            } break;
+        default:
+            GGML_ABORT("invalid perf type");
+    }
+}
+
+void llama_perf_reset(void * ctx, enum llama_perf_type type) {
+    switch (type) {
+        case LLAMA_PERF_TYPE_CONTEXT:
+            {
+                auto * p = (struct llama_context *) ctx;
+
+                p->t_start_us  = ggml_time_us();
+                p->t_eval_us   = p->n_eval   = 0;
+                p->t_p_eval_us = p->n_p_eval = 0;
+            } break;
+        case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
+            {
+                auto * smpl = (struct llama_sampler *) ctx;
+                auto * p = (struct llama_sampler_chain *) smpl->ctx;
+
+                p->t_sample_us = p->n_sample = 0;
+            } break;
+        default:
+            GGML_ABORT("invalid perf type");
+    }
+}
+
+void llama_perf_dump_yaml(FILE * stream, const llama_context * ctx) {
     fprintf(stream, "\n");
     fprintf(stream, "###########\n");
     fprintf(stream, "# Timings #\n");
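Usage sketch (editor's addition, not part of the patch): the snippet below illustrates how the llama_perf_* API introduced above might be driven from application code. It only uses calls that appear in this diff (llama_sampler_chain_default_params, llama_sampler_chain_init, llama_sampler_chain_add, llama_sampler_init_greedy, llama_perf_print, llama_perf_reset) plus llama_sampler_free; the function name report_perf is hypothetical, and an already-initialized llama_context is assumed (model loading and decoding are elided).

    // sketch only: assumes "llama.h" from this change set and an initialized llama_context
    #include "llama.h"

    static void report_perf(struct llama_context * ctx) {
        // build a sampler chain with perf measurement enabled
        // (per this diff, llama_sampler_chain_default_params() now defaults no_perf to true)
        llama_sampler_chain_params sparams = llama_sampler_chain_default_params();
        sparams.no_perf = false;

        llama_sampler * smpl = llama_sampler_chain_init(sparams);
        llama_sampler_chain_add(smpl, llama_sampler_init_greedy());

        // ... decode tokens with ctx and sample with smpl ...

        // print accumulated timings: sampler chain first, then the context
        llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
        llama_perf_print(ctx,  LLAMA_PERF_TYPE_CONTEXT);

        // reset the counters, e.g. between benchmark passes
        llama_perf_reset(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
        llama_perf_reset(ctx,  LLAMA_PERF_TYPE_CONTEXT);

        llama_sampler_free(smpl);
    }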