diff --git a/common/common.cpp b/common/common.cpp
index 498215036..b294dd6bf 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -828,7 +828,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         }
         llama_kv_cache_clear(lctx);
         llama_synchronize(lctx);
-        llama_perf_reset(lctx, LLAMA_PERF_TYPE_CONTEXT);
+        llama_perf_reset_context(lctx);
     }
 
     iparams.model = model;
diff --git a/common/sampling.cpp b/common/sampling.cpp
index ee290f82a..21403e213 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -257,10 +257,10 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
     // TODO: measure grammar performance
 
     if (gsmpl) {
-        llama_perf_print(gsmpl->chain, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
+        llama_perf_print_sampler(gsmpl->chain);
     }
 
     if (ctx) {
-        llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+        llama_perf_print_context(ctx);
     }
 }
diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp
index a91e7f4bd..931a05286 100644
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -209,7 +209,7 @@ int main(int argc, char ** argv) {
     }
 
     LOG_TEE("\n");
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_print_context(ctx);
 
     llama_batch_free(batch);
 
diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift
index 9f7c49492..a6c1b64e9 100644
--- a/examples/batched.swift/Sources/main.swift
+++ b/examples/batched.swift/Sources/main.swift
@@ -200,8 +200,8 @@ let t_main_end = ggml_time_us()
 
 print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n\n")
 
-llama_perf_print(UnsafeRawPointer(context), LLAMA_PERF_TYPE_CONTEXT)
-llama_perf_print(UnsafeRawPointer(smpl), LLAMA_PERF_TYPE_SAMPLER_CHAIN)
+llama_perf_print_sampler(smpl)
+llama_perf_print_context(context)
 
 private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
     let utf8Count = text.utf8.count
diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp
index 5d32153fe..8dc35e73d 100644
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -229,8 +229,8 @@ int main(int argc, char ** argv) {
             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
 
     LOG_TEE("\n");
-    llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_print_sampler(smpl);
+    llama_perf_print_context(ctx);
 
     fprintf(stderr, "\n");
 
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index da7c79253..e7134608a 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -308,7 +308,7 @@ int main(int argc, char ** argv) {
     }
 
     LOG_TEE("\n");
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_print_context(ctx);
 
     // clean up
     llama_batch_free(batch);
diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp
index bc7203143..6f0e59dd8 100644
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -182,7 +182,7 @@ int main(int argc, char ** argv) {
     }
 
     LOG_TEE("\n");
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_print_context(ctx);
 
     llama_free(ctx);
     llama_free_model(model);
diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
index 032a90136..e9eda9575 100644
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -637,7 +637,7 @@ int main(int argc, char ** argv) {
     g_collector.save_imatrix();
 
     LOG_TEE("\n");
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_print_context(ctx);
 
     llama_free(ctx);
     llama_free_model(model);
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index d7db5af72..451b8d9ff 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -1630,7 +1630,7 @@ int main(int argc, char ** argv) {
             fflush(p_err->fout);
         }
 
-        llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+        llama_perf_print_context(ctx);
 
         llama_free(ctx);
 
diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp
index e9108a9bd..3419da4d4 100644
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -308,7 +308,7 @@ int main(int argc, char ** argv) {
             // process the prompt
             process_prompt(ctx_llava, image_embed, &params, params.prompt);
 
-            llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
+            llama_perf_print_context(ctx_llava->ctx_llama);
             llava_image_embed_free(image_embed);
             ctx_llava->model = NULL;
             llava_free(ctx_llava);
@@ -325,7 +325,7 @@ int main(int argc, char ** argv) {
             // process the prompt
             process_prompt(ctx_llava, image_embed, &params, params.prompt);
 
-            llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
+            llama_perf_print_context(ctx_llava->ctx_llama);
             llava_image_embed_free(image_embed);
             ctx_llava->model = NULL;
             llava_free(ctx_llava);
diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp
index 3475bbce5..56135cf2f 100644
--- a/examples/llava/minicpmv-cli.cpp
+++ b/examples/llava/minicpmv-cli.cpp
@@ -319,7 +319,7 @@ int main(int argc, char ** argv) {
         }
     }
     printf("\n");
-    llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_print_context(ctx_llava->ctx_llama);
 
     ctx_llava->model = NULL;
     llava_free(ctx_llava);
diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp
index fff44a499..be6f8d7d7 100644
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -240,8 +240,7 @@ int main(int argc, char ** argv){
     LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
 
     LOG_TEE("\ntarget:\n\n");
-    llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    gpt_perf_print(ctx, smpl);
 
     gpt_sampler_free(smpl);
 
diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp
index bc6301311..827748781 100644
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -415,7 +415,7 @@ int main(int argc, char ** argv) {
     LOG_TEE("\n");
 
     // TODO: print sampling/grammar timings for all clients
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_print_context(ctx);
 
     llama_batch_free(batch);
 
diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp
index d3d5ab46f..e53513b41 100644
--- a/examples/passkey/passkey.cpp
+++ b/examples/passkey/passkey.cpp
@@ -256,7 +256,7 @@ int main(int argc, char ** argv) {
             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
 
     LOG_TEE("\n");
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_print_context(ctx);
 
     fprintf(stderr, "\n");
 
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index c7d617988..cfb3b5e05 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -2049,7 +2049,7 @@ int main(int argc, char ** argv) {
     }
 
     LOG_TEE("\n");
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_print_context(ctx);
     write_logfile(ctx, params, model, results);
 
     llama_free(ctx);
diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp
index 7a360b731..ef20aa86b 100644
--- a/examples/retrieval/retrieval.cpp
+++ b/examples/retrieval/retrieval.cpp
@@ -292,7 +292,7 @@ int main(int argc, char ** argv) {
     }
 
     LOG_TEE("\n");
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_print_context(ctx);
 
     // clean up
     llama_batch_free(query_batch);
diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp
index 3fdc04394..dabd619ea 100644
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -154,8 +154,8 @@ int main(int argc, char ** argv) {
             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
 
     LOG_TEE("\n");
-    llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
-    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_print_sampler(smpl);
+    llama_perf_print_context(ctx);
 
     fprintf(stderr, "\n");
 
diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index 214e4932b..f82c21ce8 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -616,7 +616,7 @@ int main(int argc, char ** argv) {
     LOG_TEE("\ndraft:\n\n");
 
     // TODO: print sampling/grammar timings for all drafts
-    llama_perf_print(ctx_dft, LLAMA_PERF_TYPE_CONTEXT);
+    llama_perf_print_context(ctx_dft);
 
     LOG_TEE("\ntarget:\n\n");
     gpt_perf_print(ctx_tgt, smpl);
diff --git a/include/llama.h b/include/llama.h
index cc488f5a6..e63daea9b 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -1169,9 +1169,7 @@ extern "C" {
     // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
     //
 
-    // performance timing information
-    struct llama_perf_data {
-        // llama_context
+    struct llama_perf_data_context {
         double t_start_ms;
         double t_load_ms;
         double t_p_eval_ms;
@@ -1179,22 +1177,22 @@ extern "C" {
         int32_t n_p_eval;
         int32_t n_eval;
+    };
 
-        // llama_sampler_chain
+    struct llama_perf_data_sampler {
         double t_sample_ms;
 
         int32_t n_sample;
     };
 
-    enum llama_perf_type {
-        LLAMA_PERF_TYPE_CONTEXT = 0,
-        LLAMA_PERF_TYPE_SAMPLER_CHAIN = 1,
-    };
+    LLAMA_API struct llama_perf_data_context llama_perf_context(const struct llama_context * ctx);
+    LLAMA_API struct llama_perf_data_sampler llama_perf_sampler(const struct llama_sampler * chain);
 
-    LLAMA_API struct llama_perf_data llama_perf_get(const void * ctx, enum llama_perf_type type);
+    LLAMA_API void llama_perf_print_context(const struct llama_context * ctx);
+    LLAMA_API void llama_perf_print_sampler(const struct llama_sampler * chain);
 
-    LLAMA_API void llama_perf_print(const void * ctx, enum llama_perf_type type);
-    LLAMA_API void llama_perf_reset(      void * ctx, enum llama_perf_type type);
+    LLAMA_API void llama_perf_reset_context(struct llama_context * ctx);
+    LLAMA_API void llama_perf_reset_sampler(struct llama_sampler * chain);
 
     LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);
 
diff --git a/src/llama.cpp b/src/llama.cpp
index 22f7cbbf0..085a8cd3b 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -20687,87 +20687,68 @@ const char * llama_print_system_info(void) {
     return s.c_str();
 }
 
-llama_perf_data llama_perf_get(const void * ctx, enum llama_perf_type type) {
-    llama_perf_data data = {};
+struct llama_perf_data_context llama_perf_context(const struct llama_context * ctx) {
+    struct llama_perf_data_context data = {};
 
     if (ctx == nullptr) {
         return data;
     }
 
-    switch (type) {
-        case LLAMA_PERF_TYPE_CONTEXT:
-            {
-                const auto * p = (const struct llama_context *) ctx;
-
-                data.t_start_ms = 1e-3 * p->t_start_us;
-                data.t_load_ms = 1e-3 * p->t_load_us;
-                data.t_p_eval_ms = 1e-3 * p->t_p_eval_us;
-                data.t_eval_ms = 1e-3 * p->t_eval_us;
-                data.n_p_eval = std::max(1, p->n_p_eval);
-                data.n_eval = std::max(1, p->n_eval);
-            } break;
-        case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
-            {
-                const auto * smpl = (const struct llama_sampler *) ctx;
-                const auto * p = (const struct llama_sampler_chain *) smpl->ctx;
-
-                data.t_sample_ms = 1e-3 * p->t_sample_us;
-                data.n_sample = std::max(0, p->n_sample);
-            } break;
-        default:
-            GGML_ABORT("invalid perf type");
-    }
+    data.t_start_ms = 1e-3 * ctx->t_start_us;
+    data.t_load_ms = 1e-3 * ctx->t_load_us;
+    data.t_p_eval_ms = 1e-3 * ctx->t_p_eval_us;
+    data.t_eval_ms = 1e-3 * ctx->t_eval_us;
+    data.n_p_eval = std::max(1, ctx->n_p_eval);
+    data.n_eval = std::max(1, ctx->n_eval);
 
     return data;
 }
 
-void llama_perf_print(const void * ctx, enum llama_perf_type type) {
-    switch (type) {
-        case LLAMA_PERF_TYPE_CONTEXT:
-            {
-                const auto data = llama_perf_get(ctx, type);
+struct llama_perf_data_sampler llama_perf_sampler(const struct llama_sampler * chain) {
+    struct llama_perf_data_sampler data = {};
 
-                const double t_end_ms = 1e-3 * ggml_time_us();
-
-                LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, data.t_load_ms);
-                LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-                        __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
-                LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-                        __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
-                LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
-            } break;
-        case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
-            {
-                const auto data = llama_perf_get(ctx, type);
-
-                LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-                        __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample);
-            } break;
-        default:
-            GGML_ABORT("invalid perf type");
+    if (chain == nullptr) {
+        return data;
     }
+
+    const auto * p = (const struct llama_sampler_chain *) chain->ctx;
+
+    data.t_sample_ms = 1e-3 * p->t_sample_us;
+    data.n_sample = std::max(0, p->n_sample);
+
+    return data;
 }
 
-void llama_perf_reset(void * ctx, enum llama_perf_type type) {
-    switch (type) {
-        case LLAMA_PERF_TYPE_CONTEXT:
-            {
-                auto * p = (struct llama_context *) ctx;
+void llama_perf_print_context(const struct llama_context * ctx) {
+    const auto data = llama_perf_context(ctx);
 
-                p->t_start_us = ggml_time_us();
-                p->t_eval_us = p->n_eval = 0;
-                p->t_p_eval_us = p->n_p_eval = 0;
-            } break;
-        case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
-            {
-                auto * smpl = (struct llama_sampler *) ctx;
-                auto * p = (struct llama_sampler_chain *) smpl->ctx;
+    const double t_end_ms = 1e-3 * ggml_time_us();
 
-                p->t_sample_us = p->n_sample = 0;
-            } break;
-        default:
-            GGML_ABORT("invalid perf type");
-    }
+    LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, data.t_load_ms);
+    LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
+    LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
+    LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
+}
+
+void llama_perf_print_sampler(const struct llama_sampler * chain) {
+    const auto data = llama_perf_sampler(chain);
+
+    LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample);
+}
+
+void llama_perf_reset_context(struct llama_context * ctx) {
+    ctx->t_start_us = ggml_time_us();
+    ctx->t_eval_us = ctx->n_eval = 0;
+    ctx->t_p_eval_us = ctx->n_p_eval = 0;
+}
+
+void llama_perf_reset_sampler(struct llama_sampler * chain) {
+    auto * p = (struct llama_sampler_chain *) chain->ctx;
+
+    p->t_sample_us = p->n_sample = 0;
 }
 
 void llama_perf_dump_yaml(FILE * stream, const llama_context * ctx) {
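
For reference, here is a minimal migration sketch for third-party code that was using the old enum-based calls. It only uses the functions declared in the `include/llama.h` hunk above; the `report_perf` helper name is hypothetical, and `ctx` / `smpl` are assumed to be a valid `llama_context` and sampler chain created by the usual llama.cpp setup code.

```cpp
// Hypothetical helper showing the split perf API from this change.
// Assumes `ctx` and `smpl` were created elsewhere (llama_new_context_with_model,
// llama_sampler_chain_init, ...).
#include "llama.h"

#include <cstdio>

static void report_perf(struct llama_context * ctx, struct llama_sampler * smpl) {
    // old: llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
    //      llama_perf_print(ctx,  LLAMA_PERF_TYPE_CONTEXT);
    llama_perf_print_sampler(smpl);
    llama_perf_print_context(ctx);

    // the raw counters are also available without printing:
    const struct llama_perf_data_context pc = llama_perf_context(ctx);
    const struct llama_perf_data_sampler ps = llama_perf_sampler(smpl);
    fprintf(stderr, "evaluated %d tokens, sampled %d tokens\n", pc.n_eval, ps.n_sample);

    // old: llama_perf_reset(ctx, LLAMA_PERF_TYPE_CONTEXT);
    llama_perf_reset_context(ctx);
    llama_perf_reset_sampler(smpl);
}
```

The design choice in this change is to trade the `void *` + `enum llama_perf_type` dispatch for type-safe, per-object entry points, so passing the wrong pointer kind becomes a compile-time error instead of a runtime `GGML_ABORT`.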