From 471e7e1e594aab1ebf41b391cbb8fa618961de57 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sat, 7 Sep 2024 20:50:23 +0300
Subject: [PATCH] llama : llama_perf + option to disable timings during decode

ggml-ci
---
 common/common.cpp   |  1 +
 common/common.h     |  1 +
 common/sampling.cpp |  2 +-
 common/sampling.h   |  1 +
 include/llama.h     | 21 ++++++++++-
 src/llama.cpp       | 88 ++++++++++++++++++++++++++++++----------
 6 files changed, 83 insertions(+), 31 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index c5c4d7508..3203faddd 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2810,6 +2810,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.cb_eval_user_data = params.cb_eval_user_data;
     cparams.offload_kqv       = !params.no_kv_offload;
     cparams.flash_attn        = params.flash_attn;
+    cparams.no_perf           = params.no_perf;
 
     cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
     cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
diff --git a/common/common.h b/common/common.h
index d7c08f20a..4e2924a56 100644
--- a/common/common.h
+++ b/common/common.h
@@ -204,6 +204,7 @@ struct gpt_params {
     bool simple_io        = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching    = true;  // insert new sequences for decoding on-the-fly
     bool flash_attn       = false; // flash attention
+    bool no_perf          = false; // no perf (TODO: add llama_arg)
 
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
     bool logits_all       = false; // return logits for all tokens in the batch
diff --git a/common/sampling.cpp b/common/sampling.cpp
index 7806b77e0..c66a4582b 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -139,7 +139,7 @@ std::string gpt_sampler_params::print() const {
 struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
     llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
 
-    lparams.no_perf = false; // TODO: control via params
+    lparams.no_perf = params.no_perf;
 
     auto * result = new gpt_sampler {
         /* .params = */ params,
diff --git a/common/sampling.h b/common/sampling.h
index 654e0c513..67ad0add4 100644
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -39,6 +39,7 @@ struct gpt_sampler_params {
     float mirostat_eta = 0.10f; // learning rate
     bool  penalize_nl  = false; // consider newlines as a repeatable token
     bool  ignore_eos   = false;
+    bool  no_perf      = false; // disable performance metrics
 
     std::vector<enum gpt_sampler_type> samplers = {
         GPT_SAMPLER_TYPE_TOP_K,
diff --git a/include/llama.h b/include/llama.h
index 6334fc30d..e21a62e26 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -343,7 +343,7 @@ extern "C" {
         bool embeddings;  // if true, extract embeddings (together with logits)
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
         bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
-        //bool no_perf;   // whether to measure performance timings, TODO: implement
+        bool no_perf;     // whether to measure performance timings
 
         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
@@ -1168,11 +1168,30 @@ extern "C" {
     // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
     //
 
+    // performance timing information
+    struct llama_perf_data {
+        // llama_context
+        double t_start_ms;
+        double t_load_ms;
+        double t_p_eval_ms;
+        double t_eval_ms;
+
+        int32_t n_p_eval;
+        int32_t n_eval;
+
+        // llama_sampler_chain
+        double t_sample_ms;
+
+        int32_t n_sample;
+    };
+
     enum llama_perf_type {
         LLAMA_PERF_TYPE_CONTEXT       = 0,
         LLAMA_PERF_TYPE_SAMPLER_CHAIN = 1,
     };
 
+    LLAMA_API struct llama_perf_data llama_perf_get(const void * ctx, enum llama_perf_type type);
+
     LLAMA_API void llama_perf_print(const void * ctx, enum llama_perf_type type);
     LLAMA_API void llama_perf_reset(      void * ctx, enum llama_perf_type type);
 
diff --git a/src/llama.cpp b/src/llama.cpp
index f590bcd3b..c68f5e826 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2482,6 +2482,7 @@ struct llama_cparams {
     bool causal_attn;
     bool offload_kqv;
     bool flash_attn;
+    bool no_perf;
 
     enum llama_pooling_type pooling_type;
 
@@ -6657,8 +6658,6 @@ static bool llm_load_tensors(
         bool use_mlock,
         llama_progress_callback progress_callback,
         void * progress_callback_user_data) {
-    model.t_start_us = ggml_time_us();
-
     auto & hparams = model.hparams;
 
     model.split_mode = split_mode;
@@ -8589,14 +8588,13 @@ static bool llm_load_tensors(
         }
     }
 
-    // loading time will be recalculate after the first eval, so
-    // we take page faults deferred by mmap() into consideration
-    model.t_load_us = ggml_time_us() - model.t_start_us;
 
     return true;
 }
 
 // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
 static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
+    model.t_start_us = ggml_time_us();
+
     try {
         llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
@@ -8658,6 +8656,10 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
         return -1;
     }
 
+    // loading time will be recalculate after the first eval, so
+    // we take page faults deferred by mmap() into consideration
+    model.t_load_us = ggml_time_us() - model.t_start_us;
+
     return 0;
 }
 
@@ -17939,6 +17941,7 @@ struct llama_context_params llama_context_default_params() {
         /*.embeddings                  =*/ false,
         /*.offload_kqv                 =*/ true,
         /*.flash_attn                  =*/ false,
+        /*.no_perf                     =*/ true,
         /*.abort_callback              =*/ nullptr,
         /*.abort_callback_data         =*/ nullptr,
     };
@@ -18149,6 +18152,7 @@ struct llama_context * llama_new_context_with_model(
     cparams.embeddings       = params.embeddings;
     cparams.offload_kqv      = params.offload_kqv;
     cparams.flash_attn       = params.flash_attn;
+    cparams.no_perf          = params.no_perf;
     cparams.pooling_type     = params.pooling_type;
 
     cparams.n_ctx            = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
@@ -20067,10 +20071,14 @@ void llama_synchronize(struct llama_context * ctx) {
 
     // add the evaluation to the stats
     if (ctx->n_queued_tokens == 1) {
-        ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us;
+        if (!ctx->cparams.no_perf) {
+            ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us;
+        }
         ctx->n_eval++;
     } else if (ctx->n_queued_tokens > 1) {
-        ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us;
+        if (!ctx->cparams.no_perf) {
+            ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us;
+        }
         ctx->n_p_eval += ctx->n_queued_tokens;
     }
 
@@ -20677,39 +20685,61 @@ const char * llama_print_system_info(void) {
     return s.c_str();
 }
 
-void llama_perf_print(const void * ctx, enum llama_perf_type type) {
+llama_perf_data llama_perf_get(const void * ctx, enum llama_perf_type type) {
+    llama_perf_data data = {};
+
+    if (ctx == nullptr) {
+        return data;
+    }
+
     switch (type) {
         case LLAMA_PERF_TYPE_CONTEXT:
             {
                 const auto * p = (const struct llama_context *) ctx;
 
-                const double t_start_ms  = 1e-3 * p->t_start_us;
-                const double t_end_ms    = 1.00 * ggml_time_ms();
-                const double t_load_ms   = 1e-3 * p->t_load_us;
-                const double t_p_eval_ms = 1e-3 * p->t_p_eval_us;
-                const double t_eval_ms   = 1e-3 * p->t_eval_us;
-
-                const int32_t n_p_eval = std::max(0, p->n_p_eval);
-                const int32_t n_eval   = std::max(1, p->n_eval);
-
-                LLAMA_LOG_INFO("%s:        load time = %10.2f ms\n", __func__, t_load_ms);
-                LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-                        __func__, t_p_eval_ms, n_p_eval, t_p_eval_ms / n_p_eval, 1e3 / t_p_eval_ms * n_p_eval);
-                LLAMA_LOG_INFO("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-                        __func__, t_eval_ms, n_eval, t_eval_ms / n_eval, 1e3 / t_eval_ms * n_eval);
-                LLAMA_LOG_INFO("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - t_start_ms), (n_p_eval + n_eval));
+                data.t_start_ms  = 1e-3 * p->t_start_us;
+                data.t_load_ms   = 1e-3 * p->t_load_us;;
+                data.t_p_eval_ms = 1e-3 * p->t_p_eval_us;
+                data.t_eval_ms   = 1e-3 * p->t_eval_us;
+                data.n_p_eval    = std::max(1, p->n_p_eval);
+                data.n_eval      = std::max(1, p->n_eval);
             } break;
         case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
             {
-                const auto * smpl = (const struct llama_sampler *) ctx;
-                const auto * p = (const struct llama_sampler_chain *) smpl->ctx;
+                const auto * smpl = (const struct llama_sampler       *) ctx;
+                const auto * p    = (const struct llama_sampler_chain *) smpl->ctx;
 
-                const double t_sampler_ms = 1e-3 * p->t_sample_us;
+                data.t_sample_ms = 1e-3 * p->t_sample_us;
+                data.n_sample    = std::max(0, p->n_sample);
+            } break;
+        default:
+            GGML_ABORT("invalid perf type");
+    }
 
-                const int32_t n_sampler = std::max(0, p->n_sample);
+    return data;
+}
+
+void llama_perf_print(const void * ctx, enum llama_perf_type type) {
+    switch (type) {
+        case LLAMA_PERF_TYPE_CONTEXT:
+            {
+                const auto data = llama_perf_get(ctx, type);
+
+                const double t_end_ms = 1e-3 * ggml_time_us();
+
+                LLAMA_LOG_INFO("%s:        load time = %10.2f ms\n", __func__, data.t_load_ms);
+                LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+                        __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
+                LLAMA_LOG_INFO("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+                        __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
+                LLAMA_LOG_INFO("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
+            } break;
+        case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
+            {
+                const auto data = llama_perf_get(ctx, type);
 
                 LLAMA_LOG_INFO("%s:    sampling time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-                        __func__, t_sampler_ms, n_sampler, t_sampler_ms / n_sampler, 1e3 / t_sampler_ms * n_sampler);
+                        __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample);
             } break;
         default:
             GGML_ABORT("invalid perf type");
@@ -20729,7 +20759,7 @@ void llama_perf_reset(void * ctx, enum llama_perf_type type) {
         case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
             {
                 auto * smpl = (struct llama_sampler *) ctx;
-                auto * p = (struct llama_sampler_chain *) smpl->ctx;
+                auto * p    = (struct llama_sampler_chain *) smpl->ctx;
 
                 p->t_sample_us = p->n_sample = 0;
             } break;
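
Usage sketch (not part of the patch itself): the snippet below illustrates how an application might consume the llama_perf_get() API and the no_perf context flag introduced above. It assumes the include/llama.h declarations from this commit; the model path and the omitted decode loop are placeholders, error handling is minimal, and the surrounding calls (llama_backend_init, llama_load_model_from_file, llama_new_context_with_model, llama_free, llama_free_model) are the ones available in llama.cpp at the time of this patch.

// Sketch: read the timing counters programmatically via llama_perf_get()
// instead of relying on llama_perf_print(). "model.gguf" is a placeholder path.
#include "llama.h"

#include <cstdio>

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == nullptr) {
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    // this patch defaults no_perf to true for plain library users,
    // so timing collection has to be enabled explicitly here
    cparams.no_perf = false;

    llama_context * ctx = llama_new_context_with_model(model, cparams);

    // ... tokenize a prompt and call llama_decode() here as usual ...

    // fields t_p_eval_ms / n_p_eval / t_eval_ms / n_eval come from llama_perf_data above
    const llama_perf_data perf = llama_perf_get(ctx, LLAMA_PERF_TYPE_CONTEXT);

    printf("prompt eval: %.2f ms over %d tokens\n", perf.t_p_eval_ms, (int) perf.n_p_eval);
    printf("       eval: %.2f ms over %d tokens\n", perf.t_eval_ms,   (int) perf.n_eval);

    llama_perf_reset(ctx, LLAMA_PERF_TYPE_CONTEXT);

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();

    return 0;
}

Note the split in defaults introduced by the patch: llama_context_default_params() sets no_perf = true (timings off unless a library user opts in), while gpt_params in common/ keeps no_perf = false so the bundled examples continue to collect and print timings through llama_perf_print().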