llama : llama_perf + option to disable timings during decode (#9355)
* llama : llama_perf + option to disable timings during decode (ggml-ci)
* common : add llama_arg
* Update src/llama.cpp (Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>)
* perf : separate functions in the API (ggml-ci)
* perf : safer pointer handling + naming update (ggml-ci)
* minor : better local var name
* perf : abort on invalid sampler pointer (ggml-ci)

Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>
parent: bd35cb0ae3
commit: 0abc6a2c25

23 changed files with 135 additions and 91 deletions
@@ -720,6 +720,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.prompt = value;
         }
     ));
+    add_opt(llama_arg(
+        {"--no-perf"},
+        format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
+        [](gpt_params & params) {
+            params.no_perf = true;
+            params.sparams.no_perf = true;
+        }
+    ).set_env("LLAMA_ARG_NO_PERF"));
     add_opt(llama_arg(
         {"-f", "--file"}, "FNAME",
         "a file containing the prompt (default: none)",
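The handler only flips two flags, so the same effect can be had programmatically. A minimal sketch (the no_perf field names come from the hunk above; everything else is illustrative, not part of this commit):

    // equivalent to passing --no-perf (or setting LLAMA_ARG_NO_PERF) on the command line
    gpt_params params;
    params.no_perf         = true; // disable libllama context/decode timings
    params.sparams.no_perf = true; // disable sampler-chain timings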
@@ -820,7 +820,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         }
         llama_kv_cache_clear(lctx);
         llama_synchronize(lctx);
-        llama_perf_reset(lctx, LLAMA_PERF_TYPE_CONTEXT);
+        llama_perf_context_reset(lctx);
     }

     iparams.model = model;
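This hunk is the warm-up path: the KV cache is cleared and the renamed llama_perf_context_reset() drops the warm-up timings so they do not pollute later reports. A sketch of the same pattern in application code, assuming a valid llama_context * ctx after a throwaway warm-up decode:

    // start performance accounting from zero after warm-up
    llama_kv_cache_clear(ctx);     // forget the warm-up tokens
    llama_synchronize(ctx);        // wait for pending backend work to finish
    llama_perf_context_reset(ctx); // reset the context's timing counters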
@@ -916,6 +916,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.cb_eval_user_data = params.cb_eval_user_data;
     cparams.offload_kqv = !params.no_kv_offload;
     cparams.flash_attn = params.flash_attn;
+    cparams.no_perf = params.no_perf;

     cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
     cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
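Applications that build llama_context_params themselves get the same switch. A minimal sketch, assuming the usual llama.h entry points (llama_context_default_params, llama_new_context_with_model) and an already-loaded llama_model * model:

    llama_context_params cparams = llama_context_default_params();
    cparams.no_perf = true; // do not collect per-decode timing statistics in this context
    llama_context * ctx = llama_new_context_with_model(model, cparams);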
@@ -124,6 +124,7 @@ struct gpt_sampler_params {
     float mirostat_eta = 0.10f; // learning rate
     bool penalize_nl = false; // consider newlines as a repeatable token
     bool ignore_eos = false;
+    bool no_perf = false; // disable performance metrics

     std::vector<enum gpt_sampler_type> samplers = {
         GPT_SAMPLER_TYPE_TOP_K,
@@ -246,6 +247,7 @@ struct gpt_params {
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
     bool flash_attn = false; // flash attention
+    bool no_perf = false; // disable performance metrics

     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
     bool logits_all = false; // return logits for all tokens in the batch
@@ -142,7 +142,7 @@ std::string gpt_sampler_params::print() const {
 struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
     llama_sampler_chain_params lparams = llama_sampler_chain_default_params();

-    lparams.no_perf = false; // TODO: control via params
+    lparams.no_perf = params.no_perf;

     auto * result = new gpt_sampler {
         /* .params = */ params,
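The sampler side mirrors the context side: the chain's no_perf flag now comes from gpt_sampler_params instead of being hard-coded. A sketch of setting it when building a chain directly through the llama.h sampling API (llama_sampler_chain_default_params and llama_sampler_chain_init are assumed to be the relevant entry points):

    llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
    lparams.no_perf = true; // skip timing the individual samplers in this chain
    llama_sampler * chain = llama_sampler_chain_init(lparams);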
@@ -257,10 +257,10 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
     // TODO: measure grammar performance

     if (gsmpl) {
-        llama_perf_print(gsmpl->chain, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
+        llama_perf_sampler_print(gsmpl->chain);
     }
     if (ctx) {
-        llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
+        llama_perf_context_print(ctx);
     }
 }

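This is the "separate functions in the API" part of the commit: the single llama_perf_print(obj, LLAMA_PERF_TYPE_*) entry point is replaced by one helper per object type. A sketch of calling the new helpers directly, assuming a valid llama_context * ctx and a llama_sampler * chain:

    // print timings per component with the new type-specific helpers
    llama_perf_sampler_print(chain); // sampling timings for this sampler chain
    llama_perf_context_print(ctx);   // load / prompt-eval / eval timings for this context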