common: allow the warmup to be disabled in llama_init_from_gpt_params

Pierrick HYMBERT 2024-04-10 22:42:04 +02:00
parent 52a8e0640a
commit 831c97efc7
4 changed files with 4 additions and 1 deletion

common/common.cpp

@@ -2194,7 +2194,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
     }
 
-    {
+    if (params.warmup) {
         LOG("warming up the model with an empty run\n");
 
         std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };

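For context, the block that `params.warmup` now guards performs a throwaway decode so that the first real evaluation does not pay for lazy weight loading and buffer allocation. Inside llama_init_from_gpt_params, where `model` and `lctx` are the freshly created model and context, it roughly does the following (a sketch based on the surrounding llama.cpp code of that era, not an exact copy of the function):

    if (params.warmup) {
        LOG("warming up the model with an empty run\n");

        // decode a dummy BOS/EOS batch, then discard its side effects
        std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
        llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
        llama_kv_cache_clear(lctx);
        llama_reset_timings(lctx);
    }
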
common/common.h

@@ -159,6 +159,7 @@ struct gpt_params {
     bool infill          = false; // use infill mode
     bool dump_kv_cache   = false; // dump the KV cache contents for debugging purposes
     bool no_kv_offload   = false; // disable KV offloading
+    bool warmup          = true;  // warmup run
 
     std::string cache_type_k = "f16"; // KV cache data type for the K
     std::string cache_type_v = "f16"; // KV cache data type for the V

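Since the new field defaults to true, existing tools keep the warmup unchanged; a tool that wants to skip it simply clears the flag before calling llama_init_from_gpt_params. A minimal usage sketch, assuming the usual common.h init flow (this program is illustrative, not part of the commit):

    #include "common.h"
    #include "llama.h"

    int main(int argc, char ** argv) {
        gpt_params params;
        if (!gpt_params_parse(argc, argv, params)) {
            return 1;
        }

        llama_backend_init();

        params.warmup = false; // skip the empty warmup decode during init

        llama_model   * model;
        llama_context * ctx;
        std::tie(model, ctx) = llama_init_from_gpt_params(params);
        if (model == nullptr) {
            return 1;
        }

        // ... run evaluations ...

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }

The two example hunks below (eval-callback and imatrix) do exactly this, since their cb_eval callback would otherwise also fire on the warmup decode and pollute the collected data.
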
examples/eval-callback/eval-callback.cpp

@@ -136,6 +136,7 @@ int main(int argc, char ** argv) {
     // it will be executed for each node during the graph computation
     params.cb_eval = ggml_debug;
     params.cb_eval_user_data = &cb_data;
+    params.warmup = false;
 
     // init
     llama_model * model;

examples/imatrix/imatrix.cpp

@@ -601,6 +601,7 @@ int main(int argc, char ** argv) {
     // it will be executed for each node during the graph computation
     params.cb_eval = ik_collect_imatrix;
     params.cb_eval_user_data = NULL;
+    params.warmup = false;
 
     // init
    llama_model * model;