diff --git a/common/common.cpp b/common/common.cpp
index f3001b41f..dda514785 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2194,7 +2194,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
     }
 
-    {
+    if (params.warmup) {
         LOG("warming up the model with an empty run\n");
 
         std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
diff --git a/common/common.h b/common/common.h
index dafc7b1ce..65272b0ba 100644
--- a/common/common.h
+++ b/common/common.h
@@ -159,6 +159,7 @@ struct gpt_params {
     bool infill = false; // use infill mode
     bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
     bool no_kv_offload = false; // disable KV offloading
+    bool warmup = true; // warmup run
 
     std::string cache_type_k = "f16"; // KV cache data type for the K
     std::string cache_type_v = "f16"; // KV cache data type for the V
diff --git a/examples/ggml-debug/ggml-debug.cpp b/examples/ggml-debug/ggml-debug.cpp
index 6d7bd2129..aa2df8c64 100644
--- a/examples/ggml-debug/ggml-debug.cpp
+++ b/examples/ggml-debug/ggml-debug.cpp
@@ -136,6 +136,7 @@ int main(int argc, char ** argv) {
     // it will be executed for each node during the graph computation
     params.cb_eval = ggml_debug;
     params.cb_eval_user_data = &cb_data;
+    params.warmup = false;
 
     // init
     llama_model * model;
diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
index 10c0f08a2..ff624c539 100644
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -601,6 +601,7 @@ int main(int argc, char ** argv) {
     // it will be executed for each node during the graph computation
     params.cb_eval = ik_collect_imatrix;
     params.cb_eval_user_data = NULL;
+    params.warmup = false;
 
     // init
     llama_model * model;
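
For context, any other consumer of gpt_params can opt out of the warmup pass the same way the two examples above do: clear the new flag before calling llama_init_from_gpt_params. Below is a minimal sketch, not code from this patch; it assumes the usual example boilerplate (gpt_params_parse for argument handling) and omits backend init/teardown:

#include <tuple>

#include "common.h"
#include "llama.h"

int main(int argc, char ** argv) {
    gpt_params params;
    if (!gpt_params_parse(argc, argv, params)) {
        return 1;
    }

    // skip the empty warmup decode so that only the real evaluations
    // reach any eval callback or other instrumentation
    params.warmup = false;

    // init (same pattern as the examples above)
    llama_model * model;
    llama_context * ctx;
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
    if (model == NULL) {
        return 1;
    }

    // ... run the actual workload ...

    llama_free(ctx);
    llama_free_model(model);
    return 0;
}

Since warmup defaults to true in gpt_params, existing tools keep the current behavior unless they explicitly disable it, as ggml-debug and imatrix now do.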