common: allow the warmup to be disabled in llama_init_from_gpt_params

Pierrick HYMBERT 2024-04-10 22:42:04 +02:00
parent 52a8e0640a
commit 831c97efc7
4 changed files with 4 additions and 1 deletion

common/common.cpp

@@ -2194,7 +2194,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
     }
 
-    {
+    if (params.warmup) {
         LOG("warming up the model with an empty run\n");
 
         std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };

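For context, the block that `params.warmup` now guards performs a throwaway decode so that the first real evaluation does not pay for lazy weight loading and buffer allocation. Inside llama_init_from_gpt_params, where `model` and `lctx` are the freshly created model and context, it roughly does the following (a sketch based on the surrounding llama.cpp code of that era, not an exact copy of the function):

    if (params.warmup) {
        LOG("warming up the model with an empty run\n");

        // decode a dummy BOS/EOS batch, then discard its side effects
        std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
        llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
        llama_kv_cache_clear(lctx);
        llama_reset_timings(lctx);
    }
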
common/common.h

@@ -159,6 +159,7 @@ struct gpt_params {
     bool infill          = false; // use infill mode
     bool dump_kv_cache   = false; // dump the KV cache contents for debugging purposes
     bool no_kv_offload   = false; // disable KV offloading
+    bool warmup          = true;  // warmup run
 
     std::string cache_type_k = "f16"; // KV cache data type for the K
     std::string cache_type_v = "f16"; // KV cache data type for the V

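Since the new field defaults to true, existing tools keep the warmup unchanged; a tool that wants to skip it simply clears the flag before calling llama_init_from_gpt_params. A minimal usage sketch, assuming the usual common.h init flow (this program is illustrative, not part of the commit):

    #include "common.h"
    #include "llama.h"

    int main(int argc, char ** argv) {
        gpt_params params;
        if (!gpt_params_parse(argc, argv, params)) {
            return 1;
        }

        llama_backend_init();

        params.warmup = false; // skip the empty warmup decode during init

        llama_model   * model;
        llama_context * ctx;
        std::tie(model, ctx) = llama_init_from_gpt_params(params);
        if (model == nullptr) {
            return 1;
        }

        // ... run evaluations ...

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }

The two example hunks below (eval-callback and imatrix) do exactly this, since their cb_eval callback would otherwise also fire on the warmup decode and pollute the collected data.
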
examples/eval-callback/eval-callback.cpp

@@ -136,6 +136,7 @@ int main(int argc, char ** argv) {
     // it will be executed for each node during the graph computation
     params.cb_eval = ggml_debug;
     params.cb_eval_user_data = &cb_data;
+    params.warmup = false;
 
     // init
     llama_model * model;

examples/imatrix/imatrix.cpp

@@ -601,6 +601,7 @@ int main(int argc, char ** argv) {
     // it will be executed for each node during the graph computation
     params.cb_eval = ik_collect_imatrix;
     params.cb_eval_user_data = NULL;
+    params.warmup = false;
 
     // init
    llama_model * model;