From 96f6dcdeaec2f018161d82f1a22160698867b07c Mon Sep 17 00:00:00 2001
From: slaren
Date: Thu, 21 Sep 2023 21:41:05 +0200
Subject: [PATCH] llama-bench : keep the same model between tests when possible

---
 examples/llama-bench/llama-bench.cpp | 84 +++++++++++++++++++++++++---
 ggml-cuda.cu                         |  3 +
 2 files changed, 78 insertions(+), 9 deletions(-)

diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index 0bd34aa1a..2fed36ef9 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -367,6 +367,13 @@ struct cmd_params_instance {
         return mparams;
     }
 
+    bool equal_mparams(const cmd_params_instance & other) const {
+        return n_gpu_layers == other.n_gpu_layers &&
+               main_gpu == other.main_gpu &&
+               low_vram == other.low_vram &&
+               tensor_split == other.tensor_split;
+    }
+
     llama_context_params to_llama_cparams() const {
         llama_context_params cparams = llama_context_default_params();
 
@@ -384,13 +391,13 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
     std::vector<cmd_params_instance> instances;
 
     for (const auto & m : params.model)
-    for (const auto & nb : params.n_batch)
-    for (const auto & fk : params.f32_kv)
     for (const auto & nl : params.n_gpu_layers)
     for (const auto & mg : params.main_gpu)
-    for (const auto & mmq : params.mul_mat_q)
     for (const auto & lv : params.low_vram)
     for (const auto & ts : params.tensor_split)
+    for (const auto & nb : params.n_batch)
+    for (const auto & fk : params.f32_kv)
+    for (const auto & mmq : params.mul_mat_q)
     for (const auto & nt : params.n_threads) {
         cmd_params_instance instance = {
             /* .model        = */ m,
@@ -413,6 +420,53 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
 static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_params & params) {
     std::vector<cmd_params_instance> instances;
 
+#if 1
+    // this ordering minimizes the number of times that each model needs to be reloaded
+    for (const auto & m : params.model)
+    for (const auto & nl : params.n_gpu_layers)
+    for (const auto & mg : params.main_gpu)
+    for (const auto & lv : params.low_vram)
+    for (const auto & ts : params.tensor_split)
+    for (const auto & nb : params.n_batch)
+    for (const auto & fk : params.f32_kv)
+    for (const auto & mmq : params.mul_mat_q)
+    for (const auto & nt : params.n_threads) {
+        for (const auto & n_prompt : params.n_prompt) {
+            cmd_params_instance instance = {
+                /* .model        = */ m,
+                /* .n_prompt     = */ n_prompt,
+                /* .n_gen        = */ 0,
+                /* .n_batch      = */ nb,
+                /* .f32_kv       = */ fk,
+                /* .n_threads    = */ nt,
+                /* .n_gpu_layers = */ nl,
+                /* .main_gpu     = */ mg,
+                /* .mul_mat_q    = */ mmq,
+                /* .low_vram     = */ lv,
+                /* .tensor_split = */ ts,
+            };
+            instances.push_back(instance);
+        }
+
+        for (const auto & n_gen : params.n_gen) {
+            cmd_params_instance instance = {
+                /* .model        = */ m,
+                /* .n_prompt     = */ 0,
+                /* .n_gen        = */ n_gen,
+                /* .n_batch      = */ nb,
+                /* .f32_kv       = */ fk,
+                /* .n_threads    = */ nt,
+                /* .n_gpu_layers = */ nl,
+                /* .main_gpu     = */ mg,
+                /* .mul_mat_q    = */ mmq,
+                /* .low_vram     = */ lv,
+                /* .tensor_split = */ ts,
+            };
+            instances.push_back(instance);
+        }
+    }
+#else
+    // this ordering separates the prompt and generation tests
     for (const auto & n_prompt : params.n_prompt) {
         if (n_prompt == 0) {
             continue;
@@ -428,6 +482,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
         auto instances_gen = get_cmd_params_instances_int(params, n_gen, 0);
         instances.insert(instances.end(), instances_gen.begin(), instances_gen.end());
     }
+#endif
 
     return instances;
 }
@@ -967,12 +1022,22 @@ int main(int argc, char ** argv) {
 
     std::vector<cmd_params_instance> params_instances = get_cmd_params_instances(params);
 
+    llama_model * lmodel = nullptr;
+    const cmd_params_instance * prev_inst = nullptr;
+
     for (const auto & inst : params_instances) {
-        // TODO: keep the model between tests when possible
-        llama_model * lmodel = llama_load_model_from_file(inst.model.c_str(), inst.to_llama_mparams());
-        if (lmodel == NULL) {
-            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
-            return 1;
+        // keep the same model between tests when possible
+        if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
+            if (lmodel) {
+                llama_free_model(lmodel);
+            }
+
+            lmodel = llama_load_model_from_file(inst.model.c_str(), inst.to_llama_mparams());
+            if (lmodel == NULL) {
+                fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
+                return 1;
+            }
+            prev_inst = &inst;
         }
 
         llama_context * ctx = llama_new_context_with_model(lmodel, inst.to_llama_cparams());
@@ -1009,9 +1074,10 @@ int main(int argc, char ** argv) {
         llama_print_timings(ctx);
 
         llama_free(ctx);
-        llama_free_model(lmodel);
     }
 
+    llama_free_model(lmodel);
+
     p->print_footer();
 
     llama_backend_free();
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 08428ea3f..f9d8bcc57 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -7075,6 +7075,9 @@ void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
 }
 
 void ggml_cuda_set_scratch_size(const size_t scratch_size) {
+    if (scratch_size > g_scratch_size) {
+        ggml_cuda_free_scratch();
+    }
     g_scratch_size = scratch_size;
 }
 
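
For illustration, the reuse logic introduced in main() can be sketched independently of llama.cpp. In the sketch below, ModelParams, Model and load_model are hypothetical stand-ins, not llama.cpp API; only the structure mirrors the patch: iterate the test instances with equal load-time parameters adjacent to each other, and reload the model only when those parameters differ from the previous instance.

    #include <cstdio>
    #include <memory>
    #include <string>
    #include <vector>

    // Hypothetical stand-ins for the benchmark's model type and loader (not llama.cpp API).
    struct ModelParams {
        std::string path;
        int         n_gpu_layers;

        bool operator==(const ModelParams & other) const {
            return path == other.path && n_gpu_layers == other.n_gpu_layers;
        }
    };

    struct Model {
        ModelParams params;
    };

    static std::unique_ptr<Model> load_model(const ModelParams & params) {
        std::printf("loading '%s' with %d GPU layers\n", params.path.c_str(), params.n_gpu_layers);
        return std::make_unique<Model>(Model{params});
    }

    int main() {
        // Instances ordered so that equal load-time parameters are adjacent,
        // mirroring the loop reordering in get_cmd_params_instances().
        std::vector<ModelParams> instances = {
            {"model-a.gguf", 99}, {"model-a.gguf", 99}, {"model-a.gguf", 0}, {"model-b.gguf", 99},
        };

        std::unique_ptr<Model> model;
        const ModelParams *    prev = nullptr;

        for (const auto & inst : instances) {
            // Reload only when the load-time parameters changed, cf. equal_mparams().
            if (!model || !prev || !(inst == *prev)) {
                model = load_model(inst); // the previous model is freed by unique_ptr
                prev  = &inst;
            }
            // ... run the prompt/generation test against *model here ...
        }
        return 0;
    }

With this toy input the first two instances share a single load, so the model is loaded three times instead of four; in llama-bench the same effect comes from keeping the model/GPU parameters in the outer loops and the per-test parameters (batch size, KV type, mul_mat_q, threads, n_prompt, n_gen) in the inner ones.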