llama-bench : keep the same model between tests when possible

slaren 2023-09-21 21:41:05 +02:00
parent f28e4953a8
commit 96f6dcdeae
2 changed files with 78 additions and 9 deletions

examples/llama-bench/llama-bench.cpp

@@ -367,6 +367,13 @@ struct cmd_params_instance {
         return mparams;
     }

+    bool equal_mparams(const cmd_params_instance & other) const {
+        return n_gpu_layers == other.n_gpu_layers &&
+               main_gpu == other.main_gpu &&
+               low_vram == other.low_vram &&
+               tensor_split == other.tensor_split;
+    }
+
     llama_context_params to_llama_cparams() const {
         llama_context_params cparams = llama_context_default_params();
@@ -384,13 +391,13 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
     std::vector<cmd_params_instance> instances;

     for (const auto & m : params.model)
-    for (const auto & nb : params.n_batch)
-    for (const auto & fk : params.f32_kv)
     for (const auto & nl : params.n_gpu_layers)
     for (const auto & mg : params.main_gpu)
-    for (const auto & mmq : params.mul_mat_q)
     for (const auto & lv : params.low_vram)
     for (const auto & ts : params.tensor_split)
+    for (const auto & nb : params.n_batch)
+    for (const auto & fk : params.f32_kv)
+    for (const auto & mmq : params.mul_mat_q)
     for (const auto & nt : params.n_threads) {
         cmd_params_instance instance = {
             /* .model = */ m,
@@ -413,6 +420,53 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
 static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_params & params) {
     std::vector<cmd_params_instance> instances;

+#if 1
+    // this ordering minimizes the number of times that each model needs to be reloaded
+    for (const auto & m : params.model)
+    for (const auto & nl : params.n_gpu_layers)
+    for (const auto & mg : params.main_gpu)
+    for (const auto & lv : params.low_vram)
+    for (const auto & ts : params.tensor_split)
+    for (const auto & nb : params.n_batch)
+    for (const auto & fk : params.f32_kv)
+    for (const auto & mmq : params.mul_mat_q)
+    for (const auto & nt : params.n_threads) {
+        for (const auto & n_prompt : params.n_prompt) {
+            cmd_params_instance instance = {
+                /* .model = */ m,
+                /* .n_prompt = */ n_prompt,
+                /* .n_gen = */ 0,
+                /* .n_batch = */ nb,
+                /* .f32_kv = */ fk,
+                /* .n_threads = */ nt,
+                /* .n_gpu_layers = */ nl,
+                /* .main_gpu = */ mg,
+                /* .mul_mat_q = */ mmq,
+                /* .low_vram = */ lv,
+                /* .tensor_split = */ ts,
+            };
+            instances.push_back(instance);
+        }
+        for (const auto & n_gen : params.n_gen) {
+            cmd_params_instance instance = {
+                /* .model = */ m,
+                /* .n_prompt = */ 0,
+                /* .n_gen = */ n_gen,
+                /* .n_batch = */ nb,
+                /* .f32_kv = */ fk,
+                /* .n_threads = */ nt,
+                /* .n_gpu_layers = */ nl,
+                /* .main_gpu = */ mg,
+                /* .mul_mat_q = */ mmq,
+                /* .low_vram = */ lv,
+                /* .tensor_split = */ ts,
+            };
+            instances.push_back(instance);
+        }
+    }
+#else
+    // this ordering separates the prompt and generation tests
     for (const auto & n_prompt : params.n_prompt) {
         if (n_prompt == 0) {
             continue;
@@ -428,6 +482,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
         auto instances_gen = get_cmd_params_instances_int(params, n_gen, 0);
         instances.insert(instances.end(), instances_gen.begin(), instances_gen.end());
     }
+#endif

     return instances;
 }
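
The #if 1 branch above is the core of the change: the loops over model-level settings (model, n_gpu_layers, main_gpu, low_vram, tensor_split) are now outermost, so all test instances that share those settings come out adjacent, and the model only has to be reloaded when one of them changes. A small standalone sketch of the effect, using an illustrative Instance type rather than the actual llama-bench structs:

// reload_order_sketch.cpp - illustrative only; counts how many model loads two
// enumeration orders would cause when reloads happen on model-parameter changes.
#include <cstdio>
#include <string>
#include <vector>

struct Instance {
    std::string model;
    int n_gpu_layers;   // model-level: changing it requires a reload
    int n_batch;        // context-level: a new context on the same model is enough
    bool equal_mparams(const Instance & other) const {
        return model == other.model && n_gpu_layers == other.n_gpu_layers;
    }
};

static int count_reloads(const std::vector<Instance> & insts) {
    const Instance * prev = nullptr;
    int loads = 0;
    for (const auto & inst : insts) {
        if (!prev || !inst.equal_mparams(*prev)) {
            loads++;            // stands in for llama_load_model_from_file
        }
        prev = &inst;
    }
    return loads;
}

int main() {
    // model-level parameter (n_gpu_layers) outermost -> same values are adjacent
    std::vector<Instance> grouped = {
        {"7b.gguf", 99, 256}, {"7b.gguf", 99, 512},
        {"7b.gguf",  0, 256}, {"7b.gguf",  0, 512},
    };
    // context-level parameter (n_batch) outermost -> model params alternate
    std::vector<Instance> interleaved = {
        {"7b.gguf", 99, 256}, {"7b.gguf", 0, 256},
        {"7b.gguf", 99, 512}, {"7b.gguf", 0, 512},
    };
    std::printf("grouped: %d reloads, interleaved: %d reloads\n",
                count_reloads(grouped), count_reloads(interleaved));
    return 0;
}

With the grouped ordering the model is loaded once per distinct set of model parameters (2 loads here); interleaving a context-level setting in between would force a reload on every test (4 loads).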
@@ -967,12 +1022,22 @@ int main(int argc, char ** argv) {
     std::vector<cmd_params_instance> params_instances = get_cmd_params_instances(params);

+    llama_model * lmodel = nullptr;
+    const cmd_params_instance * prev_inst = nullptr;
+
     for (const auto & inst : params_instances) {
-        // TODO: keep the model between tests when possible
-        llama_model * lmodel = llama_load_model_from_file(inst.model.c_str(), inst.to_llama_mparams());
-        if (lmodel == NULL) {
-            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
-            return 1;
+        // keep the same model between tests when possible
+        if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
+            if (lmodel) {
+                llama_free_model(lmodel);
+            }
+
+            lmodel = llama_load_model_from_file(inst.model.c_str(), inst.to_llama_mparams());
+            if (lmodel == NULL) {
+                fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
+                return 1;
+            }
+            prev_inst = &inst;
         }

         llama_context * ctx = llama_new_context_with_model(lmodel, inst.to_llama_cparams());
@@ -1009,9 +1074,10 @@ int main(int argc, char ** argv) {
         llama_print_timings(ctx);

         llama_free(ctx);
-        llama_free_model(lmodel);
     }

+    llama_free_model(lmodel);
+
     p->print_footer();

     llama_backend_free();

ggml-cuda.cu

@@ -7075,6 +7075,9 @@ void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
 }

 void ggml_cuda_set_scratch_size(const size_t scratch_size) {
+    if (scratch_size > g_scratch_size) {
+        ggml_cuda_free_scratch();
+    }
     g_scratch_size = scratch_size;
 }
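
The ggml-cuda change is a small companion fix: the scratch buffer is allocated lazily on first use, so once the loaded model and its backend state survive across tests, a later test that requests a larger scratch size must free the old buffer so that the next allocation happens at the new size. A rough sketch of that lazy-allocation pattern, with illustrative globals and function names rather than the real ggml-cuda internals:

// scratch_sketch.cu - illustrative only; not the actual ggml-cuda implementation.
#include <cuda_runtime.h>
#include <cstddef>

static void * g_scratch_buf  = nullptr;
static size_t g_scratch_sz   = 0;

static void scratch_free() {
    if (g_scratch_buf != nullptr) {
        cudaFree(g_scratch_buf);
        g_scratch_buf = nullptr;
    }
}

// Mirrors the fix: a growing size request invalidates the current buffer so
// that the next scratch_get() reallocates it at the new, larger size.
static void scratch_set_size(size_t size) {
    if (size > g_scratch_sz) {
        scratch_free();
    }
    g_scratch_sz = size;
}

// Lazy allocation on first use, at whatever size was last requested.
static void * scratch_get() {
    if (g_scratch_buf == nullptr && g_scratch_sz > 0) {
        cudaMalloc(&g_scratch_buf, g_scratch_sz);
    }
    return g_scratch_buf;
}

A shrinking request leaves the existing buffer in place, which is harmless; only a growing request has to invalidate it.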