From 96f6dcdeaec2f018161d82f1a22160698867b07c Mon Sep 17 00:00:00 2001
From: slaren
Date: Thu, 21 Sep 2023 21:41:05 +0200
Subject: [PATCH] llama-bench : keep the same model between tests when possible

---
 examples/llama-bench/llama-bench.cpp | 84 +++++++++++++++++++++++++---
 ggml-cuda.cu                         |  3 +
 2 files changed, 78 insertions(+), 9 deletions(-)

diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index 0bd34aa1a..2fed36ef9 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -367,6 +367,13 @@ struct cmd_params_instance {
         return mparams;
     }
 
+    bool equal_mparams(const cmd_params_instance & other) const {
+        return n_gpu_layers == other.n_gpu_layers &&
+               main_gpu == other.main_gpu &&
+               low_vram == other.low_vram &&
+               tensor_split == other.tensor_split;
+    }
+
     llama_context_params to_llama_cparams() const {
         llama_context_params cparams = llama_context_default_params();
 
@@ -384,13 +391,13 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
     std::vector<cmd_params_instance> instances;
 
     for (const auto & m : params.model)
-    for (const auto & nb : params.n_batch)
-    for (const auto & fk : params.f32_kv)
     for (const auto & nl : params.n_gpu_layers)
     for (const auto & mg : params.main_gpu)
-    for (const auto & mmq : params.mul_mat_q)
     for (const auto & lv : params.low_vram)
     for (const auto & ts : params.tensor_split)
+    for (const auto & nb : params.n_batch)
+    for (const auto & fk : params.f32_kv)
+    for (const auto & mmq : params.mul_mat_q)
     for (const auto & nt : params.n_threads) {
         cmd_params_instance instance = {
             /* .model        = */ m,
@@ -413,6 +420,53 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
 static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_params & params) {
     std::vector<cmd_params_instance> instances;
 
+#if 1
+    // this ordering minimizes the number of times that each model needs to be reloaded
+    for (const auto & m : params.model)
+    for (const auto & nl : params.n_gpu_layers)
+    for (const auto & mg : params.main_gpu)
+    for (const auto & lv : params.low_vram)
+    for (const auto & ts : params.tensor_split)
+    for (const auto & nb : params.n_batch)
+    for (const auto & fk : params.f32_kv)
+    for (const auto & mmq : params.mul_mat_q)
+    for (const auto & nt : params.n_threads) {
+        for (const auto & n_prompt : params.n_prompt) {
+            cmd_params_instance instance = {
+                /* .model        = */ m,
+                /* .n_prompt     = */ n_prompt,
+                /* .n_gen        = */ 0,
+                /* .n_batch      = */ nb,
+                /* .f32_kv       = */ fk,
+                /* .n_threads    = */ nt,
+                /* .n_gpu_layers = */ nl,
+                /* .main_gpu     = */ mg,
+                /* .mul_mat_q    = */ mmq,
+                /* .low_vram     = */ lv,
+                /* .tensor_split = */ ts,
+            };
+            instances.push_back(instance);
+        }
+
+        for (const auto & n_gen : params.n_gen) {
+            cmd_params_instance instance = {
+                /* .model        = */ m,
+                /* .n_prompt     = */ 0,
+                /* .n_gen        = */ n_gen,
+                /* .n_batch      = */ nb,
+                /* .f32_kv       = */ fk,
+                /* .n_threads    = */ nt,
+                /* .n_gpu_layers = */ nl,
+                /* .main_gpu     = */ mg,
+                /* .mul_mat_q    = */ mmq,
+                /* .low_vram     = */ lv,
+                /* .tensor_split = */ ts,
+            };
+            instances.push_back(instance);
+        }
+    }
+#else
+    // this ordering separates the prompt and generation tests
     for (const auto & n_prompt : params.n_prompt) {
         if (n_prompt == 0) {
             continue;
@@ -428,6 +482,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
         auto instances_gen = get_cmd_params_instances_int(params, n_gen, 0);
         instances.insert(instances.end(), instances_gen.begin(), instances_gen.end());
     }
+#endif
 
     return instances;
 }
@@ -967,12 +1022,22 @@ int main(int argc, char ** argv) {
 
     std::vector<cmd_params_instance> params_instances = get_cmd_params_instances(params);
 
+    llama_model * lmodel = nullptr;
+    const cmd_params_instance * prev_inst = nullptr;
+
     for (const auto & inst : params_instances) {
-        // TODO: keep the model between tests when possible
-        llama_model * lmodel = llama_load_model_from_file(inst.model.c_str(), inst.to_llama_mparams());
-        if (lmodel == NULL) {
-            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
-            return 1;
+        // keep the same model between tests when possible
+        if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
+            if (lmodel) {
+                llama_free_model(lmodel);
+            }
+
+            lmodel = llama_load_model_from_file(inst.model.c_str(), inst.to_llama_mparams());
+            if (lmodel == NULL) {
+                fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
+                return 1;
+            }
+            prev_inst = &inst;
         }
 
         llama_context * ctx = llama_new_context_with_model(lmodel, inst.to_llama_cparams());
@@ -1009,9 +1074,10 @@ int main(int argc, char ** argv) {
         llama_print_timings(ctx);
 
         llama_free(ctx);
-        llama_free_model(lmodel);
     }
 
+    llama_free_model(lmodel);
+
     p->print_footer();
 
     llama_backend_free();
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 08428ea3f..f9d8bcc57 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -7075,6 +7075,9 @@ void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
 }
 
 void ggml_cuda_set_scratch_size(const size_t scratch_size) {
+    if (scratch_size > g_scratch_size) {
+        ggml_cuda_free_scratch();
+    }
     g_scratch_size = scratch_size;
 }
 
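
For illustration, the reuse logic introduced in main() can be sketched independently of llama.cpp. In the sketch below, ModelParams, Model and load_model are hypothetical stand-ins, not llama.cpp API; only the structure mirrors the patch: iterate the test instances with equal load-time parameters adjacent to each other, and reload the model only when those parameters differ from the previous instance.

    #include <cstdio>
    #include <memory>
    #include <string>
    #include <vector>

    // Hypothetical stand-ins for the benchmark's model type and loader (not llama.cpp API).
    struct ModelParams {
        std::string path;
        int         n_gpu_layers;

        bool operator==(const ModelParams & other) const {
            return path == other.path && n_gpu_layers == other.n_gpu_layers;
        }
    };

    struct Model {
        ModelParams params;
    };

    static std::unique_ptr<Model> load_model(const ModelParams & params) {
        std::printf("loading '%s' with %d GPU layers\n", params.path.c_str(), params.n_gpu_layers);
        return std::make_unique<Model>(Model{params});
    }

    int main() {
        // Instances ordered so that equal load-time parameters are adjacent,
        // mirroring the loop reordering in get_cmd_params_instances().
        std::vector<ModelParams> instances = {
            {"model-a.gguf", 99}, {"model-a.gguf", 99}, {"model-a.gguf", 0}, {"model-b.gguf", 99},
        };

        std::unique_ptr<Model> model;
        const ModelParams *    prev = nullptr;

        for (const auto & inst : instances) {
            // Reload only when the load-time parameters changed, cf. equal_mparams().
            if (!model || !prev || !(inst == *prev)) {
                model = load_model(inst); // the previous model is freed by unique_ptr
                prev  = &inst;
            }
            // ... run the prompt/generation test against *model here ...
        }
        return 0;
    }

With this toy input the first two instances share a single load, so the model is loaded three times instead of four; in llama-bench the same effect comes from keeping the model/GPU parameters in the outer loops and the per-test parameters (batch size, KV type, mul_mat_q, threads, n_prompt, n_gen) in the inner ones.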