llama.cpp : fix MEM_REQ_SCRATCH0 reusing the value of n_ctx of the first call

slaren 2023-08-16 22:40:53 +02:00
parent 5765f90f58
commit 89a70f78e7
2 changed files with 12 additions and 11 deletions
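
The underlying C++ pitfall: a function-local static is initialized only once, on the first call, so MEM_REQ_SCRATCH0's n_ctx-dependent scratch sizes stayed pinned to whatever n_ctx the first caller passed. Below is a minimal standalone sketch of the pattern and of the fix this commit applies (return by value instead of a reference to a static local); names, keys, and byte counts are simplified stand-ins, not the actual llama.cpp tables.

#include <cstdio>
#include <map>
#include <string>

// Buggy shape: the static local is built once, on the first call, so every
// later call with a different n_ctx silently reuses the first call's sizes.
static const std::map<std::string, size_t> & scratch_req_buggy(int n_ctx) {
    static std::map<std::string, size_t> k_sizes = {
        { "7B", (size_t) n_ctx / 16 + 100 },
    };
    return k_sizes;
}

// Fixed shape (the same change this commit makes to MEM_REQ_SCRATCH0):
// drop the static and the reference, return the map by value so each call
// recomputes the sizes from its own n_ctx.
static std::map<std::string, size_t> scratch_req_fixed(int n_ctx) {
    std::map<std::string, size_t> k_sizes = {
        { "7B", (size_t) n_ctx / 16 + 100 },
    };
    return k_sizes;
}

int main() {
    size_t a = scratch_req_buggy(512).at("7B");   // first call latches n_ctx = 512
    size_t b = scratch_req_buggy(4096).at("7B");  // static map is not rebuilt
    printf("buggy: %zu %zu\n", a, b);             // buggy: 132 132

    size_t c = scratch_req_fixed(512).at("7B");
    size_t d = scratch_req_fixed(4096).at("7B");
    printf("fixed: %zu %zu\n", c, d);             // fixed: 132 356
    return 0;
}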


@@ -477,7 +477,7 @@ const bool backend_params::blas = !!ggml_cpu_has_blas();
 // benchmark params
 struct bench_params {
-    int n_prompt ;
+    int n_prompt;
     int n_gen;
     static const std::vector<std::string> & get_fields() {
@@ -630,17 +630,18 @@ struct markdown_printer : public printer {
     virtual void print_header(const cmd_params & params) {
         fields = { "model", "backend" };
-        if (backend_params::get_backend() != "CPU") {
+        bool is_cpu_backend = backend_params::get_backend() == "CPU" || backend_params::get_backend() == "BLAS";
+        if (!is_cpu_backend) {
             fields.push_back("n_gpu_layers");
         }
+        if (params.n_threads.size() > 1 || is_cpu_backend) {
+            fields.push_back("n_threads");
+        }
         if (params.n_batch.size() > 1) {
             fields.push_back("n_batch");
         }
-        if (params.n_threads.size() > 1 || backend_params::get_backend() == "CPU") {
-            fields.push_back("n_threads");
-        }
         if (params.f32_kv.size() > 1) {
-            fields.push_back("f32_kv");
+            fields.push_back("f16_kv");
         }
         if (params.main_gpu.size() > 1) {
             fields.push_back("main_gpu");
@@ -723,9 +724,9 @@ void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int
     std::vector<llama_token> tokens(n_batch, llama_token_bos());
     int n_processed = 0;
     while (n_processed < n_prompt) {
-        int n = std::min(n_prompt - n_processed, n_batch);
-        llama_eval(ctx, tokens.data(), n, n_past + n_processed, n_threads);
-        n_processed += n;
+        int n_tokens = std::min(n_prompt - n_processed, n_batch);
+        llama_eval(ctx, tokens.data(), n_tokens, n_past + n_processed, n_threads);
+        n_processed += n_tokens;
     }
 }


@@ -115,9 +115,9 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 // memory sizes (calculated for n_batch == 512)
 //
-static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
+static std::map<e_model, size_t> MEM_REQ_SCRATCH0(int n_ctx)
 {
-    static std::map<e_model, size_t> k_sizes = {
+    std::map<e_model, size_t> k_sizes = {
         { MODEL_3B,  ((size_t) n_ctx / 16ull +  92ull) * MB },
         { MODEL_7B,  ((size_t) n_ctx / 16ull + 100ull) * MB },
         { MODEL_13B, ((size_t) n_ctx / 12ull + 120ull) * MB },