From 5545e8ff920ae15d0a17d4b0b3347da8ba7c7707 Mon Sep 17 00:00:00 2001
From: Wenjing Yu
Date: Fri, 26 Jul 2024 16:50:28 -0700
Subject: [PATCH] remove batched-bench

---
 Makefile                                 |   3 +-
 examples/CMakeLists.txt                  |   1 -
 examples/batched-bench/CMakeLists.txt    |   5 -
 examples/batched-bench/README.md         |  51 ------
 examples/batched-bench/batched-bench.cpp | 215 -----------------------
 scripts/pod-llama.sh                     |   8 -
 6 files changed, 1 insertion(+), 282 deletions(-)
 delete mode 100644 examples/batched-bench/CMakeLists.txt
 delete mode 100644 examples/batched-bench/README.md
 delete mode 100644 examples/batched-bench/batched-bench.cpp

diff --git a/Makefile b/Makefile
index 8d66ad994..72ef316ea 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,6 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
 	libllava.a \
-	llama-batched-bench \
 	llama-bench \
 	llama-benchmark-matmult \
 	llama-cli \
@@ -39,7 +38,7 @@ BUILD_TARGETS = \
 
 # Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
 LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml \
-	simple batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli \
+	simple save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli \
 	retrieval speculative infill tokenize benchmark-matmult parallel export-lora lookahead lookup passkey gritlm
 
 # Legacy build targets that were renamed in #7809, but we want to build binaries that for them that output a deprecation warning if people try to use them.
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 8604639e8..a157473c5 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -13,7 +13,6 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 if (EMSCRIPTEN)
 else()
     add_subdirectory(cvector-generator)
-    add_subdirectory(batched-bench)
     add_subdirectory(benchmark)
    add_subdirectory(convert-llama2c-to-ggml)
     add_subdirectory(embedding)
diff --git a/examples/batched-bench/CMakeLists.txt b/examples/batched-bench/CMakeLists.txt
deleted file mode 100644
index 959acaeee..000000000
--- a/examples/batched-bench/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-set(TARGET llama-batched-bench)
-add_executable(${TARGET} batched-bench.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/batched-bench/README.md b/examples/batched-bench/README.md
deleted file mode 100644
index 4a07fe6bb..000000000
--- a/examples/batched-bench/README.md
+++ /dev/null
@@ -1,51 +0,0 @@
-# llama.cpp/example/batched-bench
-
-Benchmark the batched decoding performance of `llama.cpp`
-
-## Usage
-
-There are 2 modes of operation:
-
-- `prompt not shared` - each batch has a separate prompt of size `PP` (i.e. `N_KV = B*(PP + TG)`)
-- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)
-
-```bash
-./llama-batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]
-
-# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
-./llama-batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99
-
-# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
-./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps
-
-# custom set of batches
-./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32
-```
-
-## Sample results
-
-- `PP` - prompt tokens per batch
-- `TG` - generated tokens per batch
-- `B` - number of batches
-- `N_KV` - required KV cache size
-- `T_PP` - prompt processing time (i.e. time to first token)
-- `S_PP` - prompt processing speed (`(B*PP)/T_PP` or `PP/T_PP`)
-- `T_TG` - time to generate all batches
-- `S_TG` - text generation speed (`(B*TG)/T_TG`)
-- `T` - total time
-- `S` - total speed (i.e. all tokens / total time)
-
-| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
-|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
-| 128 | 128 | 1 | 256 | 0.108 | 1186.64 | 3.079 | 41.57 | 3.187 | 80.32 |
-| 128 | 128 | 2 | 512 | 0.198 | 1295.19 | 5.029 | 50.90 | 5.227 | 97.95 |
-| 128 | 128 | 4 | 1024 | 0.373 | 1373.96 | 6.878 | 74.44 | 7.251 | 141.23 |
-| 128 | 128 | 8 | 2048 | 0.751 | 1363.27 | 7.344 | 139.43 | 8.095 | 252.99 |
-| 128 | 128 | 16 | 4096 | 1.570 | 1304.68 | 8.455 | 242.23 | 10.024 | 408.60 |
-| 128 | 128 | 32 | 8192 | 3.408 | 1201.73 | 8.801 | 465.40 | 12.209 | 670.96 |
-| 128 | 256 | 1 | 384 | 0.107 | 1196.70 | 6.329 | 40.45 | 6.436 | 59.67 |
-| 128 | 256 | 2 | 768 | 0.194 | 1317.45 | 10.239 | 50.00 | 10.433 | 73.61 |
-| 128 | 256 | 4 | 1536 | 0.366 | 1399.03 | 13.960 | 73.35 | 14.326 | 107.22 |
-| 128 | 256 | 8 | 3072 | 0.751 | 1363.92 | 15.110 | 135.54 | 15.861 | 193.69 |
-| 128 | 256 | 16 | 6144 | 1.569 | 1304.93 | 18.073 | 226.64 | 19.642 | 312.80 |
-| 128 | 256 | 32 | 12288 | 3.409 | 1201.35 | 19.223 | 426.15 | 22.633 | 542.93 |
diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp
deleted file mode 100644
index 718f0a61a..000000000
--- a/examples/batched-bench/batched-bench.cpp
+++ /dev/null
@@ -1,215 +0,0 @@
-#include "common.h"
-#include "llama.h"
-
-#include <algorithm>
-#include <cmath>
-#include <cstdio>
-#include <string>
-#include <vector>
-
-// mutates the input string
-static std::vector<int> parse_list(char * p) {
-    std::vector<int> ret;
-
-    char * q = p;
-
-    while (*p) {
-        if (*p == ',') {
-            *p = '\0';
-            ret.push_back(std::atoi(q));
-            q = p + 1;
-        }
-
-        ++p;
-    }
-
-    ret.push_back(std::atoi(q));
-
-    return ret;
-}
-
-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-    gpt_params_print_usage(argc, argv, params);
-
-    LOG_TEE("\nexample usage:\n");
-    LOG_TEE("\n    %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
-    LOG_TEE("\n");
-}
-
-int main(int argc, char ** argv) {
-    gpt_params params;
-
-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv, params);
-        return 1;
-    }
-
-    int is_pp_shared = params.is_pp_shared;
-
-    std::vector<int> n_pp = params.n_pp;
-    std::vector<int> n_tg = params.n_tg;
-    std::vector<int> n_pl = params.n_pl;
-
-    // init LLM
-
-    llama_backend_init();
-    llama_numa_init(params.numa);
-
-    // initialize the model
-
-    llama_model_params model_params = llama_model_params_from_gpt_params(params);
-
-    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
-
-    if (model == NULL) {
-        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
-        return 1;
-    }
-
-    llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
-
-    // ensure enough sequences are available
-    ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end());
-
-    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
-
-    if (ctx == NULL) {
-        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
-        return 1;
-    }
-
-    const int32_t n_kv_max = llama_n_ctx(ctx);
-
-    llama_batch batch = llama_batch_init(n_kv_max, 0, 1);
-
-    // decode in batches of ctx_params.n_batch tokens
-    auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) {
-        for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
-            const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
-
-            llama_batch batch_view = {
-                n_tokens,
-                batch.token    + i,
-                nullptr,
-                batch.pos      + i,
-                batch.n_seq_id + i,
-                batch.seq_id   + i,
-                batch.logits   + i,
-                0, 0, 0, // unused
-            };
-
-            const int ret = llama_decode(ctx, batch_view);
-            if (ret != 0) {
-                LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
-                return false;
-            }
-
-            llama_synchronize(ctx);
-        }
-
-        return true;
-    };
-
-    // warm up
-    {
-        for (int i = 0; i < 16; ++i) {
-            llama_batch_add(batch, 0, i, { 0 }, false);
-        }
-
-        if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-            LOG_TEE("%s: llama_decode() failed\n", __func__);
-            return 1;
-        }
-    }
-
-    LOG_TEE("\n");
-    LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
-    LOG_TEE("\n");
-
-    LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
-    LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
-
-    for (        int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
-        for (    int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) {
-            for (int i_pl = 0; i_pl < (int) n_pl.size(); ++i_pl) {
-                const int pp = n_pp[i_pp];
-                const int tg = n_tg[i_tg];
-                const int pl = n_pl[i_pl];
-
-                const int n_ctx_req = is_pp_shared ? pp + pl*tg : pl*(pp + tg);
-
-                if (n_ctx_req > n_kv_max) {
-                    continue;
-                }
-
-                llama_batch_clear(batch);
-
-                for (int i = 0; i < pp; ++i) {
-                    for (int j = 0; j < (is_pp_shared ? 1 : pl); ++j) {
-                        llama_batch_add(batch, 0, i, { j }, false);
-                    }
-                }
-                batch.logits[batch.n_tokens - 1] = true;
-
-                const auto t_pp_start = ggml_time_us();
-
-                llama_kv_cache_clear(ctx);
-
-                if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-                    LOG_TEE("%s: llama_decode() failed\n", __func__);
-                    return 1;
-                }
-
-                if (is_pp_shared) {
-                    for (int32_t i = 1; i < pl; ++i) {
-                        llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
-                    }
-                }
-
-                const auto t_pp_end = ggml_time_us();
-
-                const auto t_tg_start = ggml_time_us();
-
-                for (int i = 0; i < tg; ++i) {
-                    llama_batch_clear(batch);
-
-                    for (int j = 0; j < pl; ++j) {
-                        llama_batch_add(batch, 0, pp + i, { j }, true);
-                    }
-
-                    if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-                        LOG_TEE("%s: llama_decode() failed\n", __func__);
-                        return 1;
-                    }
-                }
-
-                const auto t_tg_end = ggml_time_us();
-
-                const int32_t n_kv = n_ctx_req;
-
-                const float t_pp = (t_pp_end - t_pp_start) / 1000000.0f;
-                const float t_tg = (t_tg_end - t_tg_start) / 1000000.0f;
-                const float t    = t_pp + t_tg;
-
-                const float speed_pp = is_pp_shared ? pp / t_pp : pl*pp / t_pp;
-                const float speed_tg = pl*tg / t_tg;
-                const float speed    = n_kv / t;
-
-                LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
-            }
-        }
-    }
-
-    llama_print_timings(ctx);
-
-    llama_batch_free(batch);
-
-    llama_free(ctx);
-    llama_free_model(model);
-
-    llama_backend_free();
-
-    fprintf(stderr, "\n\n");
-
-    return 0;
-}
diff --git a/scripts/pod-llama.sh b/scripts/pod-llama.sh
index 44fb11b58..9099987d7 100644
--- a/scripts/pod-llama.sh
+++ b/scripts/pod-llama.sh
@@ -183,11 +183,6 @@ if [ "$1" -eq "1" ]; then
 
     make -j && ./bin/llama-perplexity -m ../models/tinyllama-1b/ggml-model-f16.gguf -f ./wikitext-2-raw/wiki.test.raw -ngl 100 --chunks 32
 
-    # batched-bench
-    cd /workspace/llama.cpp
-
-    GGML_CUDA=1 make -j && ./llama-batched-bench ./models/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32
-
     # parallel
     cd /workspace/llama.cpp
 
@@ -202,6 +197,3 @@ fi
 # GGML_CUDA=1 make -j && ./llama-speculative -m ./models/codellama-34b-instruct/ggml-model-f16.gguf -md ./models/codellama-7b-instruct/ggml-model-q4_0.gguf -p "# Dijkstra's shortest path algorithm in Python (4 spaces indentation) + complexity analysis:\n\n" -e -ngl 999 -ngld 999 -t 4 -n 512 -c 4096 -s 21 --draft 16 -np 1 --temp 0.0
 #fi
 
-# more benches
-#GGML_CUDA=1 make -j && ./llama-batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1
-#GGML_CUDA=1 make -j && ./llama-batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1
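
Note on the accounting that goes away with this patch: the deleted README defines `N_KV = B*(PP + TG)` when each sequence has its own prompt and `N_KV = PP + B*TG` when the prompt is shared, and the deleted `batched-bench.cpp` reports `S_PP = (B*PP)/T_PP` (or `PP/T_PP` when shared), `S_TG = (B*TG)/T_TG` and `S = N_KV/T`. The sketch below is a minimal standalone C++ program that reproduces this arithmetic; it is illustrative only and not part of the patch, the `bench_timing` and `n_kv_required` names are invented for the example, and the timings are taken from the `PP=128, TG=128, B=8` row of the deleted results table.

```cpp
// Minimal, self-contained illustration of the accounting described in the
// deleted README and implemented by the deleted batched-bench.cpp.
// All names below (bench_timing, n_kv_required) are invented for this sketch.
#include <cstdio>

struct bench_timing {
    float t_pp; // T_PP: prompt processing time, in seconds
    float t_tg; // T_TG: time to generate all batches, in seconds
};

// N_KV: PP + B*TG when the prompt is shared by all sequences,
//       B*(PP + TG) when every sequence has its own prompt
static int n_kv_required(int pp, int tg, int b, bool pp_shared) {
    return pp_shared ? pp + b*tg : b*(pp + tg);
}

int main() {
    const int  pp = 128, tg = 128, b = 8; // PP, TG, B from one row of the deleted table
    const bool pp_shared = false;

    const bench_timing t = { 0.751f, 7.344f }; // timings reported for that row

    const int   n_kv = n_kv_required(pp, tg, b, pp_shared);
    const float s_pp = (pp_shared ? pp : b*pp) / t.t_pp; // S_PP = (B*PP)/T_PP or PP/T_PP
    const float s_tg = b*tg / t.t_tg;                    // S_TG = (B*TG)/T_TG
    const float s    = n_kv / (t.t_pp + t.t_tg);         // S    = all tokens / total time

    printf("N_KV = %d, S_PP = %.2f t/s, S_TG = %.2f t/s, S = %.2f t/s\n", n_kv, s_pp, s_tg, s);
    return 0;
}
```

For that row it prints `N_KV = 2048` with speeds matching the deleted table (S_PP ≈ 1363 t/s, S_TG ≈ 139.4 t/s, S ≈ 253 t/s) up to the rounding of the published timings.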