Merge pull request #8 from zihaoccc/cleanup5

remove batched-bench
2024-07-26 16:51:16 -07:00 · 2024-07-26 16:51:16 -07:00 · 0f2350dcb6
commit 0f2350dcb6
parent 5630607f2b 5545e8ff92
6 changed files with 1 additions and 282 deletions
--- a/3
+++ b/3
@ -1,7 +1,6 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
 	libllava.a \
 	llama-batched-bench \
 	llama-bench \
 	llama-benchmark-matmult \
 	llama-cli \
@ -39,7 +38,7 @@ BUILD_TARGETS = \
 # Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
 LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml \
-	simple batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli \
+	simple save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli \
 	retrieval speculative infill tokenize benchmark-matmult parallel export-lora lookahead lookup passkey gritlm
 # Legacy build targets that were renamed in #7809, but for which we still want to build binaries that output a deprecation warning if people try to use them.
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -13,7 +13,6 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 if (EMSCRIPTEN)
 else()
    add_subdirectory(cvector-generator)
    add_subdirectory(batched-bench)
    add_subdirectory(benchmark)
    add_subdirectory(convert-llama2c-to-ggml)
    add_subdirectory(embedding)
--- a/examples/batched-bench/CMakeLists.txt
+++ b/examples/batched-bench/CMakeLists.txt
@ -1,5 +0,0 @@
 # Name of the executable built from this directory.
 set(TARGET llama-batched-bench)

 # Single-source executable.
 add_executable(${TARGET} batched-bench.cpp)

 # Build requirements: C++11 plus the common/llama libraries and the thread library.
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

 # Install the resulting binary as a runtime artifact.
 install(TARGETS ${TARGET} RUNTIME)
--- a/examples/batched-bench/README.md
+++ b/examples/batched-bench/README.md
@ -1,51 +0,0 @@
 # llama.cpp/examples/batched-bench
 Benchmark the batched decoding performance of `llama.cpp`
 ## Usage
 There are 2 modes of operation:
 - `prompt not shared` - each batch has a separate prompt of size `PP` (i.e. `N_KV = B*(PP + TG)`)
 - `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)
 ```bash
 ./llama-batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]
 # LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
 ./llama-batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99
 # LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
 ./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps
 # custom set of batches
 ./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32
 ```
 ## Sample results
 - `PP` - prompt tokens per batch
 - `TG` - generated tokens per batch
 - `B` - number of batches
 - `N_KV` - required KV cache size
 - `T_PP` - prompt processing time (i.e. time to first token)
 - `S_PP` - prompt processing speed (`(B*PP)/T_PP` or `PP/T_PP`)
 - `T_TG` - time to generate all batches
 - `S_TG` - text generation speed (`(B*TG)/T_TG`)
 - `T` - total time
 - `S` - total speed (i.e. all tokens / total time)
 |    PP |     TG |    B |   N_KV |   T_PP s | S_PP t/s |   T_TG s | S_TG t/s |      T s |    S t/s |
 |-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
 |   128 |    128 |    1 |    256 |    0.108 |  1186.64 |    3.079 |    41.57 |    3.187 |    80.32 |
 |   128 |    128 |    2 |    512 |    0.198 |  1295.19 |    5.029 |    50.90 |    5.227 |    97.95 |
 |   128 |    128 |    4 |   1024 |    0.373 |  1373.96 |    6.878 |    74.44 |    7.251 |   141.23 |
 |   128 |    128 |    8 |   2048 |    0.751 |  1363.27 |    7.344 |   139.43 |    8.095 |   252.99 |
 |   128 |    128 |   16 |   4096 |    1.570 |  1304.68 |    8.455 |   242.23 |   10.024 |   408.60 |
 |   128 |    128 |   32 |   8192 |    3.408 |  1201.73 |    8.801 |   465.40 |   12.209 |   670.96 |
 |   128 |    256 |    1 |    384 |    0.107 |  1196.70 |    6.329 |    40.45 |    6.436 |    59.67 |
 |   128 |    256 |    2 |    768 |    0.194 |  1317.45 |   10.239 |    50.00 |   10.433 |    73.61 |
 |   128 |    256 |    4 |   1536 |    0.366 |  1399.03 |   13.960 |    73.35 |   14.326 |   107.22 |
 |   128 |    256 |    8 |   3072 |    0.751 |  1363.92 |   15.110 |   135.54 |   15.861 |   193.69 |
 |   128 |    256 |   16 |   6144 |    1.569 |  1304.93 |   18.073 |   226.64 |   19.642 |   312.80 |
 |   128 |    256 |   32 |  12288 |    3.409 |  1201.35 |   19.223 |   426.15 |   22.633 |   542.93 |
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@ -1,215 +0,0 @@
 #include "common.h"
 #include "llama.h"
 #include <algorithm>
 #include <cmath>
 #include <cstdio>
 #include <string>
 #include <vector>
 // Parses a comma-separated list of integers, e.g. "128,256,512".
 //
 // NOTE: mutates the input string - every ',' is overwritten with '\0' so that
 // each field can be handed to std::atoi() as its own C string.
 //
 // The result always holds (number of commas + 1) entries; a field that
 // std::atoi() cannot convert (including an empty one) contributes 0.
 static std::vector<int> parse_list(char * p) {
    std::vector<int> values;

    // start of the field that is currently being scanned
    char * field = p;

    for (char * cur = p; *cur != '\0'; ++cur) {
        if (*cur != ',') {
            continue;
        }

        // terminate the current field in place and convert it
        *cur = '\0';
        values.push_back(std::atoi(field));
        field = cur + 1;
    }

    // the trailing field has no ',' after it
    values.push_back(std::atoi(field));

    return values;
 }
 // Prints the usage text: first whatever gpt_params_print_usage() emits for the
 // given arguments/params, then an example command line for this benchmark
 // with argv[0] substituted as the program name.
 static void print_usage(int argc, char ** argv, const gpt_params & params) {
    gpt_params_print_usage(argc, argv, params);

    const char * prog = argv[0];

    LOG_TEE("\nexample usage:\n");
    LOG_TEE("\n    %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", prog);
    LOG_TEE("\n");
 }
 int main(int argc, char ** argv) {
    gpt_params params;
    if (!gpt_params_parse(argc, argv, params)) {
        print_usage(argc, argv, params);
        return 1;
    }
    int is_pp_shared = params.is_pp_shared;
    std::vector<int> n_pp = params.n_pp;
    std::vector<int> n_tg = params.n_tg;
    std::vector<int> n_pl = params.n_pl;
    // init LLM
    llama_backend_init();
    llama_numa_init(params.numa);
    // initialize the model
    llama_model_params model_params = llama_model_params_from_gpt_params(params);
    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
    if (model == NULL) {
        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
        return 1;
    }
    llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
    // ensure enough sequences are available
    ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end());
    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
    if (ctx == NULL) {
        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
        return 1;
    }
    const int32_t n_kv_max = llama_n_ctx(ctx);
    llama_batch batch = llama_batch_init(n_kv_max, 0, 1);
    // decode in batches of ctx_params.n_batch tokens
    auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) {
        for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
            const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
            llama_batch batch_view = {
                n_tokens,
                batch.token    + i,
                nullptr,
                batch.pos      + i,
                batch.n_seq_id + i,
                batch.seq_id   + i,
                batch.logits   + i,
                0, 0, 0, // unused
            };
            const int ret = llama_decode(ctx, batch_view);
            if (ret != 0) {
                LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
                return false;
            }
            llama_synchronize(ctx);
        }
        return true;
    };
    // warm up
    {
        for (int i = 0; i < 16; ++i) {
            llama_batch_add(batch, 0, i, { 0 }, false);
        }
        if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
            LOG_TEE("%s: llama_decode() failed\n", __func__);
            return 1;
        }
    }
    LOG_TEE("\n");
    LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
    LOG_TEE("\n");
    LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP",     "TG",     "B",    "N_KV",     "T_PP s",   "S_PP t/s", "T_TG s",   "S_TG t/s", "T s",      "S t/s");
    LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
    for (        int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
        for (    int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) {
            for (int i_pl = 0; i_pl < (int) n_pl.size(); ++i_pl) {
                const int pp = n_pp[i_pp];
                const int tg = n_tg[i_tg];
                const int pl = n_pl[i_pl];
                const int n_ctx_req = is_pp_shared ? pp + pl*tg : pl*(pp + tg);
                if (n_ctx_req > n_kv_max) {
                    continue;
                }
                llama_batch_clear(batch);
                for (int i = 0; i < pp; ++i) {
                    for (int j = 0; j < (is_pp_shared ? 1 : pl); ++j) {
                        llama_batch_add(batch, 0, i, { j }, false);
                    }
                }
                batch.logits[batch.n_tokens - 1] = true;
                const auto t_pp_start = ggml_time_us();
                llama_kv_cache_clear(ctx);
                if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
                    LOG_TEE("%s: llama_decode() failed\n", __func__);
                    return 1;
                }
                if (is_pp_shared) {
                    for (int32_t i = 1; i < pl; ++i) {
                        llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
                    }
                }
                const auto t_pp_end = ggml_time_us();
                const auto t_tg_start = ggml_time_us();
                for (int i = 0; i < tg; ++i) {
                    llama_batch_clear(batch);
                    for (int j = 0; j < pl; ++j) {
                        llama_batch_add(batch, 0, pp + i, { j }, true);
                    }
                    if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
                        LOG_TEE("%s: llama_decode() failed\n", __func__);
                        return 1;
                    }
                }
                const auto t_tg_end = ggml_time_us();
                const int32_t n_kv = n_ctx_req;
                const float t_pp = (t_pp_end - t_pp_start) / 1000000.0f;
                const float t_tg = (t_tg_end - t_tg_start) / 1000000.0f;
                const float t    = t_pp + t_tg;
                const float speed_pp = is_pp_shared ? pp / t_pp : pl*pp / t_pp;
                const float speed_tg = pl*tg / t_tg;
                const float speed    = n_kv / t;
                LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
            }
        }
    }
    llama_print_timings(ctx);
    llama_batch_free(batch);
    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    fprintf(stderr, "\n\n");
    return 0;
 }
--- a/scripts/pod-llama.sh
+++ b/scripts/pod-llama.sh
@ -183,11 +183,6 @@ if [ "$1" -eq "1" ]; then
    make -j && ./bin/llama-perplexity -m ../models/tinyllama-1b/ggml-model-f16.gguf -f ./wikitext-2-raw/wiki.test.raw -ngl 100 --chunks 32
    # batched-bench
    cd /workspace/llama.cpp
    GGML_CUDA=1 make -j && ./llama-batched-bench ./models/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32
    # parallel
    cd /workspace/llama.cpp
@ -202,6 +197,3 @@ fi
 #    GGML_CUDA=1 make -j && ./llama-speculative -m ./models/codellama-34b-instruct/ggml-model-f16.gguf -md ./models/codellama-7b-instruct/ggml-model-q4_0.gguf -p "# Dijkstra's shortest path algorithm in Python (4 spaces indentation) + complexity analysis:\n\n" -e -ngl 999 -ngld 999 -t 4 -n 512 -c 4096 -s 21 --draft 16 -np 1 --temp 0.0
 #fi
 # more benches
 #GGML_CUDA=1 make -j && ./llama-batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf  4096 1 99 1 512,3200 128,128,800 1
 #GGML_CUDA=1 make -j && ./llama-batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1