From 5545e8ff920ae15d0a17d4b0b3347da8ba7c7707 Mon Sep 17 00:00:00 2001
From: Wenjing Yu
Date: Fri, 26 Jul 2024 16:50:28 -0700
Subject: [PATCH] remove batched-bench

---
 Makefile                                 |   3 +-
 examples/CMakeLists.txt                  |   1 -
 examples/batched-bench/CMakeLists.txt    |   5 -
 examples/batched-bench/README.md         |  51 ------
 examples/batched-bench/batched-bench.cpp | 215 -----------------------
 scripts/pod-llama.sh                     |   8 -
 6 files changed, 1 insertion(+), 282 deletions(-)
 delete mode 100644 examples/batched-bench/CMakeLists.txt
 delete mode 100644 examples/batched-bench/README.md
 delete mode 100644 examples/batched-bench/batched-bench.cpp

diff --git a/Makefile b/Makefile
index 8d66ad994..72ef316ea 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,6 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
 	libllava.a \
-	llama-batched-bench \
 	llama-bench \
 	llama-benchmark-matmult \
 	llama-cli \
@@ -39,7 +38,7 @@ BUILD_TARGETS = \
 
 # Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
 LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml \
-	simple batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli \
+	simple save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli \
 	retrieval speculative infill tokenize benchmark-matmult parallel export-lora lookahead lookup passkey gritlm
 
 # Legacy build targets that were renamed in #7809, but we want to build binaries that for them that output a deprecation warning if people try to use them.
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 8604639e8..a157473c5 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -13,7 +13,6 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 if (EMSCRIPTEN)
 else()
     add_subdirectory(cvector-generator)
-    add_subdirectory(batched-bench)
     add_subdirectory(benchmark)
    add_subdirectory(convert-llama2c-to-ggml)
     add_subdirectory(embedding)
diff --git a/examples/batched-bench/CMakeLists.txt b/examples/batched-bench/CMakeLists.txt
deleted file mode 100644
index 959acaeee..000000000
--- a/examples/batched-bench/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-set(TARGET llama-batched-bench)
-add_executable(${TARGET} batched-bench.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/batched-bench/README.md b/examples/batched-bench/README.md
deleted file mode 100644
index 4a07fe6bb..000000000
--- a/examples/batched-bench/README.md
+++ /dev/null
@@ -1,51 +0,0 @@
-# llama.cpp/example/batched-bench
-
-Benchmark the batched decoding performance of `llama.cpp`
-
-## Usage
-
-There are 2 modes of operation:
-
-- `prompt not shared` - each batch has a separate prompt of size `PP` (i.e. `N_KV = B*(PP + TG)`)
-- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)
-
-```bash
-./llama-batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]
-
-# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
-./llama-batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99
-
-# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
-./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps
-
-# custom set of batches
-./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32
-```
-
-## Sample results
-
-- `PP` - prompt tokens per batch
-- `TG` - generated tokens per batch
-- `B` - number of batches
-- `N_KV` - required KV cache size
-- `T_PP` - prompt processing time (i.e. time to first token)
-- `S_PP` - prompt processing speed (`(B*PP)/T_PP` or `PP/T_PP`)
-- `T_TG` - time to generate all batches
-- `S_TG` - text generation speed (`(B*TG)/T_TG`)
-- `T` - total time
-- `S` - total speed (i.e. all tokens / total time)
-
-| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
-|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
-| 128 | 128 | 1 | 256 | 0.108 | 1186.64 | 3.079 | 41.57 | 3.187 | 80.32 |
-| 128 | 128 | 2 | 512 | 0.198 | 1295.19 | 5.029 | 50.90 | 5.227 | 97.95 |
-| 128 | 128 | 4 | 1024 | 0.373 | 1373.96 | 6.878 | 74.44 | 7.251 | 141.23 |
-| 128 | 128 | 8 | 2048 | 0.751 | 1363.27 | 7.344 | 139.43 | 8.095 | 252.99 |
-| 128 | 128 | 16 | 4096 | 1.570 | 1304.68 | 8.455 | 242.23 | 10.024 | 408.60 |
-| 128 | 128 | 32 | 8192 | 3.408 | 1201.73 | 8.801 | 465.40 | 12.209 | 670.96 |
-| 128 | 256 | 1 | 384 | 0.107 | 1196.70 | 6.329 | 40.45 | 6.436 | 59.67 |
-| 128 | 256 | 2 | 768 | 0.194 | 1317.45 | 10.239 | 50.00 | 10.433 | 73.61 |
-| 128 | 256 | 4 | 1536 | 0.366 | 1399.03 | 13.960 | 73.35 | 14.326 | 107.22 |
-| 128 | 256 | 8 | 3072 | 0.751 | 1363.92 | 15.110 | 135.54 | 15.861 | 193.69 |
-| 128 | 256 | 16 | 6144 | 1.569 | 1304.93 | 18.073 | 226.64 | 19.642 | 312.80 |
-| 128 | 256 | 32 | 12288 | 3.409 | 1201.35 | 19.223 | 426.15 | 22.633 | 542.93 |
diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp
deleted file mode 100644
index 718f0a61a..000000000
--- a/examples/batched-bench/batched-bench.cpp
+++ /dev/null
@@ -1,215 +0,0 @@
-#include "common.h"
-#include "llama.h"
-
-#include <algorithm>
-#include <cmath>
-#include <cstdio>
-#include <string>
-#include <vector>
-
-// mutates the input string
-static std::vector<int> parse_list(char * p) {
-    std::vector<int> ret;
-
-    char * q = p;
-
-    while (*p) {
-        if (*p == ',') {
-            *p = '\0';
-            ret.push_back(std::atoi(q));
-            q = p + 1;
-        }
-
-        ++p;
-    }
-
-    ret.push_back(std::atoi(q));
-
-    return ret;
-}
-
-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-    gpt_params_print_usage(argc, argv, params);
-
-    LOG_TEE("\nexample usage:\n");
-    LOG_TEE("\n    %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
-    LOG_TEE("\n");
-}
-
-int main(int argc, char ** argv) {
-    gpt_params params;
-
-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv, params);
-        return 1;
-    }
-
-    int is_pp_shared = params.is_pp_shared;
-
-    std::vector<int> n_pp = params.n_pp;
-    std::vector<int> n_tg = params.n_tg;
-    std::vector<int> n_pl = params.n_pl;
-
-    // init LLM
-
-    llama_backend_init();
-    llama_numa_init(params.numa);
-
-    // initialize the model
-
-    llama_model_params model_params = llama_model_params_from_gpt_params(params);
-
-    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
-
-    if (model == NULL) {
-        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
-        return 1;
-    }
-
-    llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
-
-    // ensure enough sequences are available
-    ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end());
-
-    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
-
-    if (ctx == NULL) {
-        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
-        return 1;
-    }
-
-    const int32_t n_kv_max = llama_n_ctx(ctx);
-
-    llama_batch batch = llama_batch_init(n_kv_max, 0, 1);
-
-    // decode in batches of ctx_params.n_batch tokens
-    auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) {
-        for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
-            const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
-
-            llama_batch batch_view = {
-                n_tokens,
-                batch.token    + i,
-                nullptr,
-                batch.pos      + i,
-                batch.n_seq_id + i,
-                batch.seq_id   + i,
-                batch.logits   + i,
-                0, 0, 0, // unused
-            };
-
-            const int ret = llama_decode(ctx, batch_view);
-            if (ret != 0) {
-                LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
-                return false;
-            }
-
-            llama_synchronize(ctx);
-        }
-
-        return true;
-    };
-
-    // warm up
-    {
-        for (int i = 0; i < 16; ++i) {
-            llama_batch_add(batch, 0, i, { 0 }, false);
-        }
-
-        if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-            LOG_TEE("%s: llama_decode() failed\n", __func__);
-            return 1;
-        }
-    }
-
-    LOG_TEE("\n");
-    LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
-    LOG_TEE("\n");
-
-    LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
-    LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
-
-    for (        int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
-        for (    int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) {
-            for (int i_pl = 0; i_pl < (int) n_pl.size(); ++i_pl) {
-                const int pp = n_pp[i_pp];
-                const int tg = n_tg[i_tg];
-                const int pl = n_pl[i_pl];
-
-                const int n_ctx_req = is_pp_shared ? pp + pl*tg : pl*(pp + tg);
-
-                if (n_ctx_req > n_kv_max) {
-                    continue;
-                }
-
-                llama_batch_clear(batch);
-
-                for (int i = 0; i < pp; ++i) {
-                    for (int j = 0; j < (is_pp_shared ? 1 : pl); ++j) {
-                        llama_batch_add(batch, 0, i, { j }, false);
-                    }
-                }
-                batch.logits[batch.n_tokens - 1] = true;
-
-                const auto t_pp_start = ggml_time_us();
-
-                llama_kv_cache_clear(ctx);
-
-                if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-                    LOG_TEE("%s: llama_decode() failed\n", __func__);
-                    return 1;
-                }
-
-                if (is_pp_shared) {
-                    for (int32_t i = 1; i < pl; ++i) {
-                        llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
-                    }
-                }
-
-                const auto t_pp_end = ggml_time_us();
-
-                const auto t_tg_start = ggml_time_us();
-
-                for (int i = 0; i < tg; ++i) {
-                    llama_batch_clear(batch);
-
-                    for (int j = 0; j < pl; ++j) {
-                        llama_batch_add(batch, 0, pp + i, { j }, true);
-                    }
-
-                    if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-                        LOG_TEE("%s: llama_decode() failed\n", __func__);
-                        return 1;
-                    }
-                }
-
-                const auto t_tg_end = ggml_time_us();
-
-                const int32_t n_kv = n_ctx_req;
-
-                const float t_pp = (t_pp_end - t_pp_start) / 1000000.0f;
-                const float t_tg = (t_tg_end - t_tg_start) / 1000000.0f;
-                const float t    = t_pp + t_tg;
-
-                const float speed_pp = is_pp_shared ? pp / t_pp : pl*pp / t_pp;
-                const float speed_tg = pl*tg / t_tg;
-                const float speed    = n_kv / t;
-
-                LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
-            }
-        }
-    }
-
-    llama_print_timings(ctx);
-
-    llama_batch_free(batch);
-
-    llama_free(ctx);
-    llama_free_model(model);
-
-    llama_backend_free();
-
-    fprintf(stderr, "\n\n");
-
-    return 0;
-}
diff --git a/scripts/pod-llama.sh b/scripts/pod-llama.sh
index 44fb11b58..9099987d7 100644
--- a/scripts/pod-llama.sh
+++ b/scripts/pod-llama.sh
@@ -183,11 +183,6 @@ if [ "$1" -eq "1" ]; then
 
     make -j && ./bin/llama-perplexity -m ../models/tinyllama-1b/ggml-model-f16.gguf -f ./wikitext-2-raw/wiki.test.raw -ngl 100 --chunks 32
 
-    # batched-bench
-    cd /workspace/llama.cpp
-
-    GGML_CUDA=1 make -j && ./llama-batched-bench ./models/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32
-
     # parallel
     cd /workspace/llama.cpp
 
@@ -202,6 +197,3 @@ fi
 # GGML_CUDA=1 make -j && ./llama-speculative -m ./models/codellama-34b-instruct/ggml-model-f16.gguf -md ./models/codellama-7b-instruct/ggml-model-q4_0.gguf -p "# Dijkstra's shortest path algorithm in Python (4 spaces indentation) + complexity analysis:\n\n" -e -ngl 999 -ngld 999 -t 4 -n 512 -c 4096 -s 21 --draft 16 -np 1 --temp 0.0
 #fi
 
-# more benches
-#GGML_CUDA=1 make -j && ./llama-batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1
-#GGML_CUDA=1 make -j && ./llama-batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1
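
Note on the accounting that goes away with this patch: the deleted README defines `N_KV = B*(PP + TG)` when each sequence has its own prompt and `N_KV = PP + B*TG` when the prompt is shared, and the deleted `batched-bench.cpp` reports `S_PP = (B*PP)/T_PP` (or `PP/T_PP` when shared), `S_TG = (B*TG)/T_TG` and `S = N_KV/T`. The sketch below is a minimal standalone C++ program that reproduces this arithmetic; it is illustrative only and not part of the patch, the `bench_timing` and `n_kv_required` names are invented for the example, and the timings are taken from the `PP=128, TG=128, B=8` row of the deleted results table.

```cpp
// Minimal, self-contained illustration of the accounting described in the
// deleted README and implemented by the deleted batched-bench.cpp.
// All names below (bench_timing, n_kv_required) are invented for this sketch.
#include <cstdio>

struct bench_timing {
    float t_pp; // T_PP: prompt processing time, in seconds
    float t_tg; // T_TG: time to generate all batches, in seconds
};

// N_KV: PP + B*TG when the prompt is shared by all sequences,
//       B*(PP + TG) when every sequence has its own prompt
static int n_kv_required(int pp, int tg, int b, bool pp_shared) {
    return pp_shared ? pp + b*tg : b*(pp + tg);
}

int main() {
    const int  pp = 128, tg = 128, b = 8; // PP, TG, B from one row of the deleted table
    const bool pp_shared = false;

    const bench_timing t = { 0.751f, 7.344f }; // timings reported for that row

    const int   n_kv = n_kv_required(pp, tg, b, pp_shared);
    const float s_pp = (pp_shared ? pp : b*pp) / t.t_pp; // S_PP = (B*PP)/T_PP or PP/T_PP
    const float s_tg = b*tg / t.t_tg;                    // S_TG = (B*TG)/T_TG
    const float s    = n_kv / (t.t_pp + t.t_tg);         // S    = all tokens / total time

    printf("N_KV = %d, S_PP = %.2f t/s, S_TG = %.2f t/s, S = %.2f t/s\n", n_kv, s_pp, s_tg, s);
    return 0;
}
```

For that row it prints `N_KV = 2048` with speeds matching the deleted table (S_PP ≈ 1363 t/s, S_TG ≈ 139.4 t/s, S ≈ 253 t/s) up to the rounding of the published timings.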