reduce default n_batch to 2048

2024-03-13 18:37:17 +01:00 · 2024-03-13 18:37:17 +01:00 · 976176d0dd
commit 976176d0dd
parent 1f564815a3
3 changed files with 3 additions and 3 deletions
--- a/common/common.h
+++ b/common/common.h
@@ -51,7 +51,7 @@ struct gpt_params {
    int32_t n_threads_batch_draft = -1;
    int32_t n_predict             = -1;    // new tokens to predict
    int32_t n_ctx                 = 512;   // context size
-    int32_t n_batch               = 4096;  // logical batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_batch               = 2048;  // logical batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_ubatch              = 512;   // physical batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_keep                = 0;     // number of tokens to keep from initial prompt
    int32_t n_draft               = 5;     // number of tokens to draft during speculative decoding
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -184,7 +184,7 @@ static const cmd_params cmd_params_defaults = {
    /* model         */ {"models/7B/ggml-model-q4_0.gguf"},
    /* n_prompt      */ {512},
    /* n_gen         */ {128},
-    /* n_batch       */ {4096},
+    /* n_batch       */ {2048},
    /* n_ubatch      */ {512},
    /* type_k        */ {GGML_TYPE_F16},
    /* type_v        */ {GGML_TYPE_F16},
--- a/llama.cpp
+++ b/llama.cpp
@@ -12579,7 +12579,7 @@ struct llama_context_params llama_context_default_params() {
    struct llama_context_params result = {
        /*.seed                        =*/ LLAMA_DEFAULT_SEED,
        /*.n_ctx                       =*/ 512,
-        /*.n_batch                     =*/ 4096,
+        /*.n_batch                     =*/ 2048,
        /*.n_ubatch                    =*/ 512,
        /*.n_seq_max                   =*/ 1,
        /*.n_threads                   =*/ GGML_DEFAULT_N_THREADS, // TODO: better default