diff --git a/common/common.h b/common/common.h
index c50dc2f11..0f178b9eb 100644
--- a/common/common.h
+++ b/common/common.h
@@ -51,7 +51,7 @@ struct gpt_params {
     int32_t n_threads_batch_draft = -1;
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_ctx = 512; // context size
-    int32_t n_batch = 4096; // logical batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
     int32_t n_draft = 5; // number of tokens to draft during speculative decoding
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index 4a6ebd8df..bf94e7e7a 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -184,7 +184,7 @@ static const cmd_params cmd_params_defaults = {
     /* model */ {"models/7B/ggml-model-q4_0.gguf"},
     /* n_prompt */ {512},
     /* n_gen */ {128},
-    /* n_batch */ {4096},
+    /* n_batch */ {2048},
     /* n_ubatch */ {512},
     /* type_k */ {GGML_TYPE_F16},
     /* type_v */ {GGML_TYPE_F16},
diff --git a/llama.cpp b/llama.cpp
index 39d498708..38e7036a7 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12579,7 +12579,7 @@ struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
         /*.seed =*/ LLAMA_DEFAULT_SEED,
         /*.n_ctx =*/ 512,
-        /*.n_batch =*/ 4096,
+        /*.n_batch =*/ 2048,
         /*.n_ubatch =*/ 512,
         /*.n_seq_max =*/ 1,
         /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
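
For reference, a minimal sketch of how the new default is observed by API callers, and how a caller can still opt back into the old value. This assumes the llama.cpp C API at this revision (llama_context_default_params() and the n_batch / n_ubatch fields touched in the hunk above); it is illustrative, not part of the change itself.

```cpp
// Sketch only: shows the new default logical batch size and how to override it.
// Assumes llama.h from this revision of the repository.
#include "llama.h"
#include <cstdio>

int main() {
    llama_context_params cparams = llama_context_default_params();

    // With this change, the logical batch size (n_batch) defaults to 2048 tokens,
    // while the physical batch size (n_ubatch) remains 512.
    printf("default n_batch  = %u\n", cparams.n_batch);   // 2048 after this patch
    printf("default n_ubatch = %u\n", cparams.n_ubatch);  // 512

    // Callers that relied on the previous 4096 default can still request it explicitly:
    cparams.n_batch = 4096;

    return 0;
}
```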