From 4b781c20558f0f1694b737f1c3117e2779ccec96 Mon Sep 17 00:00:00 2001 From: eiery <19350831+eiery@users.noreply.github.com> Date: Thu, 20 Apr 2023 17:04:31 -0400 Subject: [PATCH] set default n_batch to 512 when using BLAS --- Makefile | 3 +++ examples/common.h | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/Makefile b/Makefile index f267d0864..f3434fbfe 100644 --- a/Makefile +++ b/Makefile @@ -93,15 +93,18 @@ ifndef LLAMA_NO_ACCELERATE # `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time). ifeq ($(UNAME_S),Darwin) CFLAGS += -DGGML_USE_ACCELERATE + CXXFLAGS += -DGGML_USE_ACCELERATE LDFLAGS += -framework Accelerate endif endif ifdef LLAMA_OPENBLAS CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas + CXXFLAGS += -DGGML_USE_OPENBLAS LDFLAGS += -lopenblas endif ifdef LLAMA_CUBLAS CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include + CXXFLAGS += -DGGML_USE_CUBLAS LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 OBJS += ggml-cuda.o ggml-cuda.o: ggml-cuda.cu ggml-cuda.h diff --git a/examples/common.h b/examples/common.h index cbbc2dfab..734bcd3fa 100644 --- a/examples/common.h +++ b/examples/common.h @@ -20,7 +20,11 @@ struct gpt_params { int32_t repeat_last_n = 64; // last n tokens to penalize int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions) int32_t n_ctx = 512; // context size +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) + int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) +#else int32_t n_batch = 8; // batch size for prompt processing +#endif int32_t n_keep = 0; // number of tokens to keep from initial prompt // sampling parameters