set default n_batch to 512 when using BLAS

eiery 2023-04-20 17:04:31 -04:00
parent 12b5900dbc
commit 4b781c2055
2 changed files with 7 additions and 0 deletions

Makefile

@@ -93,15 +93,18 @@ ifndef LLAMA_NO_ACCELERATE
 # `-framework Accelerate` works on Mac Intel as well, with negligible performance boost (as measured at predict time).
 ifeq ($(UNAME_S),Darwin)
 	CFLAGS   += -DGGML_USE_ACCELERATE
+	CXXFLAGS += -DGGML_USE_ACCELERATE
 	LDFLAGS  += -framework Accelerate
 endif
 endif
 ifdef LLAMA_OPENBLAS
 	CFLAGS   += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
+	CXXFLAGS += -DGGML_USE_OPENBLAS
 	LDFLAGS  += -lopenblas
 endif
 ifdef LLAMA_CUBLAS
 	CFLAGS   += -DGGML_USE_CUBLAS -I/usr/local/cuda/include
+	CXXFLAGS += -DGGML_USE_CUBLAS
 	LDFLAGS  += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64
 	OBJS     += ggml-cuda.o
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
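The three CXXFLAGS += lines are the substance of the Makefile change: the GGML_USE_* macros were previously passed only to the C compiler, so a #if on them in a C++ translation unit (anything that includes common.h) would always take the non-BLAS branch. A minimal sketch of the effect, using a hypothetical stand-alone file check.cpp (not part of the commit) compiled with the same CXXFLAGS, e.g. under make LLAMA_OPENBLAS=1:

// check.cpp -- hypothetical, not part of the commit; shows that the BLAS
// macro is now visible to C++ code, which is what selects the 512 default.
#include <cstdio>

int main() {
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
    std::printf("BLAS macro visible to C++: n_batch defaults to 512\n");
#else
    std::printf("no BLAS macro: n_batch defaults to 8\n");
#endif
    return 0;
}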

examples/common.h

@@ -20,7 +20,11 @@ struct gpt_params {
 	int32_t repeat_last_n = 64;  // last n tokens to penalize
 	int32_t n_parts       = -1;  // number of model parts (-1 = determine from model dimensions)
 	int32_t n_ctx         = 512; // context size
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined (GGML_USE_CUBLAS)
+	int32_t n_batch       = 512; // batch size for prompt processing (must be >=32 to use BLAS)
+#else
 	int32_t n_batch       = 8;   // batch size for prompt processing
+#endif
 	int32_t n_keep        = 0;   // number of tokens to keep from initial prompt
 
 	// sampling parameters
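Read together with the Makefile change, the new default only takes effect in BLAS-enabled builds; plain builds keep the old default of 8, and the comment records that ggml falls back to the non-BLAS path for batches smaller than 32. A minimal sketch of how the conditional default resolves, using a hypothetical stand-in struct so it compiles outside the repo:

// default_sketch.cpp -- hypothetical stand-in for gpt_params, not part of the commit.
#include <cstdint>
#include <cstdio>

struct params_sketch {
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
    int32_t n_batch = 512; // BLAS build: larger prompt batches feed the GEMM path
#else
    int32_t n_batch = 8;   // non-BLAS build: keep the old, smaller default
#endif
};

int main() {
    params_sketch p;
    // Per the comment in the diff, BLAS is only used when n_batch >= 32,
    // so 512 comfortably clears that threshold while 8 never would.
    std::printf("default n_batch = %d\n", (int) p.n_batch);
    return 0;
}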