From 94cb00a3cff052d0681e0f673fd137d783cc3105 Mon Sep 17 00:00:00 2001
From: eiery <19350831+eiery@users.noreply.github.com>
Date: Thu, 20 Apr 2023 20:57:16 -0400
Subject: [PATCH] alternate implementation of setting different n_batch for
 BLAS

---
 Makefile          | 3 ---
 examples/common.h | 7 ++-----
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/Makefile b/Makefile
index f3434fbfe..f267d0864 100644
--- a/Makefile
+++ b/Makefile
@@ -93,18 +93,15 @@ ifndef LLAMA_NO_ACCELERATE
 	# `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time).
 	ifeq ($(UNAME_S),Darwin)
 		CFLAGS += -DGGML_USE_ACCELERATE
-		CXXFLAGS += -DGGML_USE_ACCELERATE
 		LDFLAGS += -framework Accelerate
 	endif
 endif
 ifdef LLAMA_OPENBLAS
 	CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
-	CXXFLAGS += -DGGML_USE_OPENBLAS
 	LDFLAGS += -lopenblas
 endif
 ifdef LLAMA_CUBLAS
 	CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include
-	CXXFLAGS += -DGGML_USE_CUBLAS
 	LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64
 	OBJS += ggml-cuda.o
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
diff --git a/examples/common.h b/examples/common.h
index 71fc75113..d20aacd02 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -2,6 +2,7 @@

 #pragma once

+#include "ggml.h"
 #include "llama.h"
 #include <string>

@@ -20,11 +21,7 @@ struct gpt_params {
     int32_t repeat_last_n = 64;  // last n tokens to penalize
     int32_t n_parts       = -1;  // amount of model parts (-1 = determine from model dimensions)
     int32_t n_ctx         = 512; // context size
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined (GGML_USE_CUBLAS)
-    int32_t n_batch = 512;  // batch size for prompt processing (must be >=32 to use BLAS)
-#else
-    int32_t n_batch = 8;  // batch size for prompt processing
-#endif
+    int32_t n_batch = ggml_cpu_has_blas() ? 512 : 8; // batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep        = 0;   // number of tokens to keep from initial prompt

 // sampling parameters