From 94cb00a3cff052d0681e0f673fd137d783cc3105 Mon Sep 17 00:00:00 2001
From: eiery <19350831+eiery@users.noreply.github.com>
Date: Thu, 20 Apr 2023 20:57:16 -0400
Subject: [PATCH] alternate implementation of setting different n_batch for
 BLAS

---
 Makefile          | 3 ---
 examples/common.h | 7 ++-----
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/Makefile b/Makefile
index f3434fbfe..f267d0864 100644
--- a/Makefile
+++ b/Makefile
@@ -93,18 +93,15 @@ ifndef LLAMA_NO_ACCELERATE
 	# `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time).
 	ifeq ($(UNAME_S),Darwin)
 		CFLAGS += -DGGML_USE_ACCELERATE
-		CXXFLAGS += -DGGML_USE_ACCELERATE
 		LDFLAGS += -framework Accelerate
 	endif
 endif
 ifdef LLAMA_OPENBLAS
 	CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
-	CXXFLAGS += -DGGML_USE_OPENBLAS
 	LDFLAGS += -lopenblas
 endif
 ifdef LLAMA_CUBLAS
 	CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include
-	CXXFLAGS += -DGGML_USE_CUBLAS
 	LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64
 	OBJS += ggml-cuda.o
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
diff --git a/examples/common.h b/examples/common.h
index 71fc75113..d20aacd02 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -2,6 +2,7 @@

 #pragma once

+#include "ggml.h"
 #include "llama.h"
 #include <string>

@@ -20,11 +21,7 @@ struct gpt_params {
     int32_t repeat_last_n = 64;  // last n tokens to penalize
     int32_t n_parts       = -1;  // amount of model parts (-1 = determine from model dimensions)
     int32_t n_ctx         = 512; // context size
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined (GGML_USE_CUBLAS)
-    int32_t n_batch = 512;  // batch size for prompt processing (must be >=32 to use BLAS)
-#else
-    int32_t n_batch = 8;  // batch size for prompt processing
-#endif
+    int32_t n_batch = ggml_cpu_has_blas() ? 512 : 8; // batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep        = 0;   // number of tokens to keep from initial prompt

 // sampling parameters