alternate implementation of setting different n_batch for BLAS

eiery 2023-04-20 20:57:16 -04:00
parent c6dfc44a37
commit 94cb00a3cf
2 changed files with 2 additions and 8 deletions
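In short: the parent commit (c6dfc44a37) picked the n_batch default with a compile-time #if over the GGML_USE_ACCELERATE / GGML_USE_OPENBLAS / GGML_USE_CUBLAS macros, which meant the same -D defines had to be carried in CXXFLAGS so that common.h, a C++ header, could see them. This alternate implementation asks ggml at runtime via ggml_cpu_has_blas() instead: common.h gains #include "ggml.h" for the declaration, the #if block collapses to a single ternary, and the extra CXXFLAGS defines are dropped from the Makefile. The two diffs below are the two halves of that swap.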

Makefile

@@ -93,18 +93,15 @@ ifndef LLAMA_NO_ACCELERATE
 # `-framework Accelerate` works on Mac Intel as well, with negligible performance boost (as of the predict time).
 ifeq ($(UNAME_S),Darwin)
 	CFLAGS += -DGGML_USE_ACCELERATE
-	CXXFLAGS += -DGGML_USE_ACCELERATE
 	LDFLAGS += -framework Accelerate
 endif
 endif
 ifdef LLAMA_OPENBLAS
 	CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
-	CXXFLAGS += -DGGML_USE_OPENBLAS
 	LDFLAGS += -lopenblas
 endif
 ifdef LLAMA_CUBLAS
 	CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include
-	CXXFLAGS += -DGGML_USE_CUBLAS
 	LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64
 	OBJS += ggml-cuda.o
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
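Why the three CXXFLAGS lines can go: the preprocessor test now happens once, inside ggml's own translation unit, which is already built with the CFLAGS defines; the C++ side just calls a function. The sketch below illustrates the pattern, assuming ggml_cpu_has_blas() in ggml.c looked roughly like this at the time (the function and the macros are real; the standalone scaffolding is illustrative only):

#include <cstdio>

// In the real tree this function lives in ggml.c (compiled with the -D
// defines from CFLAGS) and is declared in ggml.h; it is reproduced here
// only so the sketch compiles on its own.
int ggml_cpu_has_blas(void) {
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
    return 1; // built against a BLAS backend
#else
    return 0; // plain CPU build
#endif
}

int main() {
    // C++ callers never need to see the GGML_USE_* macros themselves.
    std::printf("BLAS available: %s\n", ggml_cpu_has_blas() ? "yes" : "no");
}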

examples/common.h

@@ -2,6 +2,7 @@
 #pragma once
 
+#include "ggml.h"
 #include "llama.h"
 
 #include <string>
@@ -20,11 +21,7 @@ struct gpt_params {
     int32_t repeat_last_n = 64;  // last n tokens to penalize
     int32_t n_parts       = -1;  // amount of model parts (-1 = determine from model dimensions)
     int32_t n_ctx         = 512; // context size
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined (GGML_USE_CUBLAS)
-    int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
-#else
-    int32_t n_batch = 8;   // batch size for prompt processing
-#endif
+    int32_t n_batch = ggml_cpu_has_blas() ? 512 : 8; // batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep        = 0;   // number of tokens to keep from initial prompt
 
     // sampling parameters
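Put together, the default is now decided when the program starts rather than when it is compiled. A minimal standalone sketch of the resulting behavior (gpt_params is trimmed to the one field this commit touches; assumes ggml.h is on the include path and the binary is linked against ggml):

#include <cstdint>
#include <cstdio>

#include "ggml.h"

struct gpt_params {
    // 512 when ggml reports a BLAS backend, 8 otherwise; as the comment in
    // the diff notes, BLAS only kicks in for batches of >= 32 tokens.
    int32_t n_batch = ggml_cpu_has_blas() ? 512 : 8;
};

int main() {
    gpt_params params;
    std::printf("default n_batch = %d\n", params.n_batch);
}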