alternate implementation of setting different n_batch for BLAS
parent c6dfc44a37
commit 94cb00a3cf

2 changed files with 2 additions and 8 deletions
Makefile (3 changes)
examples/common.h (7 changes)

Makefile
@@ -93,18 +93,15 @@ ifndef LLAMA_NO_ACCELERATE
 	# `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time).
 	ifeq ($(UNAME_S),Darwin)
 		CFLAGS   += -DGGML_USE_ACCELERATE
-		CXXFLAGS += -DGGML_USE_ACCELERATE
 		LDFLAGS  += -framework Accelerate
 	endif
 endif
 ifdef LLAMA_OPENBLAS
 	CFLAGS   += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
-	CXXFLAGS += -DGGML_USE_OPENBLAS
 	LDFLAGS  += -lopenblas
 endif
 ifdef LLAMA_CUBLAS
 	CFLAGS    += -DGGML_USE_CUBLAS -I/usr/local/cuda/include
-	CXXFLAGS  += -DGGML_USE_CUBLAS
 	LDFLAGS   += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64
 	OBJS      += ggml-cuda.o
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
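With the CXXFLAGS defines dropped, C++ translation units no longer see the GGML_USE_* macros; only ggml.c is compiled with them (via CFLAGS), and C++ code instead asks ggml at runtime. A minimal sketch of how such a runtime query can work, folding the compile-time defines into a function call (a sketch, not necessarily the verbatim ggml source):

// compiled as part of ggml, with the CFLAGS defines from the Makefile above
int ggml_cpu_has_blas(void) {
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
    return 1;  // this build of ggml uses a BLAS backend
#else
    return 0;  // plain CPU build
#endif
}

A BLAS build is still selected the usual way at build time, e.g. `make LLAMA_OPENBLAS=1` or `make LLAMA_CUBLAS=1`; the difference is that callers outside ggml.c no longer need the define to be visible.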
examples/common.h

@@ -2,6 +2,7 @@
 
 #pragma once
 
+#include "ggml.h"
 #include "llama.h"
 
 #include <string>

@@ -20,11 +21,7 @@ struct gpt_params {
     int32_t repeat_last_n = 64;   // last n tokens to penalize
     int32_t n_parts       = -1;   // amount of model parts (-1 = determine from model dimensions)
     int32_t n_ctx         = 512;  // context size
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined (GGML_USE_CUBLAS)
-    int32_t n_batch       = 512;  // batch size for prompt processing (must be >=32 to use BLAS)
-#else
-    int32_t n_batch       = 8;    // batch size for prompt processing
-#endif
+    int32_t n_batch       = ggml_cpu_has_blas() ? 512 : 8;  // batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep        = 0;    // number of tokens to keep from initial prompt
 
     // sampling parameters
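The net effect is that the default batch size is decided once at startup rather than baked in by the preprocessor. A self-contained sketch of the behavior, using a hypothetical reduced struct (the real gpt_params carries many more fields):

#include <cstdint>
#include <cstdio>

#include "ggml.h"  // for ggml_cpu_has_blas()

// hypothetical reduced mirror of gpt_params, just to show the new default
struct params_sketch {
    int32_t n_batch = ggml_cpu_has_blas() ? 512 : 8;
};

int main() {
    params_sketch p;
    std::printf("BLAS %s -> default n_batch = %d\n",
                ggml_cpu_has_blas() ? "enabled" : "disabled", p.n_batch);
    return 0;
}

A build linked against Accelerate, OpenBLAS, or cuBLAS defaults to 512, a plain CPU build to 8, matching the old #if/#else defaults while keeping common.h free of GGML_USE_* checks.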