cuBLAS: use host pinned memory and dequantize while copying (#1207)
* cuBLAS: dequantize simultaneously while copying memory * cuBLAS: use host pinned memory * cuBLAS: improve ggml_compute_forward_mul_mat_f16_f32 with pinned memory * cuBLAS: also pin kv cache * fix rebase
This commit is contained in:
parent
b1ee8f59b4
commit
7fc50c051a
6 changed files with 110 additions and 54 deletions
5
Makefile
5
Makefile
|
@ -106,6 +106,7 @@ ifdef LLAMA_OPENBLAS
|
|||
endif
|
||||
ifdef LLAMA_CUBLAS
|
||||
CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
|
||||
CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
|
||||
LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
|
||||
OBJS += ggml-cuda.o
|
||||
NVCC = nvcc
|
||||
|
@ -164,10 +165,10 @@ $(info )
|
|||
# Build library
|
||||
#
|
||||
|
||||
ggml.o: ggml.c ggml.h
|
||||
ggml.o: ggml.c ggml.h ggml-cuda.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
llama.o: llama.cpp ggml.h llama.h llama_util.h
|
||||
llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama_util.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
common.o: examples/common.cpp examples/common.h
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue