cuBLAS: use host pinned memory and dequantize while copying (#1207)

* cuBLAS: dequantize simultaneously while copying memory

* cuBLAS: use host pinned memory

* cuBLAS: improve ggml_compute_forward_mul_mat_f16_f32 with pinned memory

* cuBLAS: also pin kv cache

* fix rebase
This commit is contained in:
slaren 2023-04-29 02:04:18 +02:00 committed by GitHub
parent b1ee8f59b4
commit 7fc50c051a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 110 additions and 54 deletions

View file

@ -106,6 +106,7 @@ ifdef LLAMA_OPENBLAS
endif
ifdef LLAMA_CUBLAS
CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
OBJS += ggml-cuda.o
NVCC = nvcc
@ -164,10 +165,10 @@ $(info )
# Build library
#
ggml.o: ggml.c ggml.h
ggml.o: ggml.c ggml.h ggml-cuda.h
$(CC) $(CFLAGS) -c $< -o $@
llama.o: llama.cpp ggml.h llama.h llama_util.h
llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama_util.h
$(CXX) $(CXXFLAGS) -c $< -o $@
common.o: examples/common.cpp examples/common.h