Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile
Allowed values are 1 or 2. 2 gives the best performance on modern GPUs and is set as default. On older GPUs 1 may work better.
This commit is contained in:
parent
3edee085ea
commit
31b20758c8
3 changed files with 8 additions and 1 deletions
|
@ -70,6 +70,7 @@ set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
|
|||
option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
|
||||
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
|
||||
set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
|
||||
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
|
||||
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
|
||||
option(LLAMA_METAL "llama: use Metal" OFF)
|
||||
option(LLAMA_K_QUANTS "llama: use k-quants" ON)
|
||||
|
@ -190,6 +191,7 @@ if (LLAMA_CUBLAS)
|
|||
add_compile_definitions(GGML_USE_CUBLAS)
|
||||
add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
|
||||
add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y})
|
||||
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
|
||||
|
||||
if (LLAMA_STATIC)
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
|
||||
|
|
5
Makefile
5
Makefile
|
@ -171,6 +171,11 @@ ifdef LLAMA_CUDA_DMMV_Y
|
|||
else
|
||||
NVCCFLAGS += -DGGML_CUDA_DMMV_Y=1
|
||||
endif # LLAMA_CUDA_DMMV_Y
|
||||
# K_QUANTS_PER_ITERATION: take the user's LLAMA_CUDA_KQUANTS_ITER if set (allowed values: 1 or 2), otherwise default to 2.
ifdef LLAMA_CUDA_KQUANTS_ITER
|
||||
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
|
||||
else
|
||||
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
|
||||
endif # LLAMA_CUDA_KQUANTS_ITER
|
||||
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
|
||||
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
|
||||
endif # LLAMA_CUBLAS
|
||||
|
|
|
@ -168,7 +168,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
|
|||
#endif
|
||||
|
||||
#ifndef K_QUANTS_PER_ITERATION
|
||||
#define K_QUANTS_PER_ITERATION 1
|
||||
#define K_QUANTS_PER_ITERATION 2
|
||||
#endif
|
||||
|
||||
static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue