From ddaa4f2a264820b4379baeba0318c5e89951e1a5 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Fri, 7 Jul 2023 22:14:14 +0800
Subject: [PATCH] fix cuda garbage results and gpu selection issues

---
 CMakeLists.txt | 13 +++++++++++++
 Makefile       |  4 +++-
 koboldcpp.py   | 10 ++++++----
 3 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index af4b4823f..9d5ae48e3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -44,6 +44,7 @@ endif()
 option(LLAMA_CUBLAS "llama: use cuBLAS" ON)
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
+set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
 option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
 set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
 option(LLAMA_K_QUANTS "llama: use k-quants" ON)
@@ -76,8 +77,11 @@ if (LLAMA_CUBLAS)
     set(GGML_V2_LEGACY_CUDA_SOURCES otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h)
 
     add_compile_definitions(GGML_USE_CUBLAS)
+    add_compile_definitions(GGML_CUDA_FORCE_DMMV) #non dmmv broken for me
+
     add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
     add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y})
+    add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
     if (LLAMA_CUDA_DMMV_F16)
         add_compile_definitions(GGML_CUDA_DMMV_F16)
     endif()
@@ -89,6 +93,15 @@ if (LLAMA_CUBLAS)
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
     endif()
 
+    if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+        if (LLAMA_CUDA_DMMV_F16)
+            set(CMAKE_CUDA_ARCHITECTURES "61") # needed for f16 CUDA intrinsics
+        else()
+            set(CMAKE_CUDA_ARCHITECTURES "52;61") # lowest CUDA 12 standard + lowest for integer intrinsics
+        endif()
+    endif()
+    message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
+
 else()
     message(WARNING "cuBLAS not found")
 endif()
diff --git a/Makefile b/Makefile
index e1c3869a2..21fcce4e1 100644
--- a/Makefile
+++ b/Makefile
@@ -144,16 +144,18 @@ ifdef LLAMA_CUBLAS
 	CUBLASLD_FLAGS = -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
 	CUBLAS_OBJS = ggml-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o
 	NVCC = nvcc
-	NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
+	NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_FORCE_DMMV
 ifdef LLAMA_CUDA_DMMV_X
 	NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
 else
 	NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
 endif # LLAMA_CUDA_DMMV_X
 ifdef LLAMA_CUDA_DMMV_Y
+	NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
 	NVCCFLAGS += -DGGML_CUDA_DMMV_Y=$(LLAMA_CUDA_DMMV_Y)
 else
 	NVCCFLAGS += -DGGML_CUDA_DMMV_Y=1
+	NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
 endif # LLAMA_CUDA_DMMV_Y
 ifdef LLAMA_CUDA_DMMV_F16
 	NVCCFLAGS += -DGGML_CUDA_DMMV_F16
diff --git a/koboldcpp.py b/koboldcpp.py
index b46bb991c..53ffe41c3 100755
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -191,10 +191,12 @@ def load_model(model_filename):
         clblastids = 100 + int(args.useclblast[0])*10 + int(args.useclblast[1])
     inputs.clblast_info = clblastids
     inputs.cublas_info = 0
-    if (args.usecublas and "1" in args.usecublas):
-        inputs.cublas_info = 1
+    if (args.usecublas and "0" in args.usecublas):
+        os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+    elif (args.usecublas and "1" in args.usecublas):
+        os.environ["CUDA_VISIBLE_DEVICES"] = "1"
     elif (args.usecublas and "2" in args.usecublas):
-        inputs.cublas_info = 2
+        os.environ["CUDA_VISIBLE_DEVICES"] = "2"
     inputs.executable_path = (getdirpath()+"/").encode("UTF-8")
     inputs.debugmode = args.debugmode
     banned_tokens = args.bantokens
@@ -267,7 +269,7 @@ maxhordectx = 1024
 maxhordelen = 256
 modelbusy = False
 defaultport = 5001
-KcppVersion = "1.34"
+KcppVersion = "1.34.2"
 showdebug = True
 
 class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
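
Note on the koboldcpp.py hunk: instead of passing a device index (cublas_info) into the backend library, the fix masks GPUs via CUDA_VISIBLE_DEVICES before the backend loads. This relies on the CUDA runtime reading that variable only once, at first initialization, so the environment must be set before the CUDA-linked library is loaded. A minimal standalone sketch of the same masking approach, under that assumption (select_gpu and the library path are illustrative names, not part of this patch):

    import ctypes
    import os

    def select_gpu(index: int) -> None:
        # Must run before any CUDA-linked library is loaded: the CUDA
        # runtime snapshots CUDA_VISIBLE_DEVICES at initialization and
        # ignores later changes within the same process.
        os.environ["CUDA_VISIBLE_DEVICES"] = str(index)

    select_gpu(1)  # expose only physical GPU 1 to this process
    # handle = ctypes.CDLL("./koboldcpp_cublas.so")  # illustrative path; load only after masking
    # Inside the masked process the surviving GPU is renumbered as device 0,
    # which is why the backend no longer needs a cublas_info index at all.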