From b084f4dc46850f8b14ded3812a2b412a8d3f869f Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Wed, 28 Jun 2023 21:16:40 +0800 Subject: [PATCH] option for cublas --- CMakeLists.txt | 4 ++-- Makefile | 46 +++++++++++++++++++++++++++------------------- koboldcpp.py | 29 +++++++++++++++++++++-------- 3 files changed, 50 insertions(+), 29 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index be50a7560..653e2613c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -289,12 +289,12 @@ if (GGML_CUDA_SOURCES) set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto") endif() -set(TARGET koboldcpp) +set(TARGET koboldcpp_cublas) add_library(${TARGET} SHARED expose.cpp expose.h) target_include_directories(${TARGET} PUBLIC . ./otherarch ./otherarch/tools ./examples) target_compile_features(${TARGET} PUBLIC cxx_std_11) # don't bump set_target_properties(${TARGET} PROPERTIES PREFIX "") -set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp") +set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_cublas") set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) target_link_libraries(${TARGET} PUBLIC ggml ggml_v1 ggml_v2 common2 gpttype_adapter ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/Makefile b/Makefile index ab6a55e3f..ddd85a955 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -default: koboldcpp koboldcpp_failsafe koboldcpp_openblas koboldcpp_openblas_noavx2 koboldcpp_clblast +default: koboldcpp koboldcpp_failsafe koboldcpp_openblas koboldcpp_openblas_noavx2 koboldcpp_clblast koboldcpp_cublas tools: quantize_gpt2 quantize_gptj quantize_llama quantize_neox quantize_mpt dev: koboldcpp_openblas dev2: koboldcpp_clblast @@ -53,6 +53,9 @@ NONECFLAGS = OPENBLAS_FLAGS = -DGGML_USE_OPENBLAS -I/usr/local/include/openblas CLBLAST_FLAGS = -DGGML_USE_CLBLAST FAILSAFE_FLAGS = -DUSE_FAILSAFE +CUBLAS_FLAGS = +CUBLASLD_FLAGS = +CUBLAS_OBJS = #lets try enabling everything CFLAGS += -pthread -s @@ -133,10 +136,9 @@ endif # it is recommended to use the CMAKE file to build for cublas if you can - will likely work better ifdef LLAMA_CUBLAS - CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include - CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include - LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib - OBJS += ggml-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o + CUBLAS_FLAGS = -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include + CUBLASLD_FLAGS = -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib + CUBLAS_OBJS = ggml-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o NVCC = nvcc NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native ifdef LLAMA_CUDA_DMMV_X @@ -158,11 +160,11 @@ else NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2 endif ggml-cuda.o: ggml-cuda.cu ggml-cuda.h - $(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@ + $(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@ ggml_v2-cuda.o: otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h - $(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@ + $(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@ ggml_v2-cuda-legacy.o: otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h - $(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@ + $(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@ endif # LLAMA_CUBLAS ifdef LLAMA_METAL @@ -199,7 +201,7 @@ FAILSAFE_BUILD = OPENBLAS_BUILD = OPENBLAS_NOAVX2_BUILD = CLBLAST_BUILD = -CLBLAST_NOAVX2_BUILD = +CUBLAS_BUILD = ifeq ($(OS),Windows_NT) DEFAULT_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.dll $(LDFLAGS) @@ -207,7 +209,6 @@ ifeq ($(OS),Windows_NT) OPENBLAS_BUILD = $(CXX) $(CXXFLAGS) $^ lib/libopenblas.lib -shared -o $@.dll $(LDFLAGS) OPENBLAS_NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ lib/libopenblas.lib -shared -o $@.dll $(LDFLAGS) CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ lib/OpenCL.lib lib/clblast.lib -shared -o $@.dll $(LDFLAGS) - CLBLAST_NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ lib/OpenCL.lib lib/clblast.lib -shared -o $@.dll $(LDFLAGS) else DEFAULT_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.so $(LDFLAGS) FAILSAFE_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.so $(LDFLAGS) @@ -218,18 +219,23 @@ else ifdef LLAMA_CLBLAST ifeq ($(UNAME_S),Darwin) CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -framework OpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS) - CLBLAST_NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -framework OpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS) else CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -lOpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS) - CLBLAST_NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -lOpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS) endif endif ifndef LLAMA_OPENBLAS ifndef LLAMA_CLBLAST + ifndef LLAMA_CUBLAS OPENBLAS_BUILD = @echo 'Your OS $(OS) does not appear to be Windows. For faster speeds, install and link a BLAS library. Set LLAMA_OPENBLAS=1 to compile with OpenBLAS support or LLAMA_CLBLAST=1 to compile with ClBlast support. This is just a reminder, not an error.' endif endif + endif +endif + +#both windows and non windows +ifdef LLAMA_CUBLAS +CUBLAS_BUILD = $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $^ -shared -o $@.dll $(CUBLASLD_FLAGS) $(LDFLAGS) endif # @@ -261,8 +267,8 @@ ggml_openblas_noavx2.o: ggml.c ggml.h $(CC) $(CFLAGS) $(SIMPLECFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@ ggml_clblast.o: ggml.c ggml.h $(CC) $(CFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ -ggml_clblast_noavx2.o: ggml.c ggml.h - $(CC) $(CFLAGS) $(SIMPLECFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ +ggml_cublas.o: ggml.c ggml.h + $(CC) $(CFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) -c $< -o $@ #quants K k_quants.o: k_quants.c k_quants.h ggml.h ggml-cuda.h @@ -283,8 +289,8 @@ ggml_v2_openblas_noavx2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h $(CC) $(CFLAGS) $(SIMPLECFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@ ggml_v2_clblast.o: otherarch/ggml_v2.c otherarch/ggml_v2.h $(CC) $(CFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ -ggml_v2_clblast_noavx2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h - $(CC) $(CFLAGS) $(SIMPLECFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ +ggml_v2_cublas.o: otherarch/ggml_v2.c otherarch/ggml_v2.h + $(CC) $(CFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) -c $< -o $@ #extreme old version compat ggml_v1.o: otherarch/ggml_v1.c otherarch/ggml_v1.h @@ -313,9 +319,11 @@ gpttype_adapter.o: gpttype_adapter.cpp $(CXX) $(CXXFLAGS) -c $< -o $@ gpttype_adapter_clblast.o: gpttype_adapter.cpp $(CXX) $(CXXFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ +gpttype_adapter_cublas.o: gpttype_adapter.cpp + $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) -c $< -o $@ clean: - rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_openblas_noavx2.dll koboldcpp_clblast.dll koboldcpp_clblast_noavx2.dll koboldcpp.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_openblas_noavx2.so koboldcpp_clblast.so koboldcpp_clblast_noavx2.so + rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_openblas_noavx2.dll koboldcpp_clblast.dll koboldcpp_cublas.dll koboldcpp.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_openblas_noavx2.so koboldcpp_clblast.so koboldcpp_cublas.so main: examples/main/main.cpp build-info.h ggml.o k_quants.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) @@ -334,8 +342,8 @@ koboldcpp_openblas_noavx2: ggml_openblas_noavx2.o ggml_v2_openblas_noavx2.o ggml $(OPENBLAS_NOAVX2_BUILD) koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o k_quants.o $(OBJS) $(CLBLAST_BUILD) -koboldcpp_clblast_noavx2: ggml_clblast_noavx2.o ggml_v2_clblast_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o k_quants_noavx2.o $(OBJS) - $(CLBLAST_NOAVX2_BUILD) +koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o k_quants.o $(CUBLAS_OBJS) $(OBJS) + $(CUBLAS_BUILD) quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o k_quants.o $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) diff --git a/koboldcpp.py b/koboldcpp.py index 01da5e817..02d97f38d 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -77,15 +77,17 @@ lib_failsafe = pick_existant_file("koboldcpp_failsafe.dll","koboldcpp_failsafe.s lib_openblas = pick_existant_file("koboldcpp_openblas.dll","koboldcpp_openblas.so") lib_openblas_noavx2 = pick_existant_file("koboldcpp_openblas_noavx2.dll","koboldcpp_openblas_noavx2.so") lib_clblast = pick_existant_file("koboldcpp_clblast.dll","koboldcpp_clblast.so") +lib_cublas = pick_existant_file("koboldcpp_cublas.dll","koboldcpp_cublas.so") def init_library(): global handle - global lib_default,lib_failsafe,lib_openblas,lib_openblas_noavx2,lib_clblast + global lib_default,lib_failsafe,lib_openblas,lib_openblas_noavx2,lib_clblast,lib_cublas libname = "" use_blas = False # if true, uses OpenBLAS for acceleration. libopenblas.dll must exist in the same dir. use_clblast = False #uses CLBlast instead + use_cublas = False #uses cublas instead use_noavx2 = False #uses openblas with no avx2 instructions if args.noavx2: @@ -103,6 +105,12 @@ def init_library(): else: print("Attempting to use CLBlast library for faster prompt ingestion. A compatible clblast will be required.") use_clblast = True + elif args.usecublas: + if not file_exists(lib_cublas): + print("Warning: CuBLAS library file not found. Non-BLAS library will be used.") + else: + print("Attempting to use CuBLAS library for faster prompt ingestion. A compatible CuBLAS will be required.") + use_cublas = True else: if not file_exists(lib_openblas) or (os.name=='nt' and not file_exists("libopenblas.dll")): print("Warning: OpenBLAS library file not found. Non-BLAS library will be used.") @@ -122,6 +130,8 @@ def init_library(): else: if use_clblast: libname = lib_clblast + if use_cublas: + libname = lib_cublas elif use_blas: libname = lib_openblas else: @@ -581,13 +591,13 @@ def show_gui(): blaschoice = tk.StringVar() blaschoice.set("BLAS = 512") - runopts = ["Use OpenBLAS","Use CLBLast GPU #1","Use CLBLast GPU #2","Use CLBLast GPU #3","Use No BLAS","Use OpenBLAS (Old CPU, noavx2)","Failsafe Mode (Old CPU, noavx)"] + runopts = ["Use OpenBLAS","Use CLBLast GPU #1","Use CLBLast GPU #2","Use CLBLast GPU #3","Use CuBLAS GPU","Use No BLAS","Use OpenBLAS (Old CPU, noavx2)","Failsafe Mode (Old CPU, noavx)"] runchoice = tk.StringVar() runchoice.set("Use OpenBLAS") def onDropdownChange(event): sel = runchoice.get() - if sel==runopts[1] or sel==runopts[2] or sel==runopts[3]: + if sel==runopts[1] or sel==runopts[2] or sel==runopts[3] or sel==runopts[4]: frameC.grid(row=4,column=0,pady=4) else: frameC.grid_forget() @@ -609,7 +619,7 @@ def show_gui(): frameC = tk.Frame(root) gpu_layers_var=tk.StringVar() gpu_layers_var.set("0") - gpu_lbl = tk.Label(frameC, text = 'GPU Layers (CLBlast only): ', font=('calibre',10, 'bold')) + gpu_lbl = tk.Label(frameC, text = 'GPU Layers: ', font=('calibre',10, 'bold')) gpu_layers_input = tk.Entry(frameC,textvariable = gpu_layers_var, font=('calibre',10,'normal')) gpu_lbl.grid(row=0,column=0) gpu_layers_input.grid(row=0,column=1) @@ -663,11 +673,13 @@ def show_gui(): if selrunchoice==runopts[3]: args.useclblast = [0,1] if selrunchoice==runopts[4]: - args.noblas = True + args.usecublas = True if selrunchoice==runopts[5]: - args.noavx2 = True + args.noblas = True if selrunchoice==runopts[6]: args.noavx2 = True + if selrunchoice==runopts[7]: + args.noavx2 = True args.noblas = True args.nommap = True print("[Failsafe Mode : mmap is disabled.]") @@ -861,7 +873,8 @@ if __name__ == '__main__': parser.add_argument("--hordeconfig", help="Sets the display model name to something else, for easy use on AI Horde. Optional additional parameters set the horde max genlength and max ctxlen.",metavar=('[hordename]', '[hordelength] [hordectx]'), nargs='+') compatgroup = parser.add_mutually_exclusive_group() compatgroup.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true') - compatgroup.add_argument("--useclblast", help="Use CLBlast instead of OpenBLAS for prompt ingestion. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2) - parser.add_argument("--gpulayers", help="Set number of layers to offload to GPU when using CLBlast. Requires CLBlast.",metavar=('[GPU layers]'), type=int, default=0) + compatgroup.add_argument("--useclblast", help="Use CLBlast for GPU Acceleration. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2) + compatgroup.add_argument("--usecublas", help="Use CuBLAS for GPU Acceleration. Requires Nvidia GPU.", action='store_true') + parser.add_argument("--gpulayers", help="Set number of layers to offload to GPU when using GPU. Requires GPU.",metavar=('[GPU layers]'), type=int, default=0) args = parser.parse_args() main(args)