diff --git a/Makefile b/Makefile index 41600b5d5..dea93d83e 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -default: koboldcpp koboldcpp_noavx2 koboldcpp_openblas koboldcpp_openblas_noavx2 koboldcpp_clblast +default: koboldcpp koboldcpp_noavx2 koboldcpp_openblas koboldcpp_openblas_noavx2 koboldcpp_clblast simple: koboldcpp koboldcpp_noavx2 dev: koboldcpp_openblas @@ -15,9 +15,13 @@ ifndef UNAME_M UNAME_M := $(shell uname -m) endif -ifndef ARCH_LINUX -ARCH_LINUX := $(shell grep "Arch Linux" /etc/os-release 2>/dev/null) -ARCH_LIKE := $(shell grep "ID_LIKE=arch" /etc/os-release 2>/dev/null) +ARCH_LINUX1 := $(shell grep "Arch Linux" /etc/os-release 2>/dev/null) +ARCH_LINUX2 := $(shell grep "ID_LIKE=arch" /etc/os-release 2>/dev/null) +ifdef ARCH_LINUX1 +ARCH_ADD = -lcblas +endif +ifdef ARCH_LINUX2 +ARCH_ADD = -lcblas endif CCV := $(shell $(CC) --version | head -n 1) @@ -51,6 +55,7 @@ BONUSCFLAGS2 = OPENBLAS_FLAGS = -DGGML_USE_OPENBLAS -I/usr/local/include/openblas CLBLAST_FLAGS = -DGGML_USE_CLBLAST -DGGML_USE_OPENBLAS -I/usr/local/include/openblas +CUBLAS_FLAGS = -DGGML_USE_CUBLAS -I/usr/local/cuda/include #lets try enabling everything CFLAGS += -pthread -s @@ -152,6 +157,7 @@ NOAVX2_BUILD = OPENBLAS_BUILD = OPENBLAS_NOAVX2_BUILD = CLBLAST_BUILD = +CUBLAS_BUILD = ifeq ($(OS),Windows_NT) DEFAULT_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.dll $(LDFLAGS) @@ -159,22 +165,28 @@ ifeq ($(OS),Windows_NT) OPENBLAS_BUILD = $(CXX) $(CXXFLAGS) $^ lib/libopenblas.lib -shared -o $@.dll $(LDFLAGS) OPENBLAS_NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ lib/libopenblas.lib -shared -o $@.dll $(LDFLAGS) CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ lib/OpenCL.lib lib/clblast.lib -shared -o $@.dll $(LDFLAGS) + CUBLAS_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.dll $(LDFLAGS) else DEFAULT_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.so $(LDFLAGS) NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.so $(LDFLAGS) ifdef LLAMA_OPENBLAS - OPENBLAS_BUILD = $(CXX) $(CXXFLAGS) $^ -lcblas -lopenblas -shared -o $@.so $(LDFLAGS) - OPENBLAS_NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ -lcblas -lopenblas -shared -o $@.so $(LDFLAGS) + OPENBLAS_BUILD = $(CXX) $(CXXFLAGS) $^ $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS) + OPENBLAS_NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS) endif ifdef LLAMA_CLBLAST - CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -lOpenCL -lcblas -lopenblas -shared -o $@.so $(LDFLAGS) + CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -lOpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS) + endif + ifdef LLAMA_CUBLAS + CUBLAS_BUILD = $(CXX) $(CXXFLAGS) $^ -lcublas_static -lculibos -lcudart_static -lcublasLt_static -lpthread -ldl -L/usr/local/cuda/lib64 -shared -o $@.so $(LDFLAGS) endif ifndef LLAMA_OPENBLAS ifndef LLAMA_CLBLAST + ifndef LLAMA_CUBLAS OPENBLAS_BUILD = @echo 'Your OS $(OS) does not appear to be Windows. For faster speeds, install and link a BLAS library. Set LLAMA_OPENBLAS=1 to compile with OpenBLAS support or LLAMA_CLBLAST=1 to compile with ClBlast support. This is just a reminder, not an error.' endif endif + endif endif # @@ -211,6 +223,9 @@ ggml_openblas_noavx2.o: ggml.c ggml.h ggml_clblast.o: ggml.c ggml.h $(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) $(CLBLAST_FLAGS) -c $< -o $@ +ggml_cublas.o: ggml.c ggml.h + $(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) $(CUBLAS_FLAGS) -c $< -o $@ + ggml_v1.o: otherarch/ggml_v1.c otherarch/ggml_v1.h $(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) -c $< -o $@ @@ -258,6 +273,9 @@ koboldcpp_openblas_noavx2: ggml_openblas_noavx2.o ggml_rwkv.o ggml_v1_noavx2.o e koboldcpp_clblast: ggml_clblast.o ggml_rwkv.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o $(CLBLAST_BUILD) + +koboldcpp_cublas: ggml_cublas.o ggml_rwkv.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o + $(CUBLAS_BUILD) quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) diff --git a/koboldcpp.py b/koboldcpp.py index 812141d22..94e674763 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -372,7 +372,7 @@ def main(args): if args.noavx2: use_noavx2 = True - if not os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), lib_openblas_noavx2)) or not (os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), "libopenblas.dll")) and os.name=='nt'): + if not os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), lib_openblas_noavx2)) or (os.name=='nt' and not os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), "libopenblas.dll"))): print("Warning: OpenBLAS library file not found. Non-BLAS library will be used.") elif args.noblas: print("Attempting to use non-avx2 compatibility library without OpenBLAS.") @@ -380,13 +380,13 @@ def main(args): use_blas = True print("Attempting to use non-avx2 compatibility library with OpenBLAS. A compatible libopenblas will be required.") elif args.useclblast: - if not os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), lib_clblast)) or not (os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), "clblast.dll")) and os.name=='nt'): + if not os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), lib_clblast)) or (os.name=='nt' and not os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), "clblast.dll"))): print("Warning: CLBlast library file not found. Non-BLAS library will be used.") else: print("Attempting to use CLBlast library for faster prompt ingestion. A compatible clblast will be required.") use_clblast = True else: - if not os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), lib_openblas)) or not (os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), "libopenblas.dll")) and os.name=='nt'): + if not os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), lib_openblas)) or (os.name=='nt' and not os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), "libopenblas.dll"))): print("Warning: OpenBLAS library file not found. Non-BLAS library will be used.") elif args.noblas: print("Attempting to library without OpenBLAS.") diff --git a/otherarch/gpt2_v2.cpp b/otherarch/gpt2_v2.cpp index 6a9cf5205..f513402e3 100644 --- a/otherarch/gpt2_v2.cpp +++ b/otherarch/gpt2_v2.cpp @@ -371,7 +371,7 @@ bool gpt2_eval( const int n_vocab = hparams.n_vocab; //todo: there is a bug that causes the buffer to oom and I cannot figure it out, hack to increase size for now - static size_t buf_size = 512u*1024*1024; + static size_t buf_size = 256u*1024*1024; static void * buf = malloc(buf_size); if (mem_per_token > 0 && mem_per_token*N*1.6 > buf_size) { diff --git a/otherarch/gptj_v2.cpp b/otherarch/gptj_v2.cpp index 06752ff9a..307ee5a4e 100644 --- a/otherarch/gptj_v2.cpp +++ b/otherarch/gptj_v2.cpp @@ -382,7 +382,7 @@ bool gptj_eval( const int d_key = n_embd/n_head; //todo: there is a bug that causes the buffer to oom and I cannot figure it out, hack to increase size for now - static size_t buf_size = 512u*1024*1024; + static size_t buf_size = 256u*1024*1024; static void * buf = malloc(buf_size); if (mem_per_token > 0 && mem_per_token*N*1.4 > buf_size) {