diff --git a/.devops/llama-cpp-cublas.srpm.spec b/.devops/llama-cpp-cublas.srpm.spec
index f847ebb1e..6cd6e4c72 100644
--- a/.devops/llama-cpp-cublas.srpm.spec
+++ b/.devops/llama-cpp-cublas.srpm.spec
@@ -32,7 +32,7 @@ CPU inference for Meta's Lllama2 models using default options.
 %setup -n llama.cpp-master

 %build
-make -j LLAMA_CUBLAS=1
+make -j LLAMA_CUDA=1

 %install
 mkdir -p %{buildroot}%{_bindir}/
diff --git a/.devops/main-cuda.Dockerfile b/.devops/main-cuda.Dockerfile
index 2b7faf7c1..b937a4829 100644
--- a/.devops/main-cuda.Dockerfile
+++ b/.devops/main-cuda.Dockerfile
@@ -20,8 +20,8 @@ COPY . .

 # Set nvcc architecture
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable cuBLAS
-ENV LLAMA_CUBLAS=1
+# Enable CUDA
+ENV LLAMA_CUDA=1

 RUN make
diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix
index c7fa2203a..723161029 100644
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -191,7 +191,7 @@ effectiveStdenv.mkDerivation (
       (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
       (cmakeBool "LLAMA_BLAS" useBlas)
       (cmakeBool "LLAMA_CLBLAST" useOpenCL)
-      (cmakeBool "LLAMA_CUBLAS" useCuda)
+      (cmakeBool "LLAMA_CUDA" useCuda)
       (cmakeBool "LLAMA_HIPBLAS" useRocm)
       (cmakeBool "LLAMA_METAL" useMetalKit)
       (cmakeBool "LLAMA_MPI" useMpi)
diff --git a/.devops/server-cuda.Dockerfile b/.devops/server-cuda.Dockerfile
index 4f83904bc..5683a3646 100644
--- a/.devops/server-cuda.Dockerfile
+++ b/.devops/server-cuda.Dockerfile
@@ -20,8 +20,8 @@ COPY . .

 # Set nvcc architecture
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable cuBLAS
-ENV LLAMA_CUBLAS=1
+# Enable CUDA
+ENV LLAMA_CUDA=1

 RUN make
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b25cfd2fc..3f23ba4d3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -89,8 +89,8 @@ endif()
 option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
 option(LLAMA_BLAS "llama: use BLAS" OFF)
 set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
-option(LLAMA_CUBLAS "llama: use CUDA" OFF)
-#option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF)
+option(LLAMA_CUDA "llama: use CUDA" OFF)
+option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF)
 option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
 option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF)
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
@@ -360,11 +360,16 @@ if (LLAMA_QKK_64)
 endif()

 if (LLAMA_CUBLAS)
+    message(WARNING "LLAMA_CUBLAS is deprecated and will be removed in the future.\nUse LLAMA_CUDA instead")
+    set(LLAMA_CUDA ON)
+endif()
+
+if (LLAMA_CUDA)
     cmake_minimum_required(VERSION 3.17)

     find_package(CUDAToolkit)
     if (CUDAToolkit_FOUND)
-        message(STATUS "cuBLAS found")
+        message(STATUS "CUDA found")

         enable_language(CUDA)
@@ -373,7 +378,7 @@ if (LLAMA_CUBLAS)
         file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu")
         list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")

-        add_compile_definitions(GGML_USE_CUBLAS)
+        add_compile_definitions(GGML_USE_CUDA)
         if (LLAMA_CUDA_FORCE_DMMV)
             add_compile_definitions(GGML_CUDA_FORCE_DMMV)
         endif()
@@ -422,7 +427,7 @@ if (LLAMA_CUBLAS)

         message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
     else()
-        message(WARNING "cuBLAS not found")
+        message(WARNING "CUDA not found")
     endif()
 endif()

@@ -525,7 +530,7 @@ if (LLAMA_HIPBLAS)
         file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu")
         list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu")

-        add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
+        add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA)

         if (LLAMA_HIP_UMA)
             add_compile_definitions(GGML_HIP_UMA)
@@ -830,7 +835,7 @@ endif()

 set(CUDA_CXX_FLAGS "")

-if (LLAMA_CUBLAS)
+if (LLAMA_CUDA)
     set(CUDA_FLAGS -use_fast_math)

     if (LLAMA_FATAL_WARNINGS)
@@ -1055,7 +1060,7 @@ endif()
 add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
 add_compile_options("$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")

-if (LLAMA_CUBLAS)
+if (LLAMA_CUDA)
     list(APPEND CUDA_CXX_FLAGS ${ARCH_FLAGS})
     list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED)  # pass host compiler flags as a single argument
     if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
diff --git a/Makefile b/Makefile
index 08eafb1e7..35bdb1eb7 100644
--- a/Makefile
+++ b/Makefile
@@ -390,12 +390,17 @@ ifdef LLAMA_BLIS
 endif # LLAMA_BLIS

 ifdef LLAMA_CUBLAS
+# $(error LLAMA_CUBLAS is deprecated. Use LLAMA_CUDA instead.)
+    LLAMA_CUDA := 1
+endif
+
+ifdef LLAMA_CUDA
 ifneq ('', '$(wildcard /opt/cuda)')
     CUDA_PATH ?= /opt/cuda
 else
     CUDA_PATH ?= /usr/local/cuda
 endif
-    MK_CPPFLAGS += -DGGML_USE_CUBLAS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+    MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
     MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
     OBJS += ggml-cuda.o
     OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
@@ -462,7 +467,7 @@ endif

 ifdef JETSON_EOL_MODULE_DETECT
 define NVCC_COMPILE
-    $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+    $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 endef # NVCC_COMPILE
 else
 define NVCC_COMPILE
@@ -476,7 +481,7 @@ ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/com

 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
     $(NVCC_COMPILE)

-endif # LLAMA_CUBLAS
+endif # LLAMA_CUDA

 ifdef LLAMA_CLBLAST
@@ -533,7 +538,7 @@ ifdef LLAMA_HIPBLAS
     LLAMA_CUDA_DMMV_X ?= 32
     LLAMA_CUDA_MMV_Y ?= 1
     LLAMA_CUDA_KQUANTS_ITER ?= 2
-    MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
+    MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA
 ifdef LLAMA_HIP_UMA
     MK_CPPFLAGS += -DGGML_HIP_UMA
 endif # LLAMA_HIP_UMA
@@ -609,7 +614,7 @@ override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
 override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)

 # identify CUDA host compiler
-ifdef LLAMA_CUBLAS
+ifdef LLAMA_CUDA
 GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
 include scripts/get-flags.mk
 CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
@@ -634,7 +639,7 @@ $(info I NVCCFLAGS: $(NVCCFLAGS))
 $(info I LDFLAGS: $(LDFLAGS))
 $(info I CC: $(shell $(CC) --version | head -n 1))
 $(info I CXX: $(shell $(CXX) --version | head -n 1))
-ifdef LLAMA_CUBLAS
+ifdef LLAMA_CUDA
 $(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
 CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
 ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
@@ -644,9 +649,16 @@ $(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be exp
 endif # CUDA_POWER_ARCH
 endif # CUDA_DOCKER_ARCH
 endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
-endif # LLAMA_CUBLAS
+endif # LLAMA_CUDA
 $(info )

+ifdef LLAMA_CUBLAS
+$(info !!!!)
+$(info LLAMA_CUBLAS is deprecated and will be removed in the future. Use LLAMA_CUDA instead.)
+$(info !!!!)
+$(info )
+endif
+
 #
 # Build library
 #
diff --git a/README.md b/README.md
index f9cf19616..ce678f0c3 100644
--- a/README.md
+++ b/README.md
@@ -448,30 +448,27 @@ Building the program with BLAS support may lead to some performance improvements

   Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.

-- #### cuBLAS
+- #### CUDA

-  This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
+  This provides GPU acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).

   For Jetson user, if you have Jetson Orin, you can try this: [Offical Support](https://www.jetson-ai-lab.com/tutorial_text-generation.html). If you are using an old model(nano/TX2), need some additional operations before compiling.

   - Using `make`:
     ```bash
-    make LLAMA_CUBLAS=1
+    make LLAMA_CUDA=1
     ```
   - Using `CMake`:
     ```bash
     mkdir build
     cd build
-    cmake .. -DLLAMA_CUBLAS=ON
+    cmake .. -DLLAMA_CUDA=ON
     cmake --build . --config Release
     ```

   The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
-
 | Option | Legal values | Default | Description |
 |--------------------------------|------------------------|---------|-------------|
 | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
diff --git a/ci/run.sh b/ci/run.sh
index 51f4c74cc..85acc46d3 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -40,7 +40,7 @@ if [ ! -z ${GG_BUILD_METAL} ]; then
 fi

 if [ ! -z ${GG_BUILD_CUDA} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUDA=1"
 fi

 if [ ! -z ${GG_BUILD_SYCL} ]; then
@@ -412,8 +412,8 @@ function gg_run_open_llama_7b_v2 {

     set -e

-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUBLAS=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log

     python3 ../convert.py ${path_models}
diff --git a/common/common.cpp b/common/common.cpp
index 9dec08430..5fd33e2a1 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -48,12 +48,12 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

-#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL))
-#define GGML_USE_CUBLAS_SYCL
+#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL))
+#define GGML_USE_CUDA_SYCL
 #endif

-#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN)
-#define GGML_USE_CUBLAS_SYCL_VULKAN
+#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN)
+#define GGML_USE_CUDA_SYCL_VULKAN
 #endif

 #if defined(LLAMA_USE_CURL)
@@ -861,9 +861,9 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
            return true;
        }
        params.main_gpu = std::stoi(argv[i]);
-#ifndef GGML_USE_CUBLAS_SYCL
-        fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n");
-#endif // GGML_USE_CUBLAS_SYCL
+#ifndef GGML_USE_CUDA_SYCL
+        fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL. Setting the main GPU has no effect.\n");
+#endif // GGML_USE_CUDA_SYCL
        return true;
    }
    if (arg == "--split-mode" || arg == "-sm") {
@@ -889,9 +889,9 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
            invalid_param = true;
            return true;
        }
-#ifndef GGML_USE_CUBLAS_SYCL
-        fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n");
-#endif // GGML_USE_CUBLAS_SYCL
+#ifndef GGML_USE_CUDA_SYCL
+        fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL. Setting the split mode has no effect.\n");
+#endif // GGML_USE_CUDA_SYCL
        return true;
    }
    if (arg == "--tensor-split" || arg == "-ts") {
@@ -917,9 +917,9 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
                params.tensor_split[i] = 0.0f;
            }
        }
-#ifndef GGML_USE_CUBLAS_SYCL_VULKAN
-        fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL/Vulkan. Setting a tensor split has no effect.\n");
-#endif // GGML_USE_CUBLAS_SYCL
+#ifndef GGML_USE_CUDA_SYCL_VULKAN
+        fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n");
+#endif // GGML_USE_CUDA_SYCL_VULKAN
        return true;
    }
    if (arg == "--no-mmap") {
@@ -2387,7 +2387,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
    fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
    fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
-    fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
+    fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false");
    fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false");
    fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
    fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false");
diff --git a/examples/imatrix/README.md b/examples/imatrix/README.md
index 578e8fc27..458c01b87 100644
--- a/examples/imatrix/README.md
+++ b/examples/imatrix/README.md
@@ -22,7 +22,7 @@ For faster computation, make sure to use GPU offloading via the `-ngl` argument
 ## Example

 ```bash
-LLAMA_CUBLAS=1 make -j
+LLAMA_CUDA=1 make -j

 # generate importance matrix (imatrix.dat)
 ./imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index 82413b79d..27e113203 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -113,7 +113,7 @@ static std::string get_cpu_info() {

 static std::string get_gpu_info() {
     std::string id;
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
     int count = ggml_backend_cuda_get_device_count();
     for (int i = 0; i < count; i++) {
         char buf[128];
@@ -808,7 +808,7 @@ struct test {

 const std::string test::build_commit = LLAMA_COMMIT;
 const int test::build_number = LLAMA_BUILD_NUMBER;
-const bool test::cuda = !!ggml_cpu_has_cublas();
+const bool test::cuda = !!ggml_cpu_has_cuda();
 const bool test::opencl = !!ggml_cpu_has_clblast();
 const bool test::vulkan = !!ggml_cpu_has_vulkan();
 const bool test::kompute = !!ggml_cpu_has_kompute();
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 48caafa87..40c976261 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -7,7 +7,7 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"

-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 #include "ggml-cuda.h"
 #endif

@@ -968,7 +968,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         }
     }

-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
     new_clip->backend = ggml_backend_cuda_init(0);
     printf("%s: CLIP using CUDA backend\n", __func__);
 #endif
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 338e60f28..c4c545c3e 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2510,15 +2510,15 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
                 invalid_param = true;
                 break;
             }
-#ifndef GGML_USE_CUBLAS
-            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n");
-#endif // GGML_USE_CUBLAS
+#ifndef GGML_USE_CUDA
+            fprintf(stderr, "warning: llama.cpp was compiled without CUDA. Setting the split mode has no effect.\n");
+#endif // GGML_USE_CUDA
         } else if (arg == "--tensor-split" || arg == "-ts") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
             std::string arg_next = argv[i];

             // split string by , and /
@@ -2535,17 +2535,17 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
                 }
             }
 #else
-            LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {});
-#endif // GGML_USE_CUBLAS
+            LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a tensor split.\n", {});
+#endif // GGML_USE_CUDA
         } else if (arg == "--main-gpu" || arg == "-mg") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
             params.main_gpu = std::stoi(argv[i]);
 #else
-            LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.", {});
+            LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a main GPU.", {});
 #endif
         } else if (arg == "--lora") {
             if (++i >= argc) {
diff --git a/ggml-backend.c b/ggml-backend.c
index 6026570ae..402d86ef3 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -420,7 +420,7 @@ GGML_CALL static void ggml_backend_registry_init(void) {
     ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL);

     // add forward decls here to avoid including the backend headers
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
     extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
     ggml_backend_cuda_reg_devices();
 #endif
diff --git a/ggml.c b/ggml.c
index 203a9e540..62b833959 100644
--- a/ggml.c
+++ b/ggml.c
@@ -21674,15 +21674,15 @@ int ggml_cpu_has_wasm_simd(void) {
 }

 int ggml_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
     return 1;
 #else
     return 0;
 #endif
 }

-int ggml_cpu_has_cublas(void) {
-#if defined(GGML_USE_CUBLAS)
+int ggml_cpu_has_cuda(void) {
+#if defined(GGML_USE_CUDA)
     return 1;
 #else
     return 0;
@@ -21722,7 +21722,7 @@ int ggml_cpu_has_sycl(void) {
 }

 int ggml_cpu_has_gpublas(void) {
-    return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
+    return ggml_cpu_has_cuda() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
            ggml_cpu_has_sycl();
 }

diff --git a/ggml.h b/ggml.h
index 0a5af7205..efb5177d7 100644
--- a/ggml.h
+++ b/ggml.h
@@ -2354,7 +2354,7 @@ extern "C" {
     GGML_API int ggml_cpu_has_fp16_va (void);
     GGML_API int ggml_cpu_has_wasm_simd (void);
     GGML_API int ggml_cpu_has_blas (void);
-    GGML_API int ggml_cpu_has_cublas (void);
+    GGML_API int ggml_cpu_has_cuda (void);
     GGML_API int ggml_cpu_has_clblast (void);
     GGML_API int ggml_cpu_has_vulkan (void);
     GGML_API int ggml_cpu_has_kompute (void);
diff --git a/llama.cpp b/llama.cpp
index 61587cb7a..384021efd 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -7,7 +7,7 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"

-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 # include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
 # include "ggml-opencl.h"
@@ -1505,7 +1505,7 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
 static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
     ggml_backend_buffer_type_t buft = nullptr;

-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
     // host buffers should only be used when data is expected to be copied to/from the GPU
     if (host_buffer) {
         buft = ggml_backend_cuda_host_buffer_type();
@@ -1535,7 +1535,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {

 #ifdef GGML_USE_METAL
     buft = ggml_backend_metal_buffer_type();
-#elif defined(GGML_USE_CUBLAS)
+#elif defined(GGML_USE_CUDA)
     buft = ggml_backend_cuda_buffer_type(gpu);
 #elif defined(GGML_USE_VULKAN)
     buft = ggml_backend_vk_buffer_type(gpu);
@@ -1561,7 +1561,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
 static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
     ggml_backend_buffer_type_t buft = nullptr;

-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
     if (ggml_backend_cuda_get_device_count() > 1) {
         buft = ggml_backend_cuda_split_buffer_type(tensor_split);
     }
@@ -1582,7 +1582,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
 }

 static size_t llama_get_device_count() {
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
     return ggml_backend_cuda_get_device_count();
 #elif defined(GGML_USE_SYCL)
     return ggml_backend_sycl_get_device_count();
@@ -1594,7 +1594,7 @@ static size_t llama_get_device_count() {
 }

 static size_t llama_get_device_memory(int device) {
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
     size_t total;
     size_t free;
     ggml_backend_cuda_get_device_memory(device, &total, &free);
@@ -2080,7 +2080,7 @@ struct llama_model {
             ggml_free(ctx);
         }
         for (ggml_backend_buffer_t buf : bufs) {
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
             if (ggml_backend_buffer_get_type(buf) == ggml_backend_cpu_buffer_type()) {
                 ggml_backend_cuda_unregister_host_buffer(ggml_backend_buffer_get_base(buf));
             }
@@ -5269,7 +5269,7 @@ static bool llm_load_tensors(
             }
             model.bufs.push_back(buf);
             bufs.emplace(idx, buf);
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
             if (n_layer >= n_gpu_layers) {
                 ggml_backend_cuda_register_host_buffer(
                     ggml_backend_buffer_get_base(buf),
@@ -13371,7 +13371,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
 size_t llama_max_devices(void) {
 #if defined(GGML_USE_METAL)
     return 1;
-#elif defined(GGML_USE_CUBLAS)
+#elif defined(GGML_USE_CUDA)
     return GGML_CUDA_MAX_DEVICES;
 #elif defined(GGML_USE_SYCL)
     return GGML_SYCL_MAX_DEVICES;
@@ -13391,8 +13391,8 @@ bool llama_supports_mlock(void) {
 }

 bool llama_supports_gpu_offload(void) {
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
-    defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
+    defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
     // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
     return true;
 #else
@@ -13597,7 +13597,7 @@ struct llama_context * llama_new_context_with_model(
             }
             ctx->backends.push_back(ctx->backend_metal);
         }
-#elif defined(GGML_USE_CUBLAS)
+#elif defined(GGML_USE_CUDA)
         if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
             // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
             ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
@@ -13744,7 +13744,7 @@ struct llama_context * llama_new_context_with_model(

         // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
         bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER;
-#ifndef GGML_USE_CUBLAS
+#ifndef GGML_USE_CUDA
         // pipeline parallelism requires support for async compute and events
         // currently this is only implemented in the CUDA backend
         pipeline_parallel = false;
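Note for downstream projects (not part of the patch above): after this rename, builds of llama.cpp define `GGML_USE_CUDA` rather than `GGML_USE_CUBLAS`, so external code that guards CUDA-specific paths on the old macro will silently skip them. A minimal, hypothetical compatibility shim, mirroring the `LLAMA_CUBLAS` -> `LLAMA_CUDA` aliasing that the Makefile and CMakeLists.txt perform during the deprecation period, could map the new macro back onto the legacy spelling until downstream sources are updated:

```c
// Hypothetical downstream shim only; llama.cpp itself does not ship this header.
// When the build defines the new GGML_USE_CUDA macro, also expose the legacy
// GGML_USE_CUBLAS name so older "#ifdef GGML_USE_CUBLAS" guards keep working.
#if defined(GGML_USE_CUDA) && !defined(GGML_USE_CUBLAS)
#define GGML_USE_CUBLAS
#endif
```

Placing such a shim in a header included before any `#ifdef GGML_USE_CUBLAS` check is a stopgap; the long-term fix is to switch those guards to `GGML_USE_CUDA`, since the deprecated spellings are slated for removal.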