cuda : rename build flag to LLAMA_CUDA

ggml-ci
2024-03-25 16:04:50 +01:00 · 2024-03-25 16:04:50 +01:00 · 4a60e88065
commit 4a60e88065
parent ae1f211ce2
17 changed files with 91 additions and 77 deletions
--- a/.devops/llama-cpp-cublas.srpm.spec
+++ b/.devops/llama-cpp-cublas.srpm.spec
@ -32,7 +32,7 @@ CPU inference for Meta's Lllama2 models using default options.
 %setup -n llama.cpp-master
 %build
-make -j LLAMA_CUBLAS=1
+make -j LLAMA_CUDA=1
 %install
 mkdir -p %{buildroot}%{_bindir}/
--- a/.devops/main-cuda.Dockerfile
+++ b/.devops/main-cuda.Dockerfile
@ -20,8 +20,8 @@ COPY . .
 # Set nvcc architecture
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable cuBLAS
+# Enable CUDA
-ENV LLAMA_CUBLAS=1
+ENV LLAMA_CUDA=1
 RUN make
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@ -191,7 +191,7 @@ effectiveStdenv.mkDerivation (
        (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
        (cmakeBool "LLAMA_BLAS" useBlas)
        (cmakeBool "LLAMA_CLBLAST" useOpenCL)
-        (cmakeBool "LLAMA_CUBLAS" useCuda)
+        (cmakeBool "LLAMA_CUDA" useCuda)
        (cmakeBool "LLAMA_HIPBLAS" useRocm)
        (cmakeBool "LLAMA_METAL" useMetalKit)
        (cmakeBool "LLAMA_MPI" useMpi)
--- a/.devops/server-cuda.Dockerfile
+++ b/.devops/server-cuda.Dockerfile
@ -20,8 +20,8 @@ COPY . .
 # Set nvcc architecture
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable cuBLAS
+# Enable CUDA
-ENV LLAMA_CUBLAS=1
+ENV LLAMA_CUDA=1
 RUN make
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -89,8 +89,8 @@ endif()
 option(LLAMA_ACCELERATE                      "llama: enable Accelerate framework"               ON)
 option(LLAMA_BLAS                            "llama: use BLAS"                                  OFF)
 set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
-option(LLAMA_CUBLAS                          "llama: use CUDA"                                  OFF)
+option(LLAMA_CUDA                            "llama: use CUDA"                                  OFF)
-#option(LLAMA_CUDA_CUBLAS                     "llama: use cuBLAS for prompt processing"          OFF)
+option(LLAMA_CUBLAS                          "llama: use CUDA (deprecated, use LLAMA_CUDA)"     OFF)
 option(LLAMA_CUDA_FORCE_DMMV                 "llama: use dmmv instead of mmvq CUDA kernels"     OFF)
 option(LLAMA_CUDA_FORCE_MMQ                  "llama: use mmq kernels instead of cuBLAS"         OFF)
 set(LLAMA_CUDA_DMMV_X      "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
@ -360,11 +360,16 @@ if (LLAMA_QKK_64)
 endif()
 if (LLAMA_CUBLAS)
    message(WARNING "LLAMA_CUBLAS is deprecated and will be removed in the future.\nUse LLAMA_CUDA instead")
    set(LLAMA_CUDA ON)
 endif()
 if (LLAMA_CUDA)
    cmake_minimum_required(VERSION 3.17)
    find_package(CUDAToolkit)
    if (CUDAToolkit_FOUND)
-        message(STATUS "cuBLAS found")
+        message(STATUS "CUDA found")
        enable_language(CUDA)
@ -373,7 +378,7 @@ if (LLAMA_CUBLAS)
        file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu")
        list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")
-        add_compile_definitions(GGML_USE_CUBLAS)
+        add_compile_definitions(GGML_USE_CUDA)
        if (LLAMA_CUDA_FORCE_DMMV)
            add_compile_definitions(GGML_CUDA_FORCE_DMMV)
        endif()
@ -422,7 +427,7 @@ if (LLAMA_CUBLAS)
    message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
    else()
-        message(WARNING "cuBLAS not found")
+        message(WARNING "CUDA not found")
    endif()
 endif()
@ -525,7 +530,7 @@ if (LLAMA_HIPBLAS)
    file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu")
    list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu")
-    add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
+    add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA)
    if (LLAMA_HIP_UMA)
        add_compile_definitions(GGML_HIP_UMA)
@ -830,7 +835,7 @@ endif()
 set(CUDA_CXX_FLAGS "")
-if (LLAMA_CUBLAS)
+if (LLAMA_CUDA)
    set(CUDA_FLAGS -use_fast_math)
    if (LLAMA_FATAL_WARNINGS)
@ -1055,7 +1060,7 @@ endif()
 add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
 add_compile_options("$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
-if (LLAMA_CUBLAS)
+if (LLAMA_CUDA)
    list(APPEND CUDA_CXX_FLAGS ${ARCH_FLAGS})
    list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED)  # pass host compiler flags as a single argument
    if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
--- a/26
+++ b/26
@ -390,12 +390,17 @@ ifdef LLAMA_BLIS
 endif # LLAMA_BLIS
 ifdef LLAMA_CUBLAS
 #	$(error LLAMA_CUBLAS is deprecated. Use LLAMA_CUDA instead.)
 	LLAMA_CUDA := 1
 endif
 ifdef LLAMA_CUDA
 	ifneq ('', '$(wildcard /opt/cuda)')
 		CUDA_PATH ?= /opt/cuda
 	else
 		CUDA_PATH ?= /usr/local/cuda
 	endif
-	MK_CPPFLAGS  += -DGGML_USE_CUBLAS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+	MK_CPPFLAGS  += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
 	MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
 	OBJS         += ggml-cuda.o
 	OBJS         += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
@ -462,7 +467,7 @@ endif
 ifdef JETSON_EOL_MODULE_DETECT
 define NVCC_COMPILE
-	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 endef # NVCC_COMPILE
 else
 define NVCC_COMPILE
@ -476,7 +481,7 @@ ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/com
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
 	$(NVCC_COMPILE)
-endif # LLAMA_CUBLAS
+endif # LLAMA_CUDA
 ifdef LLAMA_CLBLAST
@ -533,7 +538,7 @@ ifdef LLAMA_HIPBLAS
 	LLAMA_CUDA_DMMV_X       ?= 32
 	LLAMA_CUDA_MMV_Y        ?= 1
 	LLAMA_CUDA_KQUANTS_ITER ?= 2
-	MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
+	MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA
 ifdef LLAMA_HIP_UMA
 	MK_CPPFLAGS += -DGGML_HIP_UMA
 endif # LLAMA_HIP_UMA
@ -609,7 +614,7 @@ override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
 override LDFLAGS   := $(MK_LDFLAGS) $(LDFLAGS)
 # identify CUDA host compiler
-ifdef LLAMA_CUBLAS
+ifdef LLAMA_CUDA
 GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
 include scripts/get-flags.mk
 CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
@ -634,7 +639,7 @@ $(info I NVCCFLAGS: $(NVCCFLAGS))
 $(info I LDFLAGS:   $(LDFLAGS))
 $(info I CC:        $(shell $(CC)   --version | head -n 1))
 $(info I CXX:       $(shell $(CXX)  --version | head -n 1))
-ifdef LLAMA_CUBLAS
+ifdef LLAMA_CUDA
 $(info I NVCC:      $(shell $(NVCC) --version | tail -n 1))
 CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
 ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
@ -644,9 +649,16 @@ $(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be exp
 endif # CUDA_POWER_ARCH
 endif # CUDA_DOCKER_ARCH
 endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
-endif # LLAMA_CUBLAS
+endif # LLAMA_CUDA
 $(info )
 ifdef LLAMA_CUBLAS
 $(info !!!!)
 $(info LLAMA_CUBLAS is deprecated and will be removed in the future. Use LLAMA_CUDA instead.)
 $(info !!!!)
 $(info )
 endif
 #
 # Build library
 #
--- a/README.md
+++ b/README.md
@ -448,30 +448,27 @@ Building the program with BLAS support may lead to some performance improvements
  Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.
- #### cuBLAS
+- #### CUDA
-  This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
+  This provides GPU acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
  For Jetson user, if you have Jetson Orin, you can try this: [Offical Support](https://www.jetson-ai-lab.com/tutorial_text-generation.html). If you are using an old model(nano/TX2), need some additional operations before compiling.
  - Using `make`:
    ```bash
-    make LLAMA_CUBLAS=1
+    make LLAMA_CUDA=1
    ```
  - Using `CMake`:
    ```bash
    mkdir build
    cd build
-    cmake .. -DLLAMA_CUBLAS=ON
+    cmake .. -DLLAMA_CUDA=ON
    cmake --build . --config Release
    ```
  The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
 <!---
  | LLAMA_CUDA_CUBLAS       | Boolean                |   false | Use cuBLAS instead of custom CUDA kernels for prompt processing. Faster for all quantization formats except for q4_0 and q8_0, especially for k-quants. Increases VRAM usage (700 MiB for 7b, 970 MiB for 13b, 1430 MiB for 33b). |
 --->
  | Option                         | Legal values           | Default | Description |
  |--------------------------------|------------------------|---------|-------------|
  | LLAMA_CUDA_FORCE_DMMV          | Boolean                |   false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
--- a/ci/run.sh
+++ b/ci/run.sh
@ -40,7 +40,7 @@ if [ ! -z ${GG_BUILD_METAL} ]; then
 fi
 if [ ! -z ${GG_BUILD_CUDA} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUDA=1"
 fi
 if [ ! -z ${GG_BUILD_SYCL} ]; then
@ -412,8 +412,8 @@ function gg_run_open_llama_7b_v2 {
    set -e
-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUBLAS=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j                                                             ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time make -j                                                           ) 2>&1 | tee -a $OUT/${ci}-make.log
    python3 ../convert.py ${path_models}
--- a/common/common.cpp
+++ b/common/common.cpp
@ -48,12 +48,12 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
-#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL))
+#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL))
-#define GGML_USE_CUBLAS_SYCL
+#define GGML_USE_CUDA_SYCL
 #endif
-#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN)
+#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN)
-#define GGML_USE_CUBLAS_SYCL_VULKAN
+#define GGML_USE_CUDA_SYCL_VULKAN
 #endif
 #if defined(LLAMA_USE_CURL)
@ -861,9 +861,9 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
            return true;
        }
        params.main_gpu = std::stoi(argv[i]);
-#ifndef GGML_USE_CUBLAS_SYCL
+#ifndef GGML_USE_CUDA_SYCL
-        fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n");
+        fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL. Setting the main GPU has no effect.\n");
-#endif // GGML_USE_CUBLAS_SYCL
+#endif // GGML_USE_CUDA_SYCL
        return true;
    }
    if (arg == "--split-mode" || arg == "-sm") {
@ -889,9 +889,9 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
            invalid_param = true;
            return true;
        }
-#ifndef GGML_USE_CUBLAS_SYCL
+#ifndef GGML_USE_CUDA_SYCL
-        fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n");
+        fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL. Setting the split mode has no effect.\n");
-#endif // GGML_USE_CUBLAS_SYCL
+#endif // GGML_USE_CUDA_SYCL
        return true;
    }
    if (arg == "--tensor-split" || arg == "-ts") {
@ -917,9 +917,9 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
                params.tensor_split[i] = 0.0f;
            }
        }
-#ifndef GGML_USE_CUBLAS_SYCL_VULKAN
+#ifndef GGML_USE_CUDA_SYCL_VULKAN
-        fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL/Vulkan. Setting a tensor split has no effect.\n");
+        fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n");
-#endif // GGML_USE_CUBLAS_SYCL
+#endif // GGML_USE_CUDA_SYCL_VULKAN
        return true;
    }
    if (arg == "--no-mmap") {
@ -2387,7 +2387,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "cpu_has_avx512: %s\n",      ggml_cpu_has_avx512()      ? "true" : "false");
    fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
    fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
-    fprintf(stream, "cpu_has_cublas: %s\n",      ggml_cpu_has_cublas()      ? "true" : "false");
+    fprintf(stream, "cpu_has_cuda: %s\n",        ggml_cpu_has_cuda()        ? "true" : "false");
    fprintf(stream, "cpu_has_vulkan: %s\n",      ggml_cpu_has_vulkan()      ? "true" : "false");
    fprintf(stream, "cpu_has_clblast: %s\n",     ggml_cpu_has_clblast()     ? "true" : "false");
    fprintf(stream, "cpu_has_kompute: %s\n",     ggml_cpu_has_kompute()     ? "true" : "false");
--- a/examples/imatrix/README.md
+++ b/examples/imatrix/README.md
@ -22,7 +22,7 @@ For faster computation, make sure to use GPU offloading via the `-ngl` argument
 ## Example
 ```bash
-LLAMA_CUBLAS=1 make -j
+LLAMA_CUDA=1 make -j
 # generate importance matrix (imatrix.dat)
 ./imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@ -113,7 +113,7 @@ static std::string get_cpu_info() {
 static std::string get_gpu_info() {
    std::string id;
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
    int count = ggml_backend_cuda_get_device_count();
    for (int i = 0; i < count; i++) {
        char buf[128];
@ -808,7 +808,7 @@ struct test {
 const std::string test::build_commit = LLAMA_COMMIT;
 const int         test::build_number = LLAMA_BUILD_NUMBER;
-const bool        test::cuda         = !!ggml_cpu_has_cublas();
+const bool        test::cuda         = !!ggml_cpu_has_cuda();
 const bool        test::opencl       = !!ggml_cpu_has_clblast();
 const bool        test::vulkan       = !!ggml_cpu_has_vulkan();
 const bool        test::kompute      = !!ggml_cpu_has_kompute();
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@ -7,7 +7,7 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 #include "ggml-cuda.h"
 #endif
@ -968,7 +968,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        }
    }
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
    new_clip->backend = ggml_backend_cuda_init(0);
    printf("%s: CLIP using CUDA backend\n", __func__);
 #endif
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -2510,15 +2510,15 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
                invalid_param = true;
                break;
            }
-#ifndef GGML_USE_CUBLAS
+#ifndef GGML_USE_CUDA
-            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n");
+            fprintf(stderr, "warning: llama.cpp was compiled without CUDA. Setting the split mode has no effect.\n");
-#endif // GGML_USE_CUBLAS
+#endif // GGML_USE_CUDA
        } else if (arg == "--tensor-split" || arg == "-ts") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
            std::string arg_next = argv[i];
            // split string by , and /
@ -2535,17 +2535,17 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
                }
            }
 #else
-            LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {});
+            LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a tensor split.\n", {});
-#endif // GGML_USE_CUBLAS
+#endif // GGML_USE_CUDA
        } else if (arg == "--main-gpu" || arg == "-mg") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
            params.main_gpu = std::stoi(argv[i]);
 #else
-            LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.", {});
+            LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a main GPU.", {});
 #endif
        } else if (arg == "--lora") {
            if (++i >= argc) {
--- a/ggml-backend.c
+++ b/ggml-backend.c
@ -420,7 +420,7 @@ GGML_CALL static void ggml_backend_registry_init(void) {
    ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL);
    // add forward decls here to avoid including the backend headers
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
    extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
    ggml_backend_cuda_reg_devices();
 #endif
--- a/ggml.c
+++ b/ggml.c
@ -21674,15 +21674,15 @@ int ggml_cpu_has_wasm_simd(void) {
 }
 int ggml_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
    return 1;
 #else
    return 0;
 #endif
 }
-int ggml_cpu_has_cublas(void) {
+int ggml_cpu_has_cuda(void) {
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
    return 1;
 #else
    return 0;
@ -21722,7 +21722,7 @@ int ggml_cpu_has_sycl(void) {
 }
 int ggml_cpu_has_gpublas(void) {
-    return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
+    return ggml_cpu_has_cuda() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
           ggml_cpu_has_sycl();
 }
--- a/ggml.h
+++ b/ggml.h
@ -2354,7 +2354,7 @@ extern "C" {
    GGML_API int ggml_cpu_has_fp16_va    (void);
    GGML_API int ggml_cpu_has_wasm_simd  (void);
    GGML_API int ggml_cpu_has_blas       (void);
-    GGML_API int ggml_cpu_has_cublas     (void);
+    GGML_API int ggml_cpu_has_cuda     (void);
    GGML_API int ggml_cpu_has_clblast    (void);
    GGML_API int ggml_cpu_has_vulkan     (void);
    GGML_API int ggml_cpu_has_kompute    (void);
--- a/llama.cpp
+++ b/llama.cpp
@ -7,7 +7,7 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 #  include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
 #  include "ggml-opencl.h"
@ -1505,7 +1505,7 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
 static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
    ggml_backend_buffer_type_t buft = nullptr;
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
    // host buffers should only be used when data is expected to be copied to/from the GPU
    if (host_buffer) {
        buft = ggml_backend_cuda_host_buffer_type();
@ -1535,7 +1535,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
 #ifdef GGML_USE_METAL
    buft = ggml_backend_metal_buffer_type();
-#elif defined(GGML_USE_CUBLAS)
+#elif defined(GGML_USE_CUDA)
    buft = ggml_backend_cuda_buffer_type(gpu);
 #elif defined(GGML_USE_VULKAN)
    buft = ggml_backend_vk_buffer_type(gpu);
@ -1561,7 +1561,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
 static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
    ggml_backend_buffer_type_t buft = nullptr;
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
    if (ggml_backend_cuda_get_device_count() > 1) {
        buft = ggml_backend_cuda_split_buffer_type(tensor_split);
    }
@ -1582,7 +1582,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
 }
 static size_t llama_get_device_count() {
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
    return ggml_backend_cuda_get_device_count();
 #elif defined(GGML_USE_SYCL)
    return ggml_backend_sycl_get_device_count();
@ -1594,7 +1594,7 @@ static size_t llama_get_device_count() {
 }
 static size_t llama_get_device_memory(int device) {
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUDA)
    size_t total;
    size_t free;
    ggml_backend_cuda_get_device_memory(device, &total, &free);
@ -2080,7 +2080,7 @@ struct llama_model {
            ggml_free(ctx);
        }
        for (ggml_backend_buffer_t buf : bufs) {
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
            if (ggml_backend_buffer_get_type(buf) == ggml_backend_cpu_buffer_type()) {
                ggml_backend_cuda_unregister_host_buffer(ggml_backend_buffer_get_base(buf));
            }
@ -5269,7 +5269,7 @@ static bool llm_load_tensors(
                }
                model.bufs.push_back(buf);
                bufs.emplace(idx, buf);
-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
                if (n_layer >= n_gpu_layers) {
                    ggml_backend_cuda_register_host_buffer(
                        ggml_backend_buffer_get_base(buf),
@ -13371,7 +13371,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
 size_t llama_max_devices(void) {
 #if defined(GGML_USE_METAL)
    return 1;
-#elif defined(GGML_USE_CUBLAS)
+#elif defined(GGML_USE_CUDA)
    return GGML_CUDA_MAX_DEVICES;
 #elif defined(GGML_USE_SYCL)
    return GGML_SYCL_MAX_DEVICES;
@ -13391,8 +13391,8 @@ bool llama_supports_mlock(void) {
 }
 bool llama_supports_gpu_offload(void) {
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
-    defined(GGML_USE_SYCL)   || defined(GGML_USE_KOMPUTE)
+    defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
    // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
    return true;
 #else
@ -13597,7 +13597,7 @@ struct llama_context * llama_new_context_with_model(
            }
            ctx->backends.push_back(ctx->backend_metal);
        }
-#elif defined(GGML_USE_CUBLAS)
+#elif defined(GGML_USE_CUDA)
        if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
            // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
            ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
@ -13744,7 +13744,7 @@ struct llama_context * llama_new_context_with_model(
            // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
            bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER;
-#ifndef GGML_USE_CUBLAS
+#ifndef GGML_USE_CUDA
            // pipeline parallelism requires support for async compute and events
            // currently this is only implemented in the CUDA backend
            pipeline_parallel = false;