cuda : rename build flag to LLAMA_CUDA

ggml-ci
This commit is contained in:
slaren 2024-03-25 16:04:50 +01:00
parent ae1f211ce2
commit 4a60e88065
17 changed files with 91 additions and 77 deletions

View file

@ -32,7 +32,7 @@ CPU inference for Meta's Lllama2 models using default options.
%setup -n llama.cpp-master %setup -n llama.cpp-master
%build %build
make -j LLAMA_CUBLAS=1 make -j LLAMA_CUDA=1
%install %install
mkdir -p %{buildroot}%{_bindir}/ mkdir -p %{buildroot}%{_bindir}/

View file

@ -20,8 +20,8 @@ COPY . .
# Set nvcc architecture # Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS # Enable CUDA
ENV LLAMA_CUBLAS=1 ENV LLAMA_CUDA=1
RUN make RUN make

View file

@ -191,7 +191,7 @@ effectiveStdenv.mkDerivation (
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true) (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
(cmakeBool "LLAMA_BLAS" useBlas) (cmakeBool "LLAMA_BLAS" useBlas)
(cmakeBool "LLAMA_CLBLAST" useOpenCL) (cmakeBool "LLAMA_CLBLAST" useOpenCL)
(cmakeBool "LLAMA_CUBLAS" useCuda) (cmakeBool "LLAMA_CUDA" useCuda)
(cmakeBool "LLAMA_HIPBLAS" useRocm) (cmakeBool "LLAMA_HIPBLAS" useRocm)
(cmakeBool "LLAMA_METAL" useMetalKit) (cmakeBool "LLAMA_METAL" useMetalKit)
(cmakeBool "LLAMA_MPI" useMpi) (cmakeBool "LLAMA_MPI" useMpi)

View file

@ -20,8 +20,8 @@ COPY . .
# Set nvcc architecture # Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS # Enable CUDA
ENV LLAMA_CUBLAS=1 ENV LLAMA_CUDA=1
RUN make RUN make

View file

@ -89,8 +89,8 @@ endif()
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON) option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
option(LLAMA_BLAS "llama: use BLAS" OFF) option(LLAMA_BLAS "llama: use BLAS" OFF)
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor") set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
option(LLAMA_CUBLAS "llama: use CUDA" OFF) option(LLAMA_CUDA "llama: use CUDA" OFF)
#option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF) option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF)
option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF) option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF) option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF)
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels") set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
@ -360,11 +360,16 @@ if (LLAMA_QKK_64)
endif() endif()
if (LLAMA_CUBLAS) if (LLAMA_CUBLAS)
message(WARNING "LLAMA_CUBLAS is deprecated and will be removed in the future.\nUse LLAMA_CUDA instead")
set(LLAMA_CUDA ON)
endif()
if (LLAMA_CUDA)
cmake_minimum_required(VERSION 3.17) cmake_minimum_required(VERSION 3.17)
find_package(CUDAToolkit) find_package(CUDAToolkit)
if (CUDAToolkit_FOUND) if (CUDAToolkit_FOUND)
message(STATUS "cuBLAS found") message(STATUS "CUDA found")
enable_language(CUDA) enable_language(CUDA)
@ -373,7 +378,7 @@ if (LLAMA_CUBLAS)
file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu") file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu")
list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu") list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")
add_compile_definitions(GGML_USE_CUBLAS) add_compile_definitions(GGML_USE_CUDA)
if (LLAMA_CUDA_FORCE_DMMV) if (LLAMA_CUDA_FORCE_DMMV)
add_compile_definitions(GGML_CUDA_FORCE_DMMV) add_compile_definitions(GGML_CUDA_FORCE_DMMV)
endif() endif()
@ -422,7 +427,7 @@ if (LLAMA_CUBLAS)
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}") message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
else() else()
message(WARNING "cuBLAS not found") message(WARNING "CUDA not found")
endif() endif()
endif() endif()
@ -525,7 +530,7 @@ if (LLAMA_HIPBLAS)
file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu") file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu")
list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu") list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu")
add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS) add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA)
if (LLAMA_HIP_UMA) if (LLAMA_HIP_UMA)
add_compile_definitions(GGML_HIP_UMA) add_compile_definitions(GGML_HIP_UMA)
@ -830,7 +835,7 @@ endif()
set(CUDA_CXX_FLAGS "") set(CUDA_CXX_FLAGS "")
if (LLAMA_CUBLAS) if (LLAMA_CUDA)
set(CUDA_FLAGS -use_fast_math) set(CUDA_FLAGS -use_fast_math)
if (LLAMA_FATAL_WARNINGS) if (LLAMA_FATAL_WARNINGS)
@ -1055,7 +1060,7 @@ endif()
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>") add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
add_compile_options("$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>") add_compile_options("$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
if (LLAMA_CUBLAS) if (LLAMA_CUDA)
list(APPEND CUDA_CXX_FLAGS ${ARCH_FLAGS}) list(APPEND CUDA_CXX_FLAGS ${ARCH_FLAGS})
list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED) # pass host compiler flags as a single argument list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED) # pass host compiler flags as a single argument
if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "") if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")

View file

@ -390,12 +390,17 @@ ifdef LLAMA_BLIS
endif # LLAMA_BLIS endif # LLAMA_BLIS
ifdef LLAMA_CUBLAS ifdef LLAMA_CUBLAS
# $(error LLAMA_CUBLAS is deprecated. Use LLAMA_CUDA instead.)
LLAMA_CUDA := 1
endif
ifdef LLAMA_CUDA
ifneq ('', '$(wildcard /opt/cuda)') ifneq ('', '$(wildcard /opt/cuda)')
CUDA_PATH ?= /opt/cuda CUDA_PATH ?= /opt/cuda
else else
CUDA_PATH ?= /usr/local/cuda CUDA_PATH ?= /usr/local/cuda
endif endif
MK_CPPFLAGS += -DGGML_USE_CUBLAS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
OBJS += ggml-cuda.o OBJS += ggml-cuda.o
OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu)) OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
@ -462,7 +467,7 @@ endif
ifdef JETSON_EOL_MODULE_DETECT ifdef JETSON_EOL_MODULE_DETECT
define NVCC_COMPILE define NVCC_COMPILE
$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@ $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
endef # NVCC_COMPILE endef # NVCC_COMPILE
else else
define NVCC_COMPILE define NVCC_COMPILE
@ -476,7 +481,7 @@ ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/com
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh) ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
$(NVCC_COMPILE) $(NVCC_COMPILE)
endif # LLAMA_CUBLAS endif # LLAMA_CUDA
ifdef LLAMA_CLBLAST ifdef LLAMA_CLBLAST
@ -533,7 +538,7 @@ ifdef LLAMA_HIPBLAS
LLAMA_CUDA_DMMV_X ?= 32 LLAMA_CUDA_DMMV_X ?= 32
LLAMA_CUDA_MMV_Y ?= 1 LLAMA_CUDA_MMV_Y ?= 1
LLAMA_CUDA_KQUANTS_ITER ?= 2 LLAMA_CUDA_KQUANTS_ITER ?= 2
MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA
ifdef LLAMA_HIP_UMA ifdef LLAMA_HIP_UMA
MK_CPPFLAGS += -DGGML_HIP_UMA MK_CPPFLAGS += -DGGML_HIP_UMA
endif # LLAMA_HIP_UMA endif # LLAMA_HIP_UMA
@ -609,7 +614,7 @@ override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS) override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
# identify CUDA host compiler # identify CUDA host compiler
ifdef LLAMA_CUBLAS ifdef LLAMA_CUDA
GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
include scripts/get-flags.mk include scripts/get-flags.mk
CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
@ -634,7 +639,7 @@ $(info I NVCCFLAGS: $(NVCCFLAGS))
$(info I LDFLAGS: $(LDFLAGS)) $(info I LDFLAGS: $(LDFLAGS))
$(info I CC: $(shell $(CC) --version | head -n 1)) $(info I CC: $(shell $(CC) --version | head -n 1))
$(info I CXX: $(shell $(CXX) --version | head -n 1)) $(info I CXX: $(shell $(CXX) --version | head -n 1))
ifdef LLAMA_CUBLAS ifdef LLAMA_CUDA
$(info I NVCC: $(shell $(NVCC) --version | tail -n 1)) $(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])') CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1) ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
@ -644,9 +649,16 @@ $(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be exp
endif # CUDA_POWER_ARCH endif # CUDA_POWER_ARCH
endif # CUDA_DOCKER_ARCH endif # CUDA_DOCKER_ARCH
endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1) endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
endif # LLAMA_CUBLAS endif # LLAMA_CUDA
$(info ) $(info )
ifdef LLAMA_CUBLAS
$(info !!!!)
$(info LLAMA_CUBLAS is deprecated and will be removed in the future. Use LLAMA_CUDA instead.)
$(info !!!!)
$(info )
endif
# #
# Build library # Build library
# #

View file

@ -448,30 +448,27 @@ Building the program with BLAS support may lead to some performance improvements
Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information. Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.
- #### cuBLAS - #### CUDA
This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads). This provides GPU acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
For Jetson user, if you have Jetson Orin, you can try this: [Offical Support](https://www.jetson-ai-lab.com/tutorial_text-generation.html). If you are using an old model(nano/TX2), need some additional operations before compiling. For Jetson user, if you have Jetson Orin, you can try this: [Offical Support](https://www.jetson-ai-lab.com/tutorial_text-generation.html). If you are using an old model(nano/TX2), need some additional operations before compiling.
- Using `make`: - Using `make`:
```bash ```bash
make LLAMA_CUBLAS=1 make LLAMA_CUDA=1
``` ```
- Using `CMake`: - Using `CMake`:
```bash ```bash
mkdir build mkdir build
cd build cd build
cmake .. -DLLAMA_CUBLAS=ON cmake .. -DLLAMA_CUDA=ON
cmake --build . --config Release cmake --build . --config Release
``` ```
The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance: The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
<!---
| LLAMA_CUDA_CUBLAS | Boolean | false | Use cuBLAS instead of custom CUDA kernels for prompt processing. Faster for all quantization formats except for q4_0 and q8_0, especially for k-quants. Increases VRAM usage (700 MiB for 7b, 970 MiB for 13b, 1430 MiB for 33b). |
--->
| Option | Legal values | Default | Description | | Option | Legal values | Default | Description |
|--------------------------------|------------------------|---------|-------------| |--------------------------------|------------------------|---------|-------------|
| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. | | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |

View file

@ -40,7 +40,7 @@ if [ ! -z ${GG_BUILD_METAL} ]; then
fi fi
if [ ! -z ${GG_BUILD_CUDA} ]; then if [ ! -z ${GG_BUILD_CUDA} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1" CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUDA=1"
fi fi
if [ ! -z ${GG_BUILD_SYCL} ]; then if [ ! -z ${GG_BUILD_SYCL} ]; then
@ -412,8 +412,8 @@ function gg_run_open_llama_7b_v2 {
set -e set -e
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUBLAS=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert.py ${path_models} python3 ../convert.py ${path_models}

View file

@ -48,12 +48,12 @@
#pragma warning(disable: 4244 4267) // possible loss of data #pragma warning(disable: 4244 4267) // possible loss of data
#endif #endif
#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)) #if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL))
#define GGML_USE_CUBLAS_SYCL #define GGML_USE_CUDA_SYCL
#endif #endif
#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN) #if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN)
#define GGML_USE_CUBLAS_SYCL_VULKAN #define GGML_USE_CUDA_SYCL_VULKAN
#endif #endif
#if defined(LLAMA_USE_CURL) #if defined(LLAMA_USE_CURL)
@ -861,9 +861,9 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true; return true;
} }
params.main_gpu = std::stoi(argv[i]); params.main_gpu = std::stoi(argv[i]);
#ifndef GGML_USE_CUBLAS_SYCL #ifndef GGML_USE_CUDA_SYCL
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n"); fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL. Setting the main GPU has no effect.\n");
#endif // GGML_USE_CUBLAS_SYCL #endif // GGML_USE_CUDA_SYCL
return true; return true;
} }
if (arg == "--split-mode" || arg == "-sm") { if (arg == "--split-mode" || arg == "-sm") {
@ -889,9 +889,9 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
invalid_param = true; invalid_param = true;
return true; return true;
} }
#ifndef GGML_USE_CUBLAS_SYCL #ifndef GGML_USE_CUDA_SYCL
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n"); fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL. Setting the split mode has no effect.\n");
#endif // GGML_USE_CUBLAS_SYCL #endif // GGML_USE_CUDA_SYCL
return true; return true;
} }
if (arg == "--tensor-split" || arg == "-ts") { if (arg == "--tensor-split" || arg == "-ts") {
@ -917,9 +917,9 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.tensor_split[i] = 0.0f; params.tensor_split[i] = 0.0f;
} }
} }
#ifndef GGML_USE_CUBLAS_SYCL_VULKAN #ifndef GGML_USE_CUDA_SYCL_VULKAN
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL/Vulkan. Setting a tensor split has no effect.\n"); fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n");
#endif // GGML_USE_CUBLAS_SYCL #endif // GGML_USE_CUDA_SYCL_VULKAN
return true; return true;
} }
if (arg == "--no-mmap") { if (arg == "--no-mmap") {
@ -2387,7 +2387,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false"); fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false"); fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false"); fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false"); fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false");
fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false"); fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false");
fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false"); fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false"); fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false");

View file

@ -22,7 +22,7 @@ For faster computation, make sure to use GPU offloading via the `-ngl` argument
## Example ## Example
```bash ```bash
LLAMA_CUBLAS=1 make -j LLAMA_CUDA=1 make -j
# generate importance matrix (imatrix.dat) # generate importance matrix (imatrix.dat)
./imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99 ./imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99

View file

@ -113,7 +113,7 @@ static std::string get_cpu_info() {
static std::string get_gpu_info() { static std::string get_gpu_info() {
std::string id; std::string id;
#ifdef GGML_USE_CUBLAS #ifdef GGML_USE_CUDA
int count = ggml_backend_cuda_get_device_count(); int count = ggml_backend_cuda_get_device_count();
for (int i = 0; i < count; i++) { for (int i = 0; i < count; i++) {
char buf[128]; char buf[128];
@ -808,7 +808,7 @@ struct test {
const std::string test::build_commit = LLAMA_COMMIT; const std::string test::build_commit = LLAMA_COMMIT;
const int test::build_number = LLAMA_BUILD_NUMBER; const int test::build_number = LLAMA_BUILD_NUMBER;
const bool test::cuda = !!ggml_cpu_has_cublas(); const bool test::cuda = !!ggml_cpu_has_cuda();
const bool test::opencl = !!ggml_cpu_has_clblast(); const bool test::opencl = !!ggml_cpu_has_clblast();
const bool test::vulkan = !!ggml_cpu_has_vulkan(); const bool test::vulkan = !!ggml_cpu_has_vulkan();
const bool test::kompute = !!ggml_cpu_has_kompute(); const bool test::kompute = !!ggml_cpu_has_kompute();

View file

@ -7,7 +7,7 @@
#include "ggml-alloc.h" #include "ggml-alloc.h"
#include "ggml-backend.h" #include "ggml-backend.h"
#ifdef GGML_USE_CUBLAS #ifdef GGML_USE_CUDA
#include "ggml-cuda.h" #include "ggml-cuda.h"
#endif #endif
@ -968,7 +968,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
} }
} }
#ifdef GGML_USE_CUBLAS #ifdef GGML_USE_CUDA
new_clip->backend = ggml_backend_cuda_init(0); new_clip->backend = ggml_backend_cuda_init(0);
printf("%s: CLIP using CUDA backend\n", __func__); printf("%s: CLIP using CUDA backend\n", __func__);
#endif #endif

View file

@ -2510,15 +2510,15 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
invalid_param = true; invalid_param = true;
break; break;
} }
#ifndef GGML_USE_CUBLAS #ifndef GGML_USE_CUDA
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n"); fprintf(stderr, "warning: llama.cpp was compiled without CUDA. Setting the split mode has no effect.\n");
#endif // GGML_USE_CUBLAS #endif // GGML_USE_CUDA
} else if (arg == "--tensor-split" || arg == "-ts") { } else if (arg == "--tensor-split" || arg == "-ts") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
break; break;
} }
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL) #if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
std::string arg_next = argv[i]; std::string arg_next = argv[i];
// split string by , and / // split string by , and /
@ -2535,17 +2535,17 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
} }
} }
#else #else
LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {}); LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a tensor split.\n", {});
#endif // GGML_USE_CUBLAS #endif // GGML_USE_CUDA
} else if (arg == "--main-gpu" || arg == "-mg") { } else if (arg == "--main-gpu" || arg == "-mg") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
break; break;
} }
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL) #if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
params.main_gpu = std::stoi(argv[i]); params.main_gpu = std::stoi(argv[i]);
#else #else
LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.", {}); LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a main GPU.", {});
#endif #endif
} else if (arg == "--lora") { } else if (arg == "--lora") {
if (++i >= argc) { if (++i >= argc) {

View file

@ -420,7 +420,7 @@ GGML_CALL static void ggml_backend_registry_init(void) {
ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL); ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL);
// add forward decls here to avoid including the backend headers // add forward decls here to avoid including the backend headers
#ifdef GGML_USE_CUBLAS #ifdef GGML_USE_CUDA
extern GGML_CALL void ggml_backend_cuda_reg_devices(void); extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
ggml_backend_cuda_reg_devices(); ggml_backend_cuda_reg_devices();
#endif #endif

8
ggml.c
View file

@ -21674,15 +21674,15 @@ int ggml_cpu_has_wasm_simd(void) {
} }
int ggml_cpu_has_blas(void) { int ggml_cpu_has_blas(void) {
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL) #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
return 1; return 1;
#else #else
return 0; return 0;
#endif #endif
} }
int ggml_cpu_has_cublas(void) { int ggml_cpu_has_cuda(void) {
#if defined(GGML_USE_CUBLAS) #if defined(GGML_USE_CUDA)
return 1; return 1;
#else #else
return 0; return 0;
@ -21722,7 +21722,7 @@ int ggml_cpu_has_sycl(void) {
} }
int ggml_cpu_has_gpublas(void) { int ggml_cpu_has_gpublas(void) {
return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() || return ggml_cpu_has_cuda() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
ggml_cpu_has_sycl(); ggml_cpu_has_sycl();
} }

2
ggml.h
View file

@ -2354,7 +2354,7 @@ extern "C" {
GGML_API int ggml_cpu_has_fp16_va (void); GGML_API int ggml_cpu_has_fp16_va (void);
GGML_API int ggml_cpu_has_wasm_simd (void); GGML_API int ggml_cpu_has_wasm_simd (void);
GGML_API int ggml_cpu_has_blas (void); GGML_API int ggml_cpu_has_blas (void);
GGML_API int ggml_cpu_has_cublas (void); GGML_API int ggml_cpu_has_cuda (void);
GGML_API int ggml_cpu_has_clblast (void); GGML_API int ggml_cpu_has_clblast (void);
GGML_API int ggml_cpu_has_vulkan (void); GGML_API int ggml_cpu_has_vulkan (void);
GGML_API int ggml_cpu_has_kompute (void); GGML_API int ggml_cpu_has_kompute (void);

View file

@ -7,7 +7,7 @@
#include "ggml-alloc.h" #include "ggml-alloc.h"
#include "ggml-backend.h" #include "ggml-backend.h"
#ifdef GGML_USE_CUBLAS #ifdef GGML_USE_CUDA
# include "ggml-cuda.h" # include "ggml-cuda.h"
#elif defined(GGML_USE_CLBLAST) #elif defined(GGML_USE_CLBLAST)
# include "ggml-opencl.h" # include "ggml-opencl.h"
@ -1505,7 +1505,7 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) { static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
ggml_backend_buffer_type_t buft = nullptr; ggml_backend_buffer_type_t buft = nullptr;
#if defined(GGML_USE_CUBLAS) #if defined(GGML_USE_CUDA)
// host buffers should only be used when data is expected to be copied to/from the GPU // host buffers should only be used when data is expected to be copied to/from the GPU
if (host_buffer) { if (host_buffer) {
buft = ggml_backend_cuda_host_buffer_type(); buft = ggml_backend_cuda_host_buffer_type();
@ -1535,7 +1535,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
#ifdef GGML_USE_METAL #ifdef GGML_USE_METAL
buft = ggml_backend_metal_buffer_type(); buft = ggml_backend_metal_buffer_type();
#elif defined(GGML_USE_CUBLAS) #elif defined(GGML_USE_CUDA)
buft = ggml_backend_cuda_buffer_type(gpu); buft = ggml_backend_cuda_buffer_type(gpu);
#elif defined(GGML_USE_VULKAN) #elif defined(GGML_USE_VULKAN)
buft = ggml_backend_vk_buffer_type(gpu); buft = ggml_backend_vk_buffer_type(gpu);
@ -1561,7 +1561,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) { static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
ggml_backend_buffer_type_t buft = nullptr; ggml_backend_buffer_type_t buft = nullptr;
#ifdef GGML_USE_CUBLAS #ifdef GGML_USE_CUDA
if (ggml_backend_cuda_get_device_count() > 1) { if (ggml_backend_cuda_get_device_count() > 1) {
buft = ggml_backend_cuda_split_buffer_type(tensor_split); buft = ggml_backend_cuda_split_buffer_type(tensor_split);
} }
@ -1582,7 +1582,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
} }
static size_t llama_get_device_count() { static size_t llama_get_device_count() {
#if defined(GGML_USE_CUBLAS) #if defined(GGML_USE_CUDA)
return ggml_backend_cuda_get_device_count(); return ggml_backend_cuda_get_device_count();
#elif defined(GGML_USE_SYCL) #elif defined(GGML_USE_SYCL)
return ggml_backend_sycl_get_device_count(); return ggml_backend_sycl_get_device_count();
@ -1594,7 +1594,7 @@ static size_t llama_get_device_count() {
} }
static size_t llama_get_device_memory(int device) { static size_t llama_get_device_memory(int device) {
#if defined(GGML_USE_CUBLAS) #if defined(GGML_USE_CUDA)
size_t total; size_t total;
size_t free; size_t free;
ggml_backend_cuda_get_device_memory(device, &total, &free); ggml_backend_cuda_get_device_memory(device, &total, &free);
@ -2080,7 +2080,7 @@ struct llama_model {
ggml_free(ctx); ggml_free(ctx);
} }
for (ggml_backend_buffer_t buf : bufs) { for (ggml_backend_buffer_t buf : bufs) {
#ifdef GGML_USE_CUBLAS #ifdef GGML_USE_CUDA
if (ggml_backend_buffer_get_type(buf) == ggml_backend_cpu_buffer_type()) { if (ggml_backend_buffer_get_type(buf) == ggml_backend_cpu_buffer_type()) {
ggml_backend_cuda_unregister_host_buffer(ggml_backend_buffer_get_base(buf)); ggml_backend_cuda_unregister_host_buffer(ggml_backend_buffer_get_base(buf));
} }
@ -5269,7 +5269,7 @@ static bool llm_load_tensors(
} }
model.bufs.push_back(buf); model.bufs.push_back(buf);
bufs.emplace(idx, buf); bufs.emplace(idx, buf);
#ifdef GGML_USE_CUBLAS #ifdef GGML_USE_CUDA
if (n_layer >= n_gpu_layers) { if (n_layer >= n_gpu_layers) {
ggml_backend_cuda_register_host_buffer( ggml_backend_cuda_register_host_buffer(
ggml_backend_buffer_get_base(buf), ggml_backend_buffer_get_base(buf),
@ -13371,7 +13371,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
size_t llama_max_devices(void) { size_t llama_max_devices(void) {
#if defined(GGML_USE_METAL) #if defined(GGML_USE_METAL)
return 1; return 1;
#elif defined(GGML_USE_CUBLAS) #elif defined(GGML_USE_CUDA)
return GGML_CUDA_MAX_DEVICES; return GGML_CUDA_MAX_DEVICES;
#elif defined(GGML_USE_SYCL) #elif defined(GGML_USE_SYCL)
return GGML_SYCL_MAX_DEVICES; return GGML_SYCL_MAX_DEVICES;
@ -13391,8 +13391,8 @@ bool llama_supports_mlock(void) {
} }
bool llama_supports_gpu_offload(void) { bool llama_supports_gpu_offload(void) {
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \ #if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
// Defined when llama.cpp is compiled with support for offloading model layers to GPU. // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
return true; return true;
#else #else
@ -13597,7 +13597,7 @@ struct llama_context * llama_new_context_with_model(
} }
ctx->backends.push_back(ctx->backend_metal); ctx->backends.push_back(ctx->backend_metal);
} }
#elif defined(GGML_USE_CUBLAS) #elif defined(GGML_USE_CUDA)
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) { if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu); ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
@ -13744,7 +13744,7 @@ struct llama_context * llama_new_context_with_model(
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER; bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER;
#ifndef GGML_USE_CUBLAS #ifndef GGML_USE_CUDA
// pipeline parallelism requires support for async compute and events // pipeline parallelism requires support for async compute and events
// currently this is only implemented in the CUDA backend // currently this is only implemented in the CUDA backend
pipeline_parallel = false; pipeline_parallel = false;