cuda : rename build flag to LLAMA_CUDA
ggml-ci
This commit is contained in:
parent
ae1f211ce2
commit
4a60e88065
17 changed files with 91 additions and 77 deletions
|
@ -32,7 +32,7 @@ CPU inference for Meta's Lllama2 models using default options.
|
||||||
%setup -n llama.cpp-master
|
%setup -n llama.cpp-master
|
||||||
|
|
||||||
%build
|
%build
|
||||||
make -j LLAMA_CUBLAS=1
|
make -j LLAMA_CUDA=1
|
||||||
|
|
||||||
%install
|
%install
|
||||||
mkdir -p %{buildroot}%{_bindir}/
|
mkdir -p %{buildroot}%{_bindir}/
|
||||||
|
|
|
@ -20,8 +20,8 @@ COPY . .
|
||||||
|
|
||||||
# Set nvcc architecture
|
# Set nvcc architecture
|
||||||
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
|
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
|
||||||
# Enable cuBLAS
|
# Enable CUDA
|
||||||
ENV LLAMA_CUBLAS=1
|
ENV LLAMA_CUDA=1
|
||||||
|
|
||||||
RUN make
|
RUN make
|
||||||
|
|
||||||
|
|
|
@ -191,7 +191,7 @@ effectiveStdenv.mkDerivation (
|
||||||
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
|
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
|
||||||
(cmakeBool "LLAMA_BLAS" useBlas)
|
(cmakeBool "LLAMA_BLAS" useBlas)
|
||||||
(cmakeBool "LLAMA_CLBLAST" useOpenCL)
|
(cmakeBool "LLAMA_CLBLAST" useOpenCL)
|
||||||
(cmakeBool "LLAMA_CUBLAS" useCuda)
|
(cmakeBool "LLAMA_CUDA" useCuda)
|
||||||
(cmakeBool "LLAMA_HIPBLAS" useRocm)
|
(cmakeBool "LLAMA_HIPBLAS" useRocm)
|
||||||
(cmakeBool "LLAMA_METAL" useMetalKit)
|
(cmakeBool "LLAMA_METAL" useMetalKit)
|
||||||
(cmakeBool "LLAMA_MPI" useMpi)
|
(cmakeBool "LLAMA_MPI" useMpi)
|
||||||
|
|
|
@ -20,8 +20,8 @@ COPY . .
|
||||||
|
|
||||||
# Set nvcc architecture
|
# Set nvcc architecture
|
||||||
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
|
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
|
||||||
# Enable cuBLAS
|
# Enable CUDA
|
||||||
ENV LLAMA_CUBLAS=1
|
ENV LLAMA_CUDA=1
|
||||||
|
|
||||||
RUN make
|
RUN make
|
||||||
|
|
||||||
|
|
|
@ -89,8 +89,8 @@ endif()
|
||||||
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
|
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
|
||||||
option(LLAMA_BLAS "llama: use BLAS" OFF)
|
option(LLAMA_BLAS "llama: use BLAS" OFF)
|
||||||
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
|
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
|
||||||
option(LLAMA_CUBLAS "llama: use CUDA" OFF)
|
option(LLAMA_CUDA "llama: use CUDA" OFF)
|
||||||
#option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF)
|
option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF)
|
||||||
option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
|
option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
|
||||||
option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF)
|
option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF)
|
||||||
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
|
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
|
||||||
|
@ -360,11 +360,16 @@ if (LLAMA_QKK_64)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (LLAMA_CUBLAS)
|
if (LLAMA_CUBLAS)
|
||||||
|
message(WARNING "LLAMA_CUBLAS is deprecated and will be removed in the future.\nUse LLAMA_CUDA instead")
|
||||||
|
set(LLAMA_CUDA ON)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (LLAMA_CUDA)
|
||||||
cmake_minimum_required(VERSION 3.17)
|
cmake_minimum_required(VERSION 3.17)
|
||||||
|
|
||||||
find_package(CUDAToolkit)
|
find_package(CUDAToolkit)
|
||||||
if (CUDAToolkit_FOUND)
|
if (CUDAToolkit_FOUND)
|
||||||
message(STATUS "cuBLAS found")
|
message(STATUS "CUDA found")
|
||||||
|
|
||||||
enable_language(CUDA)
|
enable_language(CUDA)
|
||||||
|
|
||||||
|
@ -373,7 +378,7 @@ if (LLAMA_CUBLAS)
|
||||||
file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu")
|
file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu")
|
||||||
list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")
|
list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")
|
||||||
|
|
||||||
add_compile_definitions(GGML_USE_CUBLAS)
|
add_compile_definitions(GGML_USE_CUDA)
|
||||||
if (LLAMA_CUDA_FORCE_DMMV)
|
if (LLAMA_CUDA_FORCE_DMMV)
|
||||||
add_compile_definitions(GGML_CUDA_FORCE_DMMV)
|
add_compile_definitions(GGML_CUDA_FORCE_DMMV)
|
||||||
endif()
|
endif()
|
||||||
|
@ -422,7 +427,7 @@ if (LLAMA_CUBLAS)
|
||||||
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
|
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
|
||||||
|
|
||||||
else()
|
else()
|
||||||
message(WARNING "cuBLAS not found")
|
message(WARNING "CUDA not found")
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
@ -525,7 +530,7 @@ if (LLAMA_HIPBLAS)
|
||||||
file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu")
|
file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu")
|
||||||
list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu")
|
list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu")
|
||||||
|
|
||||||
add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
|
add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA)
|
||||||
|
|
||||||
if (LLAMA_HIP_UMA)
|
if (LLAMA_HIP_UMA)
|
||||||
add_compile_definitions(GGML_HIP_UMA)
|
add_compile_definitions(GGML_HIP_UMA)
|
||||||
|
@ -830,7 +835,7 @@ endif()
|
||||||
|
|
||||||
set(CUDA_CXX_FLAGS "")
|
set(CUDA_CXX_FLAGS "")
|
||||||
|
|
||||||
if (LLAMA_CUBLAS)
|
if (LLAMA_CUDA)
|
||||||
set(CUDA_FLAGS -use_fast_math)
|
set(CUDA_FLAGS -use_fast_math)
|
||||||
|
|
||||||
if (LLAMA_FATAL_WARNINGS)
|
if (LLAMA_FATAL_WARNINGS)
|
||||||
|
@ -1055,7 +1060,7 @@ endif()
|
||||||
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
|
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
|
||||||
add_compile_options("$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
|
add_compile_options("$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
|
||||||
|
|
||||||
if (LLAMA_CUBLAS)
|
if (LLAMA_CUDA)
|
||||||
list(APPEND CUDA_CXX_FLAGS ${ARCH_FLAGS})
|
list(APPEND CUDA_CXX_FLAGS ${ARCH_FLAGS})
|
||||||
list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED) # pass host compiler flags as a single argument
|
list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED) # pass host compiler flags as a single argument
|
||||||
if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
|
if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
|
||||||
|
|
26
Makefile
26
Makefile
|
@ -390,12 +390,17 @@ ifdef LLAMA_BLIS
|
||||||
endif # LLAMA_BLIS
|
endif # LLAMA_BLIS
|
||||||
|
|
||||||
ifdef LLAMA_CUBLAS
|
ifdef LLAMA_CUBLAS
|
||||||
|
# $(error LLAMA_CUBLAS is deprecated. Use LLAMA_CUDA instead.)
|
||||||
|
LLAMA_CUDA := 1
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifdef LLAMA_CUDA
|
||||||
ifneq ('', '$(wildcard /opt/cuda)')
|
ifneq ('', '$(wildcard /opt/cuda)')
|
||||||
CUDA_PATH ?= /opt/cuda
|
CUDA_PATH ?= /opt/cuda
|
||||||
else
|
else
|
||||||
CUDA_PATH ?= /usr/local/cuda
|
CUDA_PATH ?= /usr/local/cuda
|
||||||
endif
|
endif
|
||||||
MK_CPPFLAGS += -DGGML_USE_CUBLAS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
|
MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
|
||||||
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
|
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
|
||||||
OBJS += ggml-cuda.o
|
OBJS += ggml-cuda.o
|
||||||
OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
|
OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
|
||||||
|
@ -462,7 +467,7 @@ endif
|
||||||
|
|
||||||
ifdef JETSON_EOL_MODULE_DETECT
|
ifdef JETSON_EOL_MODULE_DETECT
|
||||||
define NVCC_COMPILE
|
define NVCC_COMPILE
|
||||||
$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
|
$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
|
||||||
endef # NVCC_COMPILE
|
endef # NVCC_COMPILE
|
||||||
else
|
else
|
||||||
define NVCC_COMPILE
|
define NVCC_COMPILE
|
||||||
|
@ -476,7 +481,7 @@ ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/com
|
||||||
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
|
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
|
||||||
$(NVCC_COMPILE)
|
$(NVCC_COMPILE)
|
||||||
|
|
||||||
endif # LLAMA_CUBLAS
|
endif # LLAMA_CUDA
|
||||||
|
|
||||||
ifdef LLAMA_CLBLAST
|
ifdef LLAMA_CLBLAST
|
||||||
|
|
||||||
|
@ -533,7 +538,7 @@ ifdef LLAMA_HIPBLAS
|
||||||
LLAMA_CUDA_DMMV_X ?= 32
|
LLAMA_CUDA_DMMV_X ?= 32
|
||||||
LLAMA_CUDA_MMV_Y ?= 1
|
LLAMA_CUDA_MMV_Y ?= 1
|
||||||
LLAMA_CUDA_KQUANTS_ITER ?= 2
|
LLAMA_CUDA_KQUANTS_ITER ?= 2
|
||||||
MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
|
MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA
|
||||||
ifdef LLAMA_HIP_UMA
|
ifdef LLAMA_HIP_UMA
|
||||||
MK_CPPFLAGS += -DGGML_HIP_UMA
|
MK_CPPFLAGS += -DGGML_HIP_UMA
|
||||||
endif # LLAMA_HIP_UMA
|
endif # LLAMA_HIP_UMA
|
||||||
|
@ -609,7 +614,7 @@ override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
|
||||||
override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
|
override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
|
||||||
|
|
||||||
# identify CUDA host compiler
|
# identify CUDA host compiler
|
||||||
ifdef LLAMA_CUBLAS
|
ifdef LLAMA_CUDA
|
||||||
GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
|
GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
|
||||||
include scripts/get-flags.mk
|
include scripts/get-flags.mk
|
||||||
CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
|
CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
|
||||||
|
@ -634,7 +639,7 @@ $(info I NVCCFLAGS: $(NVCCFLAGS))
|
||||||
$(info I LDFLAGS: $(LDFLAGS))
|
$(info I LDFLAGS: $(LDFLAGS))
|
||||||
$(info I CC: $(shell $(CC) --version | head -n 1))
|
$(info I CC: $(shell $(CC) --version | head -n 1))
|
||||||
$(info I CXX: $(shell $(CXX) --version | head -n 1))
|
$(info I CXX: $(shell $(CXX) --version | head -n 1))
|
||||||
ifdef LLAMA_CUBLAS
|
ifdef LLAMA_CUDA
|
||||||
$(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
|
$(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
|
||||||
CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
|
CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
|
||||||
ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
|
ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
|
||||||
|
@ -644,9 +649,16 @@ $(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be exp
|
||||||
endif # CUDA_POWER_ARCH
|
endif # CUDA_POWER_ARCH
|
||||||
endif # CUDA_DOCKER_ARCH
|
endif # CUDA_DOCKER_ARCH
|
||||||
endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
|
endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
|
||||||
endif # LLAMA_CUBLAS
|
endif # LLAMA_CUDA
|
||||||
$(info )
|
$(info )
|
||||||
|
|
||||||
|
ifdef LLAMA_CUBLAS
|
||||||
|
$(info !!!!)
|
||||||
|
$(info LLAMA_CUBLAS is deprecated and will be removed in the future. Use LLAMA_CUDA instead.)
|
||||||
|
$(info !!!!)
|
||||||
|
$(info )
|
||||||
|
endif
|
||||||
|
|
||||||
#
|
#
|
||||||
# Build library
|
# Build library
|
||||||
#
|
#
|
||||||
|
|
11
README.md
11
README.md
|
@ -448,30 +448,27 @@ Building the program with BLAS support may lead to some performance improvements
|
||||||
|
|
||||||
Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.
|
Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.
|
||||||
|
|
||||||
- #### cuBLAS
|
- #### CUDA
|
||||||
|
|
||||||
This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
|
This provides GPU acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
|
||||||
|
|
||||||
For Jetson user, if you have Jetson Orin, you can try this: [Offical Support](https://www.jetson-ai-lab.com/tutorial_text-generation.html). If you are using an old model(nano/TX2), need some additional operations before compiling.
|
For Jetson user, if you have Jetson Orin, you can try this: [Offical Support](https://www.jetson-ai-lab.com/tutorial_text-generation.html). If you are using an old model(nano/TX2), need some additional operations before compiling.
|
||||||
|
|
||||||
- Using `make`:
|
- Using `make`:
|
||||||
```bash
|
```bash
|
||||||
make LLAMA_CUBLAS=1
|
make LLAMA_CUDA=1
|
||||||
```
|
```
|
||||||
- Using `CMake`:
|
- Using `CMake`:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
mkdir build
|
mkdir build
|
||||||
cd build
|
cd build
|
||||||
cmake .. -DLLAMA_CUBLAS=ON
|
cmake .. -DLLAMA_CUDA=ON
|
||||||
cmake --build . --config Release
|
cmake --build . --config Release
|
||||||
```
|
```
|
||||||
|
|
||||||
The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
|
The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
|
||||||
|
|
||||||
<!---
|
|
||||||
| LLAMA_CUDA_CUBLAS | Boolean | false | Use cuBLAS instead of custom CUDA kernels for prompt processing. Faster for all quantization formats except for q4_0 and q8_0, especially for k-quants. Increases VRAM usage (700 MiB for 7b, 970 MiB for 13b, 1430 MiB for 33b). |
|
|
||||||
--->
|
|
||||||
| Option | Legal values | Default | Description |
|
| Option | Legal values | Default | Description |
|
||||||
|--------------------------------|------------------------|---------|-------------|
|
|--------------------------------|------------------------|---------|-------------|
|
||||||
| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
|
| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
|
||||||
|
|
|
@ -40,7 +40,7 @@ if [ ! -z ${GG_BUILD_METAL} ]; then
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ ! -z ${GG_BUILD_CUDA} ]; then
|
if [ ! -z ${GG_BUILD_CUDA} ]; then
|
||||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1"
|
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUDA=1"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ ! -z ${GG_BUILD_SYCL} ]; then
|
if [ ! -z ${GG_BUILD_SYCL} ]; then
|
||||||
|
@ -412,8 +412,8 @@ function gg_run_open_llama_7b_v2 {
|
||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUBLAS=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
python3 ../convert.py ${path_models}
|
python3 ../convert.py ${path_models}
|
||||||
|
|
||||||
|
|
|
@ -48,12 +48,12 @@
|
||||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL))
|
#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL))
|
||||||
#define GGML_USE_CUBLAS_SYCL
|
#define GGML_USE_CUDA_SYCL
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN)
|
#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN)
|
||||||
#define GGML_USE_CUBLAS_SYCL_VULKAN
|
#define GGML_USE_CUDA_SYCL_VULKAN
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(LLAMA_USE_CURL)
|
#if defined(LLAMA_USE_CURL)
|
||||||
|
@ -861,9 +861,9 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
params.main_gpu = std::stoi(argv[i]);
|
params.main_gpu = std::stoi(argv[i]);
|
||||||
#ifndef GGML_USE_CUBLAS_SYCL
|
#ifndef GGML_USE_CUDA_SYCL
|
||||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n");
|
fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL. Setting the main GPU has no effect.\n");
|
||||||
#endif // GGML_USE_CUBLAS_SYCL
|
#endif // GGML_USE_CUDA_SYCL
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
if (arg == "--split-mode" || arg == "-sm") {
|
if (arg == "--split-mode" || arg == "-sm") {
|
||||||
|
@ -889,9 +889,9 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
#ifndef GGML_USE_CUBLAS_SYCL
|
#ifndef GGML_USE_CUDA_SYCL
|
||||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n");
|
fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL. Setting the split mode has no effect.\n");
|
||||||
#endif // GGML_USE_CUBLAS_SYCL
|
#endif // GGML_USE_CUDA_SYCL
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
if (arg == "--tensor-split" || arg == "-ts") {
|
if (arg == "--tensor-split" || arg == "-ts") {
|
||||||
|
@ -917,9 +917,9 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
||||||
params.tensor_split[i] = 0.0f;
|
params.tensor_split[i] = 0.0f;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#ifndef GGML_USE_CUBLAS_SYCL_VULKAN
|
#ifndef GGML_USE_CUDA_SYCL_VULKAN
|
||||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL/Vulkan. Setting a tensor split has no effect.\n");
|
fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n");
|
||||||
#endif // GGML_USE_CUBLAS_SYCL
|
#endif // GGML_USE_CUDA_SYCL_VULKAN
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
if (arg == "--no-mmap") {
|
if (arg == "--no-mmap") {
|
||||||
|
@ -2387,7 +2387,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
||||||
fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
|
fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
|
||||||
fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
|
fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
|
||||||
fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
|
fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
|
||||||
fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
|
fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false");
|
||||||
fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false");
|
fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false");
|
||||||
fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
|
fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
|
||||||
fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false");
|
fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false");
|
||||||
|
|
|
@ -22,7 +22,7 @@ For faster computation, make sure to use GPU offloading via the `-ngl` argument
|
||||||
## Example
|
## Example
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
LLAMA_CUBLAS=1 make -j
|
LLAMA_CUDA=1 make -j
|
||||||
|
|
||||||
# generate importance matrix (imatrix.dat)
|
# generate importance matrix (imatrix.dat)
|
||||||
./imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
|
./imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
|
||||||
|
|
|
@ -113,7 +113,7 @@ static std::string get_cpu_info() {
|
||||||
|
|
||||||
static std::string get_gpu_info() {
|
static std::string get_gpu_info() {
|
||||||
std::string id;
|
std::string id;
|
||||||
#ifdef GGML_USE_CUBLAS
|
#ifdef GGML_USE_CUDA
|
||||||
int count = ggml_backend_cuda_get_device_count();
|
int count = ggml_backend_cuda_get_device_count();
|
||||||
for (int i = 0; i < count; i++) {
|
for (int i = 0; i < count; i++) {
|
||||||
char buf[128];
|
char buf[128];
|
||||||
|
@ -808,7 +808,7 @@ struct test {
|
||||||
|
|
||||||
const std::string test::build_commit = LLAMA_COMMIT;
|
const std::string test::build_commit = LLAMA_COMMIT;
|
||||||
const int test::build_number = LLAMA_BUILD_NUMBER;
|
const int test::build_number = LLAMA_BUILD_NUMBER;
|
||||||
const bool test::cuda = !!ggml_cpu_has_cublas();
|
const bool test::cuda = !!ggml_cpu_has_cuda();
|
||||||
const bool test::opencl = !!ggml_cpu_has_clblast();
|
const bool test::opencl = !!ggml_cpu_has_clblast();
|
||||||
const bool test::vulkan = !!ggml_cpu_has_vulkan();
|
const bool test::vulkan = !!ggml_cpu_has_vulkan();
|
||||||
const bool test::kompute = !!ggml_cpu_has_kompute();
|
const bool test::kompute = !!ggml_cpu_has_kompute();
|
||||||
|
|
|
@ -7,7 +7,7 @@
|
||||||
#include "ggml-alloc.h"
|
#include "ggml-alloc.h"
|
||||||
#include "ggml-backend.h"
|
#include "ggml-backend.h"
|
||||||
|
|
||||||
#ifdef GGML_USE_CUBLAS
|
#ifdef GGML_USE_CUDA
|
||||||
#include "ggml-cuda.h"
|
#include "ggml-cuda.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -968,7 +968,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef GGML_USE_CUBLAS
|
#ifdef GGML_USE_CUDA
|
||||||
new_clip->backend = ggml_backend_cuda_init(0);
|
new_clip->backend = ggml_backend_cuda_init(0);
|
||||||
printf("%s: CLIP using CUDA backend\n", __func__);
|
printf("%s: CLIP using CUDA backend\n", __func__);
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -2510,15 +2510,15 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
#ifndef GGML_USE_CUBLAS
|
#ifndef GGML_USE_CUDA
|
||||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n");
|
fprintf(stderr, "warning: llama.cpp was compiled without CUDA. Setting the split mode has no effect.\n");
|
||||||
#endif // GGML_USE_CUBLAS
|
#endif // GGML_USE_CUDA
|
||||||
} else if (arg == "--tensor-split" || arg == "-ts") {
|
} else if (arg == "--tensor-split" || arg == "-ts") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
|
#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
|
||||||
std::string arg_next = argv[i];
|
std::string arg_next = argv[i];
|
||||||
|
|
||||||
// split string by , and /
|
// split string by , and /
|
||||||
|
@ -2535,17 +2535,17 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {});
|
LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a tensor split.\n", {});
|
||||||
#endif // GGML_USE_CUBLAS
|
#endif // GGML_USE_CUDA
|
||||||
} else if (arg == "--main-gpu" || arg == "-mg") {
|
} else if (arg == "--main-gpu" || arg == "-mg") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
|
#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
|
||||||
params.main_gpu = std::stoi(argv[i]);
|
params.main_gpu = std::stoi(argv[i]);
|
||||||
#else
|
#else
|
||||||
LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.", {});
|
LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a main GPU.", {});
|
||||||
#endif
|
#endif
|
||||||
} else if (arg == "--lora") {
|
} else if (arg == "--lora") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
|
|
|
@ -420,7 +420,7 @@ GGML_CALL static void ggml_backend_registry_init(void) {
|
||||||
ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL);
|
ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL);
|
||||||
|
|
||||||
// add forward decls here to avoid including the backend headers
|
// add forward decls here to avoid including the backend headers
|
||||||
#ifdef GGML_USE_CUBLAS
|
#ifdef GGML_USE_CUDA
|
||||||
extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
|
extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
|
||||||
ggml_backend_cuda_reg_devices();
|
ggml_backend_cuda_reg_devices();
|
||||||
#endif
|
#endif
|
||||||
|
|
8
ggml.c
8
ggml.c
|
@ -21674,15 +21674,15 @@ int ggml_cpu_has_wasm_simd(void) {
|
||||||
}
|
}
|
||||||
|
|
||||||
int ggml_cpu_has_blas(void) {
|
int ggml_cpu_has_blas(void) {
|
||||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
|
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
|
||||||
return 1;
|
return 1;
|
||||||
#else
|
#else
|
||||||
return 0;
|
return 0;
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
int ggml_cpu_has_cublas(void) {
|
int ggml_cpu_has_cuda(void) {
|
||||||
#if defined(GGML_USE_CUBLAS)
|
#if defined(GGML_USE_CUDA)
|
||||||
return 1;
|
return 1;
|
||||||
#else
|
#else
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -21722,7 +21722,7 @@ int ggml_cpu_has_sycl(void) {
|
||||||
}
|
}
|
||||||
|
|
||||||
int ggml_cpu_has_gpublas(void) {
|
int ggml_cpu_has_gpublas(void) {
|
||||||
return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
|
return ggml_cpu_has_cuda() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
|
||||||
ggml_cpu_has_sycl();
|
ggml_cpu_has_sycl();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
2
ggml.h
2
ggml.h
|
@ -2354,7 +2354,7 @@ extern "C" {
|
||||||
GGML_API int ggml_cpu_has_fp16_va (void);
|
GGML_API int ggml_cpu_has_fp16_va (void);
|
||||||
GGML_API int ggml_cpu_has_wasm_simd (void);
|
GGML_API int ggml_cpu_has_wasm_simd (void);
|
||||||
GGML_API int ggml_cpu_has_blas (void);
|
GGML_API int ggml_cpu_has_blas (void);
|
||||||
GGML_API int ggml_cpu_has_cublas (void);
|
GGML_API int ggml_cpu_has_cuda (void);
|
||||||
GGML_API int ggml_cpu_has_clblast (void);
|
GGML_API int ggml_cpu_has_clblast (void);
|
||||||
GGML_API int ggml_cpu_has_vulkan (void);
|
GGML_API int ggml_cpu_has_vulkan (void);
|
||||||
GGML_API int ggml_cpu_has_kompute (void);
|
GGML_API int ggml_cpu_has_kompute (void);
|
||||||
|
|
26
llama.cpp
26
llama.cpp
|
@ -7,7 +7,7 @@
|
||||||
#include "ggml-alloc.h"
|
#include "ggml-alloc.h"
|
||||||
#include "ggml-backend.h"
|
#include "ggml-backend.h"
|
||||||
|
|
||||||
#ifdef GGML_USE_CUBLAS
|
#ifdef GGML_USE_CUDA
|
||||||
# include "ggml-cuda.h"
|
# include "ggml-cuda.h"
|
||||||
#elif defined(GGML_USE_CLBLAST)
|
#elif defined(GGML_USE_CLBLAST)
|
||||||
# include "ggml-opencl.h"
|
# include "ggml-opencl.h"
|
||||||
|
@ -1505,7 +1505,7 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
|
||||||
static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
|
static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
|
||||||
ggml_backend_buffer_type_t buft = nullptr;
|
ggml_backend_buffer_type_t buft = nullptr;
|
||||||
|
|
||||||
#if defined(GGML_USE_CUBLAS)
|
#if defined(GGML_USE_CUDA)
|
||||||
// host buffers should only be used when data is expected to be copied to/from the GPU
|
// host buffers should only be used when data is expected to be copied to/from the GPU
|
||||||
if (host_buffer) {
|
if (host_buffer) {
|
||||||
buft = ggml_backend_cuda_host_buffer_type();
|
buft = ggml_backend_cuda_host_buffer_type();
|
||||||
|
@ -1535,7 +1535,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
|
||||||
|
|
||||||
#ifdef GGML_USE_METAL
|
#ifdef GGML_USE_METAL
|
||||||
buft = ggml_backend_metal_buffer_type();
|
buft = ggml_backend_metal_buffer_type();
|
||||||
#elif defined(GGML_USE_CUBLAS)
|
#elif defined(GGML_USE_CUDA)
|
||||||
buft = ggml_backend_cuda_buffer_type(gpu);
|
buft = ggml_backend_cuda_buffer_type(gpu);
|
||||||
#elif defined(GGML_USE_VULKAN)
|
#elif defined(GGML_USE_VULKAN)
|
||||||
buft = ggml_backend_vk_buffer_type(gpu);
|
buft = ggml_backend_vk_buffer_type(gpu);
|
||||||
|
@ -1561,7 +1561,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
|
||||||
static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
|
static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
|
||||||
ggml_backend_buffer_type_t buft = nullptr;
|
ggml_backend_buffer_type_t buft = nullptr;
|
||||||
|
|
||||||
#ifdef GGML_USE_CUBLAS
|
#ifdef GGML_USE_CUDA
|
||||||
if (ggml_backend_cuda_get_device_count() > 1) {
|
if (ggml_backend_cuda_get_device_count() > 1) {
|
||||||
buft = ggml_backend_cuda_split_buffer_type(tensor_split);
|
buft = ggml_backend_cuda_split_buffer_type(tensor_split);
|
||||||
}
|
}
|
||||||
|
@ -1582,7 +1582,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
|
||||||
}
|
}
|
||||||
|
|
||||||
static size_t llama_get_device_count() {
|
static size_t llama_get_device_count() {
|
||||||
#if defined(GGML_USE_CUBLAS)
|
#if defined(GGML_USE_CUDA)
|
||||||
return ggml_backend_cuda_get_device_count();
|
return ggml_backend_cuda_get_device_count();
|
||||||
#elif defined(GGML_USE_SYCL)
|
#elif defined(GGML_USE_SYCL)
|
||||||
return ggml_backend_sycl_get_device_count();
|
return ggml_backend_sycl_get_device_count();
|
||||||
|
@ -1594,7 +1594,7 @@ static size_t llama_get_device_count() {
|
||||||
}
|
}
|
||||||
|
|
||||||
static size_t llama_get_device_memory(int device) {
|
static size_t llama_get_device_memory(int device) {
|
||||||
#if defined(GGML_USE_CUBLAS)
|
#if defined(GGML_USE_CUDA)
|
||||||
size_t total;
|
size_t total;
|
||||||
size_t free;
|
size_t free;
|
||||||
ggml_backend_cuda_get_device_memory(device, &total, &free);
|
ggml_backend_cuda_get_device_memory(device, &total, &free);
|
||||||
|
@ -2080,7 +2080,7 @@ struct llama_model {
|
||||||
ggml_free(ctx);
|
ggml_free(ctx);
|
||||||
}
|
}
|
||||||
for (ggml_backend_buffer_t buf : bufs) {
|
for (ggml_backend_buffer_t buf : bufs) {
|
||||||
#ifdef GGML_USE_CUBLAS
|
#ifdef GGML_USE_CUDA
|
||||||
if (ggml_backend_buffer_get_type(buf) == ggml_backend_cpu_buffer_type()) {
|
if (ggml_backend_buffer_get_type(buf) == ggml_backend_cpu_buffer_type()) {
|
||||||
ggml_backend_cuda_unregister_host_buffer(ggml_backend_buffer_get_base(buf));
|
ggml_backend_cuda_unregister_host_buffer(ggml_backend_buffer_get_base(buf));
|
||||||
}
|
}
|
||||||
|
@ -5269,7 +5269,7 @@ static bool llm_load_tensors(
|
||||||
}
|
}
|
||||||
model.bufs.push_back(buf);
|
model.bufs.push_back(buf);
|
||||||
bufs.emplace(idx, buf);
|
bufs.emplace(idx, buf);
|
||||||
#ifdef GGML_USE_CUBLAS
|
#ifdef GGML_USE_CUDA
|
||||||
if (n_layer >= n_gpu_layers) {
|
if (n_layer >= n_gpu_layers) {
|
||||||
ggml_backend_cuda_register_host_buffer(
|
ggml_backend_cuda_register_host_buffer(
|
||||||
ggml_backend_buffer_get_base(buf),
|
ggml_backend_buffer_get_base(buf),
|
||||||
|
@ -13371,7 +13371,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
|
||||||
size_t llama_max_devices(void) {
|
size_t llama_max_devices(void) {
|
||||||
#if defined(GGML_USE_METAL)
|
#if defined(GGML_USE_METAL)
|
||||||
return 1;
|
return 1;
|
||||||
#elif defined(GGML_USE_CUBLAS)
|
#elif defined(GGML_USE_CUDA)
|
||||||
return GGML_CUDA_MAX_DEVICES;
|
return GGML_CUDA_MAX_DEVICES;
|
||||||
#elif defined(GGML_USE_SYCL)
|
#elif defined(GGML_USE_SYCL)
|
||||||
return GGML_SYCL_MAX_DEVICES;
|
return GGML_SYCL_MAX_DEVICES;
|
||||||
|
@ -13391,8 +13391,8 @@ bool llama_supports_mlock(void) {
|
||||||
}
|
}
|
||||||
|
|
||||||
bool llama_supports_gpu_offload(void) {
|
bool llama_supports_gpu_offload(void) {
|
||||||
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
|
#if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
|
||||||
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
|
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
|
||||||
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
||||||
return true;
|
return true;
|
||||||
#else
|
#else
|
||||||
|
@ -13597,7 +13597,7 @@ struct llama_context * llama_new_context_with_model(
|
||||||
}
|
}
|
||||||
ctx->backends.push_back(ctx->backend_metal);
|
ctx->backends.push_back(ctx->backend_metal);
|
||||||
}
|
}
|
||||||
#elif defined(GGML_USE_CUBLAS)
|
#elif defined(GGML_USE_CUDA)
|
||||||
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
||||||
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
|
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
|
||||||
ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
|
ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
|
||||||
|
@ -13744,7 +13744,7 @@ struct llama_context * llama_new_context_with_model(
|
||||||
|
|
||||||
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
|
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
|
||||||
bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER;
|
bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER;
|
||||||
#ifndef GGML_USE_CUBLAS
|
#ifndef GGML_USE_CUDA
|
||||||
// pipeline parallelism requires support for async compute and events
|
// pipeline parallelism requires support for async compute and events
|
||||||
// currently this is only implemented in the CUDA backend
|
// currently this is only implemented in the CUDA backend
|
||||||
pipeline_parallel = false;
|
pipeline_parallel = false;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue