Merge 346f64f0d8
into 0fd93cdef5
This commit is contained in:
commit
5d755b8e5c
23 changed files with 2645 additions and 25 deletions
|
@ -18,16 +18,19 @@
|
|||
vulkan-headers,
|
||||
vulkan-loader,
|
||||
curl,
|
||||
clblast,
|
||||
shaderc,
|
||||
useBlas ? builtins.all (x: !x) [
|
||||
useCuda
|
||||
useMetalKit
|
||||
useOpenCL
|
||||
useRocm
|
||||
useVulkan
|
||||
] && blas.meta.available,
|
||||
useCuda ? config.cudaSupport,
|
||||
useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin,
|
||||
useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
|
||||
useMpi ? false, # Increases the runtime closure size by ~700M
|
||||
useOpenCL ? false,
|
||||
useRocm ? config.rocmSupport,
|
||||
enableCurl ? true,
|
||||
useVulkan ? false,
|
||||
|
@ -56,6 +59,7 @@ let
|
|||
++ lib.optionals useCuda [ "CUDA" ]
|
||||
++ lib.optionals useMetalKit [ "MetalKit" ]
|
||||
++ lib.optionals useMpi [ "MPI" ]
|
||||
++ lib.optionals useOpenCL [ "OpenCL" ]
|
||||
++ lib.optionals useRocm [ "ROCm" ]
|
||||
++ lib.optionals useVulkan [ "Vulkan" ];
|
||||
|
||||
|
@ -207,6 +211,7 @@ effectiveStdenv.mkDerivation (
|
|||
optionals effectiveStdenv.isDarwin darwinBuildInputs
|
||||
++ optionals useCuda cudaBuildInputs
|
||||
++ optionals useMpi [ mpi ]
|
||||
++ optionals useOpenCL [ clblast ]
|
||||
++ optionals useRocm rocmBuildInputs
|
||||
++ optionals useBlas [ blas ]
|
||||
++ optionals useVulkan vulkanBuildInputs
|
||||
|
@ -220,6 +225,7 @@ effectiveStdenv.mkDerivation (
|
|||
(cmakeBool "LLAMA_CURL" enableCurl)
|
||||
(cmakeBool "GGML_NATIVE" false)
|
||||
(cmakeBool "GGML_BLAS" useBlas)
|
||||
(cmakeBool "GGML_CLBLAST" useOpenCL)
|
||||
(cmakeBool "GGML_CUDA" useCuda)
|
||||
(cmakeBool "GGML_HIPBLAS" useRocm)
|
||||
(cmakeBool "GGML_METAL" useMetalKit)
|
||||
|
@ -263,6 +269,7 @@ effectiveStdenv.mkDerivation (
|
|||
useCuda
|
||||
useMetalKit
|
||||
useMpi
|
||||
useOpenCL
|
||||
useRocm
|
||||
useVulkan
|
||||
;
|
||||
|
@ -289,7 +296,7 @@ effectiveStdenv.mkDerivation (
|
|||
# Configurations we don't want even the CI to evaluate. Results in the
|
||||
# "unsupported platform" messages. This is mostly a no-op, because
|
||||
# cudaPackages would've refused to evaluate anyway.
|
||||
badPlatforms = optionals useCuda lib.platforms.darwin;
|
||||
badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin;
|
||||
|
||||
# Configurations that are known to result in build failures. Can be
|
||||
# overridden by importing Nixpkgs with `allowBroken = true`.
|
||||
|
|
36
.github/workflows/build.yml
vendored
36
.github/workflows/build.yml
vendored
|
@ -690,6 +690,8 @@ jobs:
|
|||
|
||||
env:
|
||||
OPENBLAS_VERSION: 0.3.23
|
||||
OPENCL_VERSION: 2023.04.17
|
||||
CLBLAST_VERSION: 1.6.0
|
||||
SDE_VERSION: 9.33.0-2024-01-07
|
||||
VULKAN_VERSION: 1.3.261.1
|
||||
|
||||
|
@ -706,6 +708,8 @@ jobs:
|
|||
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'avx512-x64'
|
||||
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'clblast-x64'
|
||||
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
|
||||
- build: 'openblas-x64'
|
||||
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
|
||||
- build: 'kompute-x64'
|
||||
|
@ -730,6 +734,27 @@ jobs:
|
|||
run: |
|
||||
git submodule update --init ggml/src/kompute
|
||||
|
||||
- name: Download OpenCL SDK
|
||||
id: get_opencl
|
||||
if: ${{ matrix.build == 'clblast-x64' }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/opencl.zip -L "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/v${env:OPENCL_VERSION}/OpenCL-SDK-v${env:OPENCL_VERSION}-Win-x64.zip"
|
||||
mkdir $env:RUNNER_TEMP/opencl
|
||||
tar.exe -xvf $env:RUNNER_TEMP/opencl.zip --strip-components=1 -C $env:RUNNER_TEMP/opencl
|
||||
|
||||
- name: Download CLBlast
|
||||
id: get_clblast
|
||||
if: ${{ matrix.build == 'clblast-x64' }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
|
||||
curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
|
||||
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/clblast.7z
|
||||
rename-item $env:RUNNER_TEMP/CLBlast-${env:CLBLAST_VERSION}-windows-x64 clblast
|
||||
foreach ($f in (gci -Recurse -Path "$env:RUNNER_TEMP/clblast" -Filter '*.cmake')) {
|
||||
$txt = Get-Content -Path $f -Raw
|
||||
$txt.Replace('C:/vcpkg/packages/opencl_x64-windows/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8
|
||||
}
|
||||
|
||||
- name: Download OpenBLAS
|
||||
id: get_openblas
|
||||
if: ${{ matrix.build == 'openblas-x64' }}
|
||||
|
@ -763,6 +788,13 @@ jobs:
|
|||
cmake -S . -B build ${{ matrix.defines }}
|
||||
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
|
||||
|
||||
- name: Add clblast.dll
|
||||
id: add_clblast_dll
|
||||
if: ${{ matrix.build == 'clblast-x64' }}
|
||||
run: |
|
||||
cp $env:RUNNER_TEMP/clblast/lib/clblast.dll ./build/bin/Release
|
||||
cp $env:RUNNER_TEMP/CLBlast.LICENSE.txt ./build/bin/Release/CLBlast-${env:CLBLAST_VERSION}.txt
|
||||
|
||||
- name: Add libopenblas.dll
|
||||
id: add_libopenblas_dll
|
||||
if: ${{ matrix.build == 'openblas-x64' }}
|
||||
|
@ -786,7 +818,7 @@ jobs:
|
|||
- name: Test
|
||||
id: cmake_test
|
||||
# not all machines have native AVX-512
|
||||
if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
|
||||
if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'clblast-x64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main -C Release --verbose --timeout 900
|
||||
|
@ -1044,7 +1076,7 @@ jobs:
|
|||
# hypervisor: 'qemu'
|
||||
# run: |
|
||||
# sudo pkg update
|
||||
# sudo pkg install -y gmake automake autoconf pkgconf llvm15 openblas
|
||||
# sudo pkg install -y gmake automake autoconf pkgconf llvm15 clinfo clover opencl clblast openblas
|
||||
# gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j `sysctl -n hw.ncpu`
|
||||
|
||||
release:
|
||||
|
|
|
@ -42,6 +42,10 @@ endif()
|
|||
|
||||
option(BUILD_SHARED_LIBS "build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
|
||||
|
||||
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
|
||||
option(GGML_CLBLAST "llama: use CLBlast" OFF)
|
||||
|
||||
|
||||
if (WIN32)
|
||||
add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
|
||||
endif()
|
||||
|
@ -102,6 +106,7 @@ llama_option_depr(WARNING LLAMA_CUDA GGML_CUDA)
|
|||
llama_option_depr(WARNING LLAMA_KOMPUTE GGML_KOMPUTE)
|
||||
llama_option_depr(WARNING LLAMA_METAL GGML_METAL)
|
||||
llama_option_depr(WARNING LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
|
||||
llama_option_depr(WARNING LLAMA_CLBLAST GGML_CLBLAST)
|
||||
llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE)
|
||||
llama_option_depr(WARNING LLAMA_RPC GGML_RPC)
|
||||
llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL)
|
||||
|
@ -165,6 +170,16 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake
|
|||
${CMAKE_CURRENT_BINARY_DIR}/llama-version.cmake
|
||||
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/llama)
|
||||
|
||||
set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h"
|
||||
"${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
|
||||
"${GGML_HEADERS_METAL}" "${GGML_HEADERS_EXTRA}")
|
||||
|
||||
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
|
||||
install(TARGETS ggml PUBLIC_HEADER)
|
||||
|
||||
set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/llama.h)
|
||||
install(TARGETS llama LIBRARY PUBLIC_HEADER)
|
||||
|
||||
install(
|
||||
FILES convert_hf_to_gguf.py
|
||||
PERMISSIONS
|
||||
|
|
17
Makefile
17
Makefile
|
@ -746,6 +746,23 @@ ggml/src/ggml-cuda.o: \
|
|||
$(NVCC_COMPILE)
|
||||
endif # GGML_CUDA
|
||||
|
||||
ifdef LLAMA_CLBLAST
|
||||
MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
|
||||
MK_CFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
|
||||
MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
|
||||
|
||||
# Mac provides OpenCL as a framework
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
MK_LDFLAGS += -lclblast -framework OpenCL
|
||||
else
|
||||
MK_LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
|
||||
endif
|
||||
OBJS += ggml-opencl.o
|
||||
|
||||
ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
endif # LLAMA_CLBLAST
|
||||
|
||||
ifdef GGML_VULKAN
|
||||
MK_CPPFLAGS += -DGGML_USE_VULKAN
|
||||
MK_LDFLAGS += $(shell pkg-config --libs vulkan)
|
||||
|
|
|
@ -49,7 +49,7 @@ variety of hardware - locally and in the cloud.
|
|||
- AVX, AVX2 and AVX512 support for x86 architectures
|
||||
- 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
|
||||
- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP)
|
||||
- Vulkan and SYCL backend support
|
||||
- Vulkan, SYCL, and (partial) OpenCL backend support
|
||||
- CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
|
||||
|
||||
Since its [inception](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022), the project has
|
||||
|
|
|
@ -6,6 +6,7 @@ set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
|
|||
set(GGML_BLAS @GGML_BLAS@)
|
||||
set(GGML_CUDA @GGML_CUDA@)
|
||||
set(GGML_METAL @GGML_METAL@)
|
||||
set(GGML_CLBLAST @GGML_CLBLAST@)
|
||||
set(GGML_HIPBLAS @GGML_HIPBLAS@)
|
||||
set(GGML_ACCELERATE @GGML_ACCELERATE@)
|
||||
set(GGML_VULKAN @GGML_VULKAN@)
|
||||
|
@ -44,6 +45,11 @@ if (GGML_METAL)
|
|||
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
|
||||
endif()
|
||||
|
||||
if (GGML_CLBLAST)
|
||||
find_package(CLBlast REQUIRED)
|
||||
endif()
|
||||
|
||||
|
||||
if (GGML_VULKAN)
|
||||
find_package(Vulkan REQUIRED)
|
||||
endif()
|
||||
|
|
|
@ -3128,6 +3128,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
|
|||
fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
|
||||
|
|
|
@ -30,7 +30,7 @@ The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based o
|
|||
|
||||
When targeting **Intel CPU**, it is recommended to use llama.cpp for [Intel oneMKL](README.md#intel-onemkl) backend.
|
||||
|
||||
It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.
|
||||
It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, CLBlast etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.
|
||||
|
||||
## Recommended Release
|
||||
|
||||
|
|
|
@ -162,7 +162,7 @@ $ ./llama-bench -o csv
|
|||
```
|
||||
|
||||
```csv
|
||||
build_commit,build_number,cuda,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
|
||||
build_commit,build_number,cuda,opencl,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
|
||||
"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","512","0","2023-09-23T12:09:01Z","212155977","732372","2413.341687","8.305961"
|
||||
"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","0","128","2023-09-23T12:09:02Z","969320879","2728399","132.052051","0.371342"
|
||||
```
|
||||
|
@ -179,6 +179,7 @@ $ ./llama-bench -o json
|
|||
"build_commit": "3469684",
|
||||
"build_number": 1275,
|
||||
"cuda": true,
|
||||
"opencl": false,
|
||||
"metal": false,
|
||||
"gpu_blas": true,
|
||||
"blas": true,
|
||||
|
@ -209,6 +210,7 @@ $ ./llama-bench -o json
|
|||
"build_commit": "3469684",
|
||||
"build_number": 1275,
|
||||
"cuda": true,
|
||||
"opencl": false,
|
||||
"metal": false,
|
||||
"gpu_blas": true,
|
||||
"blas": true,
|
||||
|
@ -251,6 +253,7 @@ CREATE TABLE IF NOT EXISTS test (
|
|||
build_commit TEXT,
|
||||
build_number INTEGER,
|
||||
cuda INTEGER,
|
||||
opencl INTEGER,
|
||||
metal INTEGER,
|
||||
gpu_blas INTEGER,
|
||||
blas INTEGER,
|
||||
|
@ -276,6 +279,6 @@ CREATE TABLE IF NOT EXISTS test (
|
|||
stddev_ts REAL
|
||||
);
|
||||
|
||||
INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634');
|
||||
INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692');
|
||||
INSERT INTO test (build_commit, build_number, cuda, opencl, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634');
|
||||
INSERT INTO test (build_commit, build_number, cuda, opencl, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692');
|
||||
```
|
||||
|
|
|
@ -754,6 +754,7 @@ struct test {
|
|||
static const std::string build_commit;
|
||||
static const int build_number;
|
||||
static const bool cuda;
|
||||
static const bool opencl;
|
||||
static const bool vulkan;
|
||||
static const bool kompute;
|
||||
static const bool metal;
|
||||
|
@ -843,6 +844,9 @@ struct test {
|
|||
if (cuda) {
|
||||
return GGML_CUDA_NAME;
|
||||
}
|
||||
if (opencl) {
|
||||
return "OpenCL";
|
||||
}
|
||||
if (vulkan) {
|
||||
return "Vulkan";
|
||||
}
|
||||
|
@ -868,7 +872,7 @@ struct test {
|
|||
static const std::vector<std::string> & get_fields() {
|
||||
static const std::vector<std::string> fields = {
|
||||
"build_commit", "build_number",
|
||||
"cuda", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas",
|
||||
"cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas",
|
||||
"cpu_info", "gpu_info",
|
||||
"model_filename", "model_type", "model_size", "model_n_params",
|
||||
"n_batch", "n_ubatch",
|
||||
|
@ -894,7 +898,7 @@ struct test {
|
|||
field == "avg_ns" || field == "stddev_ns") {
|
||||
return INT;
|
||||
}
|
||||
if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
|
||||
if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" ||
|
||||
field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
|
||||
field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
|
||||
return BOOL;
|
||||
|
@ -924,7 +928,7 @@ struct test {
|
|||
std::vector<std::string> values = {
|
||||
build_commit, std::to_string(build_number),
|
||||
std::to_string(cuda), std::to_string(vulkan), std::to_string(vulkan),
|
||||
std::to_string(metal), std::to_string(sycl), std::to_string(has_rpc), std::to_string(gpu_blas), std::to_string(blas),
|
||||
std::to_string(metal), std::to_string(opencl), std::to_string(sycl), std::to_string(has_rpc), std::to_string(gpu_blas), std::to_string(blas),
|
||||
cpu_info, gpu_info,
|
||||
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
|
||||
std::to_string(n_batch), std::to_string(n_ubatch),
|
||||
|
@ -952,6 +956,7 @@ struct test {
|
|||
const std::string test::build_commit = LLAMA_COMMIT;
|
||||
const int test::build_number = LLAMA_BUILD_NUMBER;
|
||||
const bool test::cuda = !!ggml_cpu_has_cuda();
|
||||
const bool test::opencl = !!ggml_cpu_has_clblast();
|
||||
const bool test::vulkan = !!ggml_cpu_has_vulkan();
|
||||
const bool test::kompute = !!ggml_cpu_has_kompute();
|
||||
const bool test::metal = !!ggml_cpu_has_metal();
|
||||
|
|
|
@ -8,14 +8,16 @@ Because this example is "outside of the source tree", it is important to first b
|
|||
|
||||
### Considerations
|
||||
|
||||
When hardware acceleration libraries are used (e.g. CUDA, Metal, etc.), CMake must be able to locate the associated CMake package.
|
||||
When hardware acceleration libraries are used (e.g. CUDA, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_.
|
||||
|
||||
### Build llama.cpp and install to C:\LlamaCPP directory
|
||||
|
||||
In this case, CLBlast was already installed so the CMake package is referenced in `CMAKE_PREFIX_PATH`.
|
||||
|
||||
```cmd
|
||||
git clone https://github.com/ggerganov/llama.cpp
|
||||
cd llama.cpp
|
||||
cmake -B build -DBUILD_SHARED_LIBS=OFF -G "Visual Studio 17 2022" -A x64
|
||||
cmake -B build -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=C:/CLBlast/lib/cmake/CLBlast -G "Visual Studio 17 2022" -A x64
|
||||
cmake --build build --config Release
|
||||
cmake --install build --prefix C:/LlamaCPP
|
||||
```
|
||||
|
@ -25,7 +27,7 @@ cmake --install build --prefix C:/LlamaCPP
|
|||
|
||||
```cmd
|
||||
cd ..\examples\main-cmake-pkg
|
||||
cmake -B build -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64
|
||||
cmake -B build -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/CLBlast/lib/cmake/CLBlast;C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64
|
||||
cmake --build build --config Release
|
||||
cmake --install build --prefix C:/MyLlamaApp
|
||||
```
|
||||
|
|
|
@ -159,6 +159,7 @@
|
|||
windows = config.legacyPackages.llamaPackagesWindows.llama-cpp;
|
||||
}
|
||||
// lib.optionalAttrs pkgs.stdenv.isLinux {
|
||||
opencl = config.packages.default.override { useOpenCL = true; };
|
||||
cuda = config.legacyPackages.llamaPackagesCuda.llama-cpp;
|
||||
|
||||
mpi-cpu = config.packages.default.override { useMpi = true; };
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
// An interface allowing to compute ggml_cgraph with Metal
|
||||
//
|
||||
// This is a fully functional interface that extends ggml with GPU support for Apple devices.
|
||||
// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, etc.)
|
||||
// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, OpenCL, etc.)
|
||||
//
|
||||
// How it works?
|
||||
//
|
||||
|
|
36
ggml/include/ggml-opencl.h
Normal file
36
ggml/include/ggml-opencl.h
Normal file
|
@ -0,0 +1,36 @@
|
|||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-backend.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
GGML_API void ggml_cl_init(void);
|
||||
|
||||
GGML_API void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
GGML_API void ggml_cl_add(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst);
|
||||
GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
GGML_API void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
|
||||
|
||||
// GGML_API void * ggml_cl_host_malloc(size_t size);
|
||||
// GGML_API void ggml_cl_host_free(void * ptr);
|
||||
|
||||
GGML_API void ggml_cl_free_data(const struct ggml_tensor* tensor);
|
||||
|
||||
GGML_API void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);
|
||||
|
||||
// backend API
|
||||
|
||||
// GGML_API ggml_backend_t ggml_backend_opencl_init(void);
|
||||
|
||||
// GGML_API bool ggml_backend_is_opencl(ggml_backend_t backend);
|
||||
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void);
|
||||
// GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
|
@ -2397,6 +2397,7 @@ extern "C" {
|
|||
GGML_API int ggml_cpu_has_wasm_simd (void);
|
||||
GGML_API int ggml_cpu_has_blas (void);
|
||||
GGML_API int ggml_cpu_has_cuda (void);
|
||||
GGML_API int ggml_cpu_has_clblast (void);
|
||||
GGML_API int ggml_cpu_has_vulkan (void);
|
||||
GGML_API int ggml_cpu_has_kompute (void);
|
||||
GGML_API int ggml_cpu_has_gpublas (void);
|
||||
|
|
|
@ -507,6 +507,23 @@ if (GGML_HIPBLAS)
|
|||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} PUBLIC hip::host roc::rocblas roc::hipblas)
|
||||
endif()
|
||||
|
||||
if (GGML_CLBLAST)
|
||||
find_package(CLBlast)
|
||||
if (CLBlast_FOUND)
|
||||
message(STATUS "CLBlast found")
|
||||
|
||||
set(GGML_HEADERS_OPENCL ggml-opencl.h)
|
||||
set(GGML_SOURCES_OPENCL ggml-opencl.cpp)
|
||||
|
||||
add_compile_definitions(GGML_USE_CLBLAST)
|
||||
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} clblast)
|
||||
else()
|
||||
message(WARNING "CLBlast not found")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
||||
if (GGML_SYCL)
|
||||
if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL|NVIDIA)$")
|
||||
message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL or NVIDIA")
|
||||
|
|
2305
ggml/src/ggml-opencl.cpp
Normal file
2305
ggml/src/ggml-opencl.cpp
Normal file
File diff suppressed because it is too large
Load diff
157
ggml/src/ggml.c
157
ggml/src/ggml.c
|
@ -378,6 +378,17 @@ inline static void * ggml_calloc(size_t num, size_t size) {
|
|||
|
||||
#if defined(GGML_USE_ACCELERATE)
|
||||
#include <Accelerate/Accelerate.h>
|
||||
#if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
|
||||
#include "ggml-opencl.h"
|
||||
#endif
|
||||
#elif defined(GGML_USE_OPENBLAS)
|
||||
#if defined(GGML_BLAS_USE_MKL)
|
||||
#include <mkl.h>
|
||||
#else
|
||||
#include <cblas.h>
|
||||
#endif
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
#include "ggml-opencl.h"
|
||||
#endif
|
||||
|
||||
// floating point type used to accumulate sums
|
||||
|
@ -3516,6 +3527,9 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|||
GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
|
||||
}
|
||||
|
||||
#if defined(GGML_USE_CLBLAST)
|
||||
ggml_cl_init();
|
||||
#endif
|
||||
is_first_call = false;
|
||||
}
|
||||
|
||||
|
@ -9126,6 +9140,17 @@ static void ggml_compute_forward_add_f32(
|
|||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
#ifdef GGML_USE_CLBLAST
|
||||
if (src1->backend == GGML_BACKEND_TYPE_GPU) {
|
||||
// TODO: OpenCL kernel support full broadcast
|
||||
GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
|
||||
if (ith == 0) {
|
||||
ggml_cl_add(src0, src1, dst);
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
const int nr = ggml_nrows(src0);
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
@ -10190,6 +10215,17 @@ static void ggml_compute_forward_mul_f32(
|
|||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
#if defined(GGML_USE_CLBLAST)
|
||||
if (src1->backend == GGML_BACKEND_TYPE_GPU) {
|
||||
// TODO: OpenCL kernel support full broadcast
|
||||
GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
|
||||
if (ith == 0) {
|
||||
ggml_cl_mul(src0, src1, dst);
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
const int64_t nr = ggml_nrows(src0);
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
@ -12315,6 +12351,82 @@ static void ggml_compute_forward_mul_mat(
|
|||
// nb01 >= nb00 - src0 is not transposed
|
||||
// compute by src0 rows
|
||||
|
||||
#if defined(GGML_USE_CLBLAST)
|
||||
if (ggml_cl_can_mul_mat(src0, src1, dst)) {
|
||||
if (params->ith == 0 && params->type == GGML_TASK_TYPE_COMPUTE) {
|
||||
ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
if (ggml_compute_forward_mul_mat_use_blas(dst)) {
|
||||
const int64_t ne_plane = ne01*ne00;
|
||||
const size_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
|
||||
UNUSED(desired_wsize);
|
||||
|
||||
if (params->type == GGML_TASK_TYPE_INIT) {
|
||||
if (type != GGML_TYPE_F32) {
|
||||
assert(params->wsize >= desired_wsize);
|
||||
// parallelize by src0 rows
|
||||
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
||||
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
||||
// broadcast src0 into src1 across 2nd,3rd dimension
|
||||
const int64_t i03 = i13/r3;
|
||||
const int64_t i02 = i12/r2;
|
||||
|
||||
const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
|
||||
float * const wdata = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
|
||||
ggml_to_float_t const to_float = type_traits[type].to_float;
|
||||
|
||||
for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
|
||||
to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (params->type == GGML_TASK_TYPE_FINALIZE) {
|
||||
return;
|
||||
}
|
||||
|
||||
// perform sgemm, parallelization controlled by blas lib
|
||||
if (ith != 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
//const int64_t tgemm0 = ggml_perf_time_us();
|
||||
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
||||
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
||||
const int64_t i03 = i13/r3;
|
||||
const int64_t i02 = i12/r2;
|
||||
|
||||
const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
|
||||
const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
|
||||
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
||||
|
||||
if (type != GGML_TYPE_F32) {
|
||||
x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
|
||||
}
|
||||
|
||||
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
||||
ne1, ne01, ne10,
|
||||
1.0f, y, ne10,
|
||||
x, ne00,
|
||||
0.0f, d, ne01);
|
||||
}
|
||||
}
|
||||
//printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2);
|
||||
|
||||
//printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
|
||||
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if GGML_USE_LLAMAFILE
|
||||
// broadcast factors
|
||||
const int64_t r2 = ne12 / ne02;
|
||||
|
@ -12731,6 +12843,22 @@ static void ggml_compute_forward_out_prod_f32(
|
|||
// nb01 >= nb00 - src0 is not transposed
|
||||
// compute by src0 rows
|
||||
|
||||
// TODO: #if defined(GGML_USE_CLBLAST)
|
||||
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
bool use_blas = ggml_is_matrix(src0) &&
|
||||
ggml_is_matrix(src1) &&
|
||||
ggml_is_contiguous(src0) &&
|
||||
(ggml_is_contiguous(src1) || ggml_is_transposed(src1));
|
||||
#endif
|
||||
|
||||
if (params->type == GGML_TASK_TYPE_INIT) {
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst
|
||||
if (use_blas) {
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (ith == 0) {
|
||||
ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
|
||||
}
|
||||
|
@ -18733,6 +18861,22 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
|
|||
{
|
||||
const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;
|
||||
|
||||
#if defined(GGML_USE_CLBLAST)
|
||||
if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
|
||||
cur = ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node);
|
||||
} else
|
||||
#endif
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
if (ggml_compute_forward_mul_mat_use_blas(node)) {
|
||||
if (node->src[0]->type != GGML_TYPE_F32) {
|
||||
// here we need memory for fully dequantized matrix from src0
|
||||
// take into account that src0 can be broadcasted into src1[2,3]
|
||||
cur = ggml_type_size(GGML_TYPE_F32)
|
||||
* node->src[0]->ne[0]*node->src[0]->ne[1]
|
||||
* node->src[1]->ne[2]*node->src[1]->ne[3];
|
||||
}
|
||||
} else
|
||||
#endif
|
||||
if (node->src[1]->type != vec_dot_type) {
|
||||
cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
|
||||
}
|
||||
|
@ -22022,7 +22166,7 @@ int ggml_cpu_has_wasm_simd(void) {
|
|||
}
|
||||
|
||||
int ggml_cpu_has_blas(void) {
|
||||
#if defined(GGML_USE_BLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
|
||||
#if defined(GGML_USE_BLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
|
@ -22037,6 +22181,14 @@ int ggml_cpu_has_cuda(void) {
|
|||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_clblast(void) {
|
||||
#if defined(GGML_USE_CLBLAST)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_vulkan(void) {
|
||||
#if defined(GGML_USE_VULKAN)
|
||||
return 1;
|
||||
|
@ -22086,7 +22238,8 @@ int ggml_cpu_has_llamafile(void) {
|
|||
}
|
||||
|
||||
int ggml_cpu_has_gpublas(void) {
|
||||
return ggml_cpu_has_cuda() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() || ggml_cpu_has_sycl();
|
||||
return ggml_cpu_has_cuda() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
|
||||
ggml_cpu_has_sycl();
|
||||
}
|
||||
|
||||
int ggml_cpu_has_sse3(void) {
|
||||
|
|
|
@ -19,17 +19,17 @@ logger = logging.getLogger("compare-llama-bench")
|
|||
|
||||
# Properties by which to differentiate results per commit:
|
||||
KEY_PROPERTIES = [
|
||||
"cpu_info", "gpu_info", "n_gpu_layers", "cuda", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas",
|
||||
"cpu_info", "gpu_info", "n_gpu_layers", "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas",
|
||||
"blas", "model_filename", "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "embeddings", "n_threads",
|
||||
"type_k", "type_v", "use_mmap", "no_kv_offload", "split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen"
|
||||
]
|
||||
|
||||
# Properties that are boolean and are converted to Yes/No for the table:
|
||||
BOOL_PROPERTIES = ["cuda", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas", "embeddings", "use_mmap", "no_kv_offload", "flash_attn"]
|
||||
BOOL_PROPERTIES = ["cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas", "embeddings", "use_mmap", "no_kv_offload", "flash_attn"]
|
||||
|
||||
# Header names for the table:
|
||||
PRETTY_NAMES = {
|
||||
"cuda": "CUDA", "vulkan": "Vulkan", "kompute": "Kompute", "metal": "Metal", "sycl": "SYCL", "rpc": "RPC",
|
||||
"cuda": "CUDA", "opencl": "OpenCL", "vulkan": "Vulkan", "kompute": "Kompute", "metal": "Metal", "sycl": "SYCL", "rpc": "RPC",
|
||||
"gpu_blas": "GPU BLAS", "blas": "BLAS", "cpu_info": "CPU", "gpu_info": "GPU", "model_filename": "File", "model_type": "Model",
|
||||
"model_size": "Model Size [GiB]", "model_n_params": "Num. of Par.", "n_batch": "Batch size", "n_ubatch": "Microbatch size",
|
||||
"n_threads": "Threads", "type_k": "K type", "type_v": "V type", "n_gpu_layers": "GPU layers", "split_mode": "Split mode",
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
# Helper script for deploying llama.cpp server with a single Bash command
|
||||
#
|
||||
# - Works on Linux and macOS
|
||||
# - Supports: CPU, CUDA, Metal
|
||||
# - Supports: CPU, CUDA, Metal, OpenCL
|
||||
# - Can run all GGUF models from HuggingFace
|
||||
# - Can serve requests in parallel
|
||||
# - Always builds latest llama.cpp from GitHub
|
||||
|
@ -19,7 +19,7 @@
|
|||
# --port: port number, default is 8888
|
||||
# --repo: path to a repo containing GGUF model files
|
||||
# --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input
|
||||
# --backend: cpu, cuda, metal, depends on the OS
|
||||
# --backend: cpu, cuda, metal, opencl, depends on the OS
|
||||
# --gpu-id: gpu id, default is 0
|
||||
# --n-parallel: number of parallel requests, default is 8
|
||||
# --n-kv: KV cache size, default is 4096
|
||||
|
@ -72,7 +72,7 @@ function print_usage {
|
|||
printf " --port: port number, default is 8888\n"
|
||||
printf " --repo: path to a repo containing GGUF model files\n"
|
||||
printf " --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input\n"
|
||||
printf " --backend: cpu, cuda, metal, depends on the OS\n"
|
||||
printf " --backend: cpu, cuda, metal, opencl, depends on the OS\n"
|
||||
printf " --gpu-id: gpu id, default is 0\n"
|
||||
printf " --n-parallel: number of parallel requests, default is 8\n"
|
||||
printf " --n-kv: KV cache size, default is 4096\n"
|
||||
|
@ -387,6 +387,9 @@ elif [[ "$backend" == "cpu" ]]; then
|
|||
elif [[ "$backend" == "metal" ]]; then
|
||||
printf "[+] Building with Metal backend\n"
|
||||
make -j llama-server $log
|
||||
elif [[ "$backend" == "opencl" ]]; then
|
||||
printf "[+] Building with OpenCL backend\n"
|
||||
LLAMA_CLBLAST=1 make -j llama-server $log
|
||||
else
|
||||
printf "[-] Unknown backend: %s\n" "$backend"
|
||||
exit 1
|
||||
|
@ -404,6 +407,8 @@ elif [[ "$backend" == "cpu" ]]; then
|
|||
args="-ngl 0"
|
||||
elif [[ "$backend" == "metal" ]]; then
|
||||
args="-ngl 999"
|
||||
elif [[ "$backend" == "opencl" ]]; then
|
||||
args="-ngl 999"
|
||||
else
|
||||
printf "[-] Unknown backend: %s\n" "$backend"
|
||||
exit 1
|
||||
|
|
|
@ -117,6 +117,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
|
|||
# src/ggml-impl.h -> ggml/src/ggml-impl.h
|
||||
# src/ggml-kompute.cpp -> ggml/src/ggml-kompute.cpp
|
||||
# src/ggml-metal.m -> ggml/src/ggml-metal.m
|
||||
# src/ggml-opencl.cpp -> ggml/src/ggml-opencl.cpp
|
||||
# src/ggml-opencl.h -> ggml/src/ggml-opencl.h
|
||||
# src/ggml-quants.c -> ggml/src/ggml-quants.c
|
||||
# src/ggml-quants.h -> ggml/src/ggml-quants.h
|
||||
# src/ggml-rpc.cpp -> ggml/src/ggml-rpc.cpp
|
||||
|
@ -164,6 +166,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
|
|||
-e 's/([[:space:]]|[ab]\/)src\/ggml-impl\.h/\1ggml\/src\/ggml-impl.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-kompute\.cpp/\1ggml\/src\/ggml-kompute.cpp/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-metal\.m/\1ggml\/src\/ggml-metal.m/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-opencl\.cpp/\1ggml\/src\/ggml-opencl.cpp/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-quants\.c/\1ggml\/src\/ggml-quants.c/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-quants\.h/\1ggml\/src\/ggml-quants.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-rpc\.cpp/\1ggml\/src\/ggml-rpc.cpp/g' \
|
||||
|
@ -179,6 +182,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
|
|||
-e 's/([[:space:]]|[ab]\/)include\/ggml-cuda\.h/\1ggml\/include\/ggml-cuda.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)include\/ggml-kompute\.h/\1ggml\/include\/ggml-kompute.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)include\/ggml-metal\.h/\1ggml\/include\/ggml-metal.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)include\/ggml-opencl\.h/\1ggml\/include\/ggml-opencl.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)include\/ggml-rpc\.h/\1ggml\/include\/ggml-rpc.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)include\/ggml-sycl\.h/\1ggml\/include\/ggml-sycl.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)include\/ggml-vulkan\.h/\1ggml\/include\/ggml-vulkan.h/g' \
|
||||
|
|
|
@ -19,6 +19,7 @@ cp -rpv ../ggml/src/ggml-impl.h ./ggml/src/ggml-impl.h
|
|||
cp -rpv ../ggml/src/ggml-kompute.cpp ./ggml/src/ggml-kompute.cpp
|
||||
cp -rpv ../ggml/src/ggml-metal.m ./ggml/src/ggml-metal.m
|
||||
cp -rpv ../ggml/src/ggml-metal.metal ./ggml/src/ggml-metal.metal
|
||||
cp -rpv ../ggml/src/ggml-opencl.cpp ./ggml/src/ggml-opencl.cpp
|
||||
cp -rpv ../ggml/src/ggml-quants.c ./ggml/src/ggml-quants.c
|
||||
cp -rpv ../ggml/src/ggml-quants.h ./ggml/src/ggml-quants.h
|
||||
cp -rpv ../ggml/src/ggml-rpc.cpp ./ggml/src/ggml-rpc.cpp
|
||||
|
@ -35,6 +36,7 @@ cp -rpv ../ggml/include/ggml-cann.h ./ggml/include/ggml-cann.h
|
|||
cp -rpv ../ggml/include/ggml-cuda.h ./ggml/include/ggml-cuda.h
|
||||
cp -rpv ../ggml/include/ggml-kompute.h ./ggml/include/ggml-kompute.h
|
||||
cp -rpv ../ggml/include/ggml-metal.h ./ggml/include/ggml-metal.h
|
||||
cp -rpv ../ggml/include/ggml-opencl.h ./ggml/include/ggml-opencl.h
|
||||
cp -rpv ../ggml/include/ggml-rpc.h ./ggml/include/ggml-rpc.h
|
||||
cp -rpv ../ggml/include/ggml-sycl.h ./ggml/include/ggml-sycl.h
|
||||
cp -rpv ../ggml/include/ggml-vulkan.h ./ggml/include/ggml-vulkan.h
|
||||
|
|
|
@ -15,6 +15,8 @@
|
|||
|
||||
#ifdef GGML_USE_CUDA
|
||||
# include "ggml-cuda.h"
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
# include "ggml-opencl.h"
|
||||
#elif defined(GGML_USE_VULKAN)
|
||||
# include "ggml-vulkan.h"
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
|
@ -2840,6 +2842,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
|
|||
buft = ggml_backend_vk_buffer_type(gpu);
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
buft = ggml_backend_sycl_buffer_type(gpu);
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
buft = ggml_backend_opencl_buffer_type();
|
||||
#elif defined(GGML_USE_KOMPUTE)
|
||||
buft = ggml_backend_kompute_buffer_type(gpu);
|
||||
if (buft == nullptr) {
|
||||
|
@ -2960,6 +2964,10 @@ static bool llama_kv_cache_init(
|
|||
}
|
||||
}
|
||||
|
||||
#ifdef GGML_USE_CLBLAST
|
||||
offload = false;
|
||||
#endif
|
||||
|
||||
// count used buffer types
|
||||
std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
|
||||
if (offload) {
|
||||
|
@ -16502,7 +16510,7 @@ bool llama_supports_mlock(void) {
|
|||
}
|
||||
|
||||
bool llama_supports_gpu_offload(void) {
|
||||
#if defined(GGML_USE_CUDA) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
|
||||
#if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
|
||||
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
|
||||
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
||||
return true;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue