Manually adjusted.

This reverts commit 554c247caf.
This reverts commit 257f8e41e2.

Signed-off-by: David Heidelberg <david@ixit.cz>

parent: 8cd1bcfd3f
commit: 346f64f0d8

23 changed files with 2645 additions and 25 deletions
@@ -18,16 +18,19 @@
   vulkan-headers,
   vulkan-loader,
   curl,
+  clblast,
   shaderc,
   useBlas ? builtins.all (x: !x) [
     useCuda
     useMetalKit
+    useOpenCL
     useRocm
     useVulkan
   ] && blas.meta.available,
   useCuda ? config.cudaSupport,
-  useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin,
+  useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
   useMpi ? false, # Increases the runtime closure size by ~700M
+  useOpenCL ? false,
   useRocm ? config.rocmSupport,
   enableCurl ? true,
   useVulkan ? false,

@@ -56,6 +59,7 @@ let
     ++ lib.optionals useCuda [ "CUDA" ]
     ++ lib.optionals useMetalKit [ "MetalKit" ]
     ++ lib.optionals useMpi [ "MPI" ]
+    ++ lib.optionals useOpenCL [ "OpenCL" ]
     ++ lib.optionals useRocm [ "ROCm" ]
     ++ lib.optionals useVulkan [ "Vulkan" ];

@@ -207,6 +211,7 @@ effectiveStdenv.mkDerivation (
       optionals effectiveStdenv.isDarwin darwinBuildInputs
       ++ optionals useCuda cudaBuildInputs
       ++ optionals useMpi [ mpi ]
+      ++ optionals useOpenCL [ clblast ]
       ++ optionals useRocm rocmBuildInputs
       ++ optionals useBlas [ blas ]
       ++ optionals useVulkan vulkanBuildInputs

@@ -220,6 +225,7 @@ effectiveStdenv.mkDerivation (
       (cmakeBool "LLAMA_CURL" enableCurl)
       (cmakeBool "GGML_NATIVE" false)
       (cmakeBool "GGML_BLAS" useBlas)
+      (cmakeBool "GGML_CLBLAST" useOpenCL)
       (cmakeBool "GGML_CUDA" useCuda)
       (cmakeBool "GGML_HIPBLAS" useRocm)
       (cmakeBool "GGML_METAL" useMetalKit)

@@ -263,6 +269,7 @@ effectiveStdenv.mkDerivation (
         useCuda
         useMetalKit
         useMpi
+        useOpenCL
         useRocm
         useVulkan
         ;

@@ -289,7 +296,7 @@ effectiveStdenv.mkDerivation (
       # Configurations we don't want even the CI to evaluate. Results in the
       # "unsupported platform" messages. This is mostly a no-op, because
      # cudaPackages would've refused to evaluate anyway.
-      badPlatforms = optionals useCuda lib.platforms.darwin;
+      badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin;

       # Configurations that are known to result in build failures. Can be
       # overridden by importing Nixpkgs with `allowBroken = true`.
.github/workflows/build.yml (vendored): 36 changes

@@ -690,6 +690,8 @@ jobs:

    env:
      OPENBLAS_VERSION: 0.3.23
+     OPENCL_VERSION: 2023.04.17
+     CLBLAST_VERSION: 1.6.0
      SDE_VERSION: 9.33.0-2024-01-07
      VULKAN_VERSION: 1.3.261.1

@@ -706,6 +708,8 @@ jobs:
            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
          - build: 'avx512-x64'
            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON'
+         - build: 'clblast-x64'
+           defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
          - build: 'openblas-x64'
            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
          - build: 'kompute-x64'

@@ -730,6 +734,27 @@ jobs:
        run: |
          git submodule update --init ggml/src/kompute

+     - name: Download OpenCL SDK
+       id: get_opencl
+       if: ${{ matrix.build == 'clblast-x64' }}
+       run: |
+         curl.exe -o $env:RUNNER_TEMP/opencl.zip -L "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/v${env:OPENCL_VERSION}/OpenCL-SDK-v${env:OPENCL_VERSION}-Win-x64.zip"
+         mkdir $env:RUNNER_TEMP/opencl
+         tar.exe -xvf $env:RUNNER_TEMP/opencl.zip --strip-components=1 -C $env:RUNNER_TEMP/opencl
+
+     - name: Download CLBlast
+       id: get_clblast
+       if: ${{ matrix.build == 'clblast-x64' }}
+       run: |
+         curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
+         curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
+         7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/clblast.7z
+         rename-item $env:RUNNER_TEMP/CLBlast-${env:CLBLAST_VERSION}-windows-x64 clblast
+         foreach ($f in (gci -Recurse -Path "$env:RUNNER_TEMP/clblast" -Filter '*.cmake')) {
+           $txt = Get-Content -Path $f -Raw
+           $txt.Replace('C:/vcpkg/packages/opencl_x64-windows/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8
+         }
+
      - name: Download OpenBLAS
        id: get_openblas
        if: ${{ matrix.build == 'openblas-x64' }}

@@ -763,6 +788,13 @@ jobs:
          cmake -S . -B build ${{ matrix.defines }}
          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}

+     - name: Add clblast.dll
+       id: add_clblast_dll
+       if: ${{ matrix.build == 'clblast-x64' }}
+       run: |
+         cp $env:RUNNER_TEMP/clblast/lib/clblast.dll ./build/bin/Release
+         cp $env:RUNNER_TEMP/CLBlast.LICENSE.txt ./build/bin/Release/CLBlast-${env:CLBLAST_VERSION}.txt
+
      - name: Add libopenblas.dll
        id: add_libopenblas_dll
        if: ${{ matrix.build == 'openblas-x64' }}

@@ -786,7 +818,7 @@ jobs:
      - name: Test
        id: cmake_test
        # not all machines have native AVX-512
-       if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
+       if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'clblast-x64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
        run: |
          cd build
          ctest -L main -C Release --verbose --timeout 900

@@ -1044,7 +1076,7 @@ jobs:
  #     hypervisor: 'qemu'
  #     run: |
  #       sudo pkg update
- #       sudo pkg install -y gmake automake autoconf pkgconf llvm15 openblas
+ #       sudo pkg install -y gmake automake autoconf pkgconf llvm15 clinfo clover opencl clblast openblas
  #       gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j `sysctl -n hw.ncpu`

  release:
@@ -42,6 +42,10 @@ endif()

 option(BUILD_SHARED_LIBS "build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})

+set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
+option(GGML_CLBLAST "llama: use CLBlast" OFF)
+
 if (WIN32)
     add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
 endif()

@@ -102,6 +106,7 @@ llama_option_depr(WARNING LLAMA_CUDA GGML_CUDA)
 llama_option_depr(WARNING LLAMA_KOMPUTE GGML_KOMPUTE)
 llama_option_depr(WARNING LLAMA_METAL GGML_METAL)
 llama_option_depr(WARNING LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
+llama_option_depr(WARNING LLAMA_CLBLAST GGML_CLBLAST)
 llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE)
 llama_option_depr(WARNING LLAMA_RPC GGML_RPC)
 llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL)

@@ -165,6 +170,16 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake
              ${CMAKE_CURRENT_BINARY_DIR}/llama-version.cmake
        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/llama)

+set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h"
+    "${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
+    "${GGML_HEADERS_METAL}" "${GGML_HEADERS_EXTRA}")
+
+set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
+install(TARGETS ggml PUBLIC_HEADER)
+
+set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/llama.h)
+install(TARGETS llama LIBRARY PUBLIC_HEADER)
+
 install(
    FILES convert_hf_to_gguf.py
    PERMISSIONS
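For a local build outside CI, the reinstated option can be exercised directly. A minimal sketch, assuming CLBlast and an OpenCL ICD loader are installed where CMake can find them; the `GGML_CLBLAST` option and the deprecated `LLAMA_CLBLAST` spelling both come from the hunks above:

    # configure and build with the restored CLBlast/OpenCL path
    cmake -B build -DGGML_CLBLAST=ON
    cmake --build build --config Release
    # the old flag still works but is routed through llama_option_depr() with a warning:
    # cmake -B build -DLLAMA_CLBLAST=ON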
Makefile: 17 changes

@@ -746,6 +746,23 @@ ggml/src/ggml-cuda.o: \
	$(NVCC_COMPILE)
 endif # GGML_CUDA

+ifdef LLAMA_CLBLAST
+    MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
+    MK_CFLAGS   += $(shell pkg-config --cflags-only-other clblast OpenCL)
+    MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
+
+    # Mac provides OpenCL as a framework
+    ifeq ($(UNAME_S),Darwin)
+        MK_LDFLAGS += -lclblast -framework OpenCL
+    else
+        MK_LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
+    endif
+    OBJS += ggml-opencl.o
+
+ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+endif # LLAMA_CLBLAST
+
 ifdef GGML_VULKAN
     MK_CPPFLAGS += -DGGML_USE_VULKAN
     MK_LDFLAGS += $(shell pkg-config --libs vulkan)
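As a usage sketch of the Makefile path just restored (the flag name and pkg-config package names are taken from the hunk; the presence of clblast and OpenCL .pc files on the system is assumed):

    # check the dependencies the Makefile queries via pkg-config
    pkg-config --exists clblast OpenCL && echo "CLBlast + OpenCL found"
    # build the server binary with the CLBlast backend enabled
    LLAMA_CLBLAST=1 make -j llama-server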
@@ -49,7 +49,7 @@ variety of hardware - locally and in the cloud.
 - AVX, AVX2 and AVX512 support for x86 architectures
 - 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
 - Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP)
-- Vulkan and SYCL backend support
+- Vulkan, SYCL, and (partial) OpenCL backend support
 - CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity

 Since its [inception](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022), the project has
@@ -6,6 +6,7 @@ set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
 set(GGML_BLAS @GGML_BLAS@)
 set(GGML_CUDA @GGML_CUDA@)
 set(GGML_METAL @GGML_METAL@)
+set(GGML_CLBLAST @GGML_CLBLAST@)
 set(GGML_HIPBLAS @GGML_HIPBLAS@)
 set(GGML_ACCELERATE @GGML_ACCELERATE@)
 set(GGML_VULKAN @GGML_VULKAN@)

@@ -44,6 +45,11 @@ if (GGML_METAL)
     find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
 endif()

+if (GGML_CLBLAST)
+    find_package(CLBlast REQUIRED)
+endif()
+
 if (GGML_VULKAN)
     find_package(Vulkan REQUIRED)
 endif()
@@ -3128,6 +3128,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
     fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false");
     fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false");
+    fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
     fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false");
     fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
     fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
@@ -30,7 +30,7 @@ The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based o

 When targeting **Intel CPU**, it is recommended to use llama.cpp for [Intel oneMKL](README.md#intel-onemkl) backend.

-It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.
+It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, CLBlast etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.

 ## Recommended Release
@@ -162,7 +162,7 @@ $ ./llama-bench -o csv
 ```

 ```csv
-build_commit,build_number,cuda,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
+build_commit,build_number,cuda,opencl,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
 "3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","512","0","2023-09-23T12:09:01Z","212155977","732372","2413.341687","8.305961"
 "3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","0","128","2023-09-23T12:09:02Z","969320879","2728399","132.052051","0.371342"
 ```

@@ -179,6 +179,7 @@ $ ./llama-bench -o json
   "build_commit": "3469684",
   "build_number": 1275,
   "cuda": true,
+  "opencl": false,
   "metal": false,
   "gpu_blas": true,
   "blas": true,

@@ -209,6 +210,7 @@ $ ./llama-bench -o json
   "build_commit": "3469684",
   "build_number": 1275,
   "cuda": true,
+  "opencl": false,
   "metal": false,
   "gpu_blas": true,
   "blas": true,

@@ -251,6 +253,7 @@ CREATE TABLE IF NOT EXISTS test (
   build_commit TEXT,
   build_number INTEGER,
   cuda INTEGER,
+  opencl INTEGER,
   metal INTEGER,
   gpu_blas INTEGER,
   blas INTEGER,

@@ -276,6 +279,6 @@ CREATE TABLE IF NOT EXISTS test (
   stddev_ts REAL
 );

-INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634');
+INSERT INTO test (build_commit, build_number, cuda, opencl, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634');
-INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692');
+INSERT INTO test (build_commit, build_number, cuda, opencl, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692');
 ```
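A small usage sketch for the SQL output documented above. The `-o sql` flag is an assumption, inferred by analogy with the `-o csv` and `-o json` sections, and the database file name is made up; the emitted statements are plain CREATE TABLE/INSERT, so they can be piped into sqlite3 if it is installed:

    # collect results into a database, then inspect the opencl column added by this change
    ./llama-bench -o sql | sqlite3 llama-bench.sqlite
    sqlite3 llama-bench.sqlite "SELECT build_commit, cuda, opencl, avg_ts FROM test;"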
@@ -754,6 +754,7 @@ struct test {
     static const std::string build_commit;
     static const int build_number;
     static const bool cuda;
+    static const bool opencl;
     static const bool vulkan;
     static const bool kompute;
     static const bool metal;

@@ -843,6 +844,9 @@ struct test {
         if (cuda) {
             return GGML_CUDA_NAME;
         }
+        if (opencl) {
+            return "OpenCL";
+        }
         if (vulkan) {
             return "Vulkan";
         }

@@ -868,7 +872,7 @@ struct test {
     static const std::vector<std::string> & get_fields() {
         static const std::vector<std::string> fields = {
             "build_commit", "build_number",
-            "cuda", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas",
+            "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas",
             "cpu_info", "gpu_info",
             "model_filename", "model_type", "model_size", "model_n_params",
             "n_batch", "n_ubatch",

@@ -894,7 +898,7 @@ struct test {
             field == "avg_ns" || field == "stddev_ns") {
             return INT;
         }
-        if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
+        if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" ||
             field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
             field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
             return BOOL;

@@ -924,7 +928,7 @@ struct test {
         std::vector<std::string> values = {
             build_commit, std::to_string(build_number),
             std::to_string(cuda), std::to_string(vulkan), std::to_string(vulkan),
-            std::to_string(metal), std::to_string(sycl), std::to_string(has_rpc), std::to_string(gpu_blas), std::to_string(blas),
+            std::to_string(metal), std::to_string(opencl), std::to_string(sycl), std::to_string(has_rpc), std::to_string(gpu_blas), std::to_string(blas),
             cpu_info, gpu_info,
             model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
             std::to_string(n_batch), std::to_string(n_ubatch),

@@ -952,6 +956,7 @@ struct test {
 const std::string test::build_commit = LLAMA_COMMIT;
 const int test::build_number = LLAMA_BUILD_NUMBER;
 const bool test::cuda = !!ggml_cpu_has_cuda();
+const bool test::opencl = !!ggml_cpu_has_clblast();
 const bool test::vulkan = !!ggml_cpu_has_vulkan();
 const bool test::kompute = !!ggml_cpu_has_kompute();
 const bool test::metal = !!ggml_cpu_has_metal();
@@ -8,14 +8,16 @@ Because this example is "outside of the source tree", it is important to first b

 ### Considerations

-When hardware acceleration libraries are used (e.g. CUDA, Metal, etc.), CMake must be able to locate the associated CMake package.
+When hardware acceleration libraries are used (e.g. CUDA, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_.

 ### Build llama.cpp and install to C:\LlamaCPP directory

+In this case, CLBlast was already installed so the CMake package is referenced in `CMAKE_PREFIX_PATH`.
+
 ```cmd
 git clone https://github.com/ggerganov/llama.cpp
 cd llama.cpp
-cmake -B build -DBUILD_SHARED_LIBS=OFF -G "Visual Studio 17 2022" -A x64
+cmake -B build -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=C:/CLBlast/lib/cmake/CLBlast -G "Visual Studio 17 2022" -A x64
 cmake --build build --config Release
 cmake --install build --prefix C:/LlamaCPP
 ```

@@ -25,7 +27,7 @@ cmake --install build --prefix C:/LlamaCPP

 ```cmd
 cd ..\examples\main-cmake-pkg
-cmake -B build -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64
+cmake -B build -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/CLBlast/lib/cmake/CLBlast;C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64
 cmake --build build --config Release
 cmake --install build --prefix C:/MyLlamaApp
 ```
@@ -159,6 +159,7 @@
           windows = config.legacyPackages.llamaPackagesWindows.llama-cpp;
         }
         // lib.optionalAttrs pkgs.stdenv.isLinux {
+          opencl = config.packages.default.override { useOpenCL = true; };
           cuda = config.legacyPackages.llamaPackagesCuda.llama-cpp;

           mpi-cpu = config.packages.default.override { useMpi = true; };
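A build sketch for the Nix side of the change. The command below evaluates the same override that the hunk registers under the `opencl` attribute; the exact invocation is an assumption (it relies only on `packages.default` exposing `.override { useOpenCL = true; }`, which the hunk itself uses):

    # ad-hoc equivalent of the `opencl` attribute added above (Linux)
    nix build --impure --expr \
      '(builtins.getFlake (toString ./.)).packages.${builtins.currentSystem}.default.override { useOpenCL = true; }'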
@@ -1,7 +1,7 @@
 // An interface allowing to compute ggml_cgraph with Metal
 //
 // This is a fully functional interface that extends ggml with GPU support for Apple devices.
-// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, etc.)
+// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, OpenCL, etc.)
 //
 // How it works?
 //
ggml/include/ggml-opencl.h (new file, 36 lines)

@@ -0,0 +1,36 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+GGML_API void ggml_cl_init(void);
+
+GGML_API void   ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+GGML_API void   ggml_cl_add(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+GGML_API bool   ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst);
+GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+GGML_API void   ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
+
+// GGML_API void * ggml_cl_host_malloc(size_t size);
+// GGML_API void   ggml_cl_host_free(void * ptr);
+
+GGML_API void ggml_cl_free_data(const struct ggml_tensor* tensor);
+
+GGML_API void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);
+
+// backend API
+
+// GGML_API ggml_backend_t ggml_backend_opencl_init(void);
+
+// GGML_API bool ggml_backend_is_opencl(ggml_backend_t backend);
+
+GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void);
+// GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void);
+
+#ifdef  __cplusplus
+}
+#endif
@@ -2397,6 +2397,7 @@ extern "C" {
     GGML_API int ggml_cpu_has_wasm_simd (void);
     GGML_API int ggml_cpu_has_blas      (void);
     GGML_API int ggml_cpu_has_cuda      (void);
+    GGML_API int ggml_cpu_has_clblast   (void);
     GGML_API int ggml_cpu_has_vulkan    (void);
     GGML_API int ggml_cpu_has_kompute   (void);
     GGML_API int ggml_cpu_has_gpublas   (void);
@@ -507,6 +507,23 @@ if (GGML_HIPBLAS)
     set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} PUBLIC hip::host roc::rocblas roc::hipblas)
 endif()

+if (GGML_CLBLAST)
+    find_package(CLBlast)
+    if (CLBlast_FOUND)
+        message(STATUS "CLBlast found")
+
+        set(GGML_HEADERS_OPENCL ggml-opencl.h)
+        set(GGML_SOURCES_OPENCL ggml-opencl.cpp)
+
+        add_compile_definitions(GGML_USE_CLBLAST)
+
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} clblast)
+    else()
+        message(WARNING "CLBlast not found")
+    endif()
+endif()
+
 if (GGML_SYCL)
     if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL|NVIDIA)$")
         message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL or NVIDIA")
ggml/src/ggml-opencl.cpp (new file, 2305 lines)

File diff suppressed because it is too large.
ggml/src/ggml.c: 157 changes

@@ -378,6 +378,17 @@ inline static void * ggml_calloc(size_t num, size_t size) {

 #if defined(GGML_USE_ACCELERATE)
 #include <Accelerate/Accelerate.h>
+#if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
+#include "ggml-opencl.h"
+#endif
+#elif defined(GGML_USE_OPENBLAS)
+#if defined(GGML_BLAS_USE_MKL)
+#include <mkl.h>
+#else
+#include <cblas.h>
+#endif
+#elif defined(GGML_USE_CLBLAST)
+#include "ggml-opencl.h"
 #endif

 // floating point type used to accumulate sums

@@ -3516,6 +3527,9 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
             GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
         }

+#if defined(GGML_USE_CLBLAST)
+        ggml_cl_init();
+#endif
         is_first_call = false;
     }

@@ -9126,6 +9140,17 @@ static void ggml_compute_forward_add_f32(
     const int ith = params->ith;
     const int nth = params->nth;

+#ifdef GGML_USE_CLBLAST
+    if (src1->backend == GGML_BACKEND_TYPE_GPU) {
+        // TODO: OpenCL kernel support full broadcast
+        GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
+        if (ith == 0) {
+            ggml_cl_add(src0, src1, dst);
+        }
+        return;
+    }
+#endif
+
     const int nr = ggml_nrows(src0);

     GGML_TENSOR_BINARY_OP_LOCALS

@@ -10190,6 +10215,17 @@ static void ggml_compute_forward_mul_f32(
     const int ith = params->ith;
     const int nth = params->nth;

+#if defined(GGML_USE_CLBLAST)
+    if (src1->backend == GGML_BACKEND_TYPE_GPU) {
+        // TODO: OpenCL kernel support full broadcast
+        GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
+        if (ith == 0) {
+            ggml_cl_mul(src0, src1, dst);
+        }
+        return;
+    }
+#endif
+
     const int64_t nr = ggml_nrows(src0);

     GGML_TENSOR_BINARY_OP_LOCALS

@@ -12315,6 +12351,82 @@ static void ggml_compute_forward_mul_mat(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows

+#if defined(GGML_USE_CLBLAST)
+    if (ggml_cl_can_mul_mat(src0, src1, dst)) {
+        if (params->ith == 0 && params->type == GGML_TASK_TYPE_COMPUTE) {
+            ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
+        }
+        return;
+    }
+#endif
+
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+    if (ggml_compute_forward_mul_mat_use_blas(dst)) {
+        const int64_t ne_plane = ne01*ne00;
+        const size_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
+        UNUSED(desired_wsize);
+
+        if (params->type == GGML_TASK_TYPE_INIT) {
+            if (type != GGML_TYPE_F32) {
+                assert(params->wsize >= desired_wsize);
+                // parallelize by src0 rows
+                for (int64_t i13 = 0; i13 < ne13; i13++) {
+                    for (int64_t i12 = 0; i12 < ne12; i12++) {
+                        // broadcast src0 into src1 across 2nd,3rd dimension
+                        const int64_t i03 = i13/r3;
+                        const int64_t i02 = i12/r2;
+
+                        const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
+                        float * const wdata = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
+                        ggml_to_float_t const to_float = type_traits[type].to_float;
+
+                        for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
+                            to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
+                        }
+                    }
+                }
+            }
+            return;
+        }
+
+        if (params->type == GGML_TASK_TYPE_FINALIZE) {
+            return;
+        }
+
+        // perform sgemm, parallelization controlled by blas lib
+        if (ith != 0) {
+            return;
+        }
+
+        //const int64_t tgemm0 = ggml_perf_time_us();
+        for (int64_t i13 = 0; i13 < ne13; i13++) {
+            for (int64_t i12 = 0; i12 < ne12; i12++) {
+                const int64_t i03 = i13/r3;
+                const int64_t i02 = i12/r2;
+
+                const void  * x = (char *) src0->data + i02*nb02 + i03*nb03;
+                const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
+                float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+
+                if (type != GGML_TYPE_F32) {
+                    x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
+                }
+
+                cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
+                        ne1, ne01, ne10,
+                        1.0f, y, ne10,
+                        x, ne00,
+                        0.0f, d, ne01);
+            }
+        }
+        //printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2);
+
+        //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
+
+        return;
+    }
+#endif
+
 #if GGML_USE_LLAMAFILE
     // broadcast factors
     const int64_t r2 = ne12 / ne02;

@@ -12731,6 +12843,22 @@ static void ggml_compute_forward_out_prod_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows

+    // TODO: #if defined(GGML_USE_CLBLAST)
+
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+    bool use_blas = ggml_is_matrix(src0) &&
+        ggml_is_matrix(src1) &&
+        ggml_is_contiguous(src0) &&
+        (ggml_is_contiguous(src1) || ggml_is_transposed(src1));
+#endif
+
+    if (params->type == GGML_TASK_TYPE_INIT) {
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst
+        if (use_blas) {
+            return;
+        }
+#endif
+
         if (ith == 0) {
             ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
         }

@@ -18733,6 +18861,22 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
                 {
                     const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;

+#if defined(GGML_USE_CLBLAST)
+                    if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
+                        cur = ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node);
+                    } else
+#endif
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+                    if (ggml_compute_forward_mul_mat_use_blas(node)) {
+                        if (node->src[0]->type != GGML_TYPE_F32) {
+                            // here we need memory for fully dequantized matrix from src0
+                            // take into account that src0 can be broadcasted into src1[2,3]
+                            cur = ggml_type_size(GGML_TYPE_F32)
+                                * node->src[0]->ne[0]*node->src[0]->ne[1]
+                                * node->src[1]->ne[2]*node->src[1]->ne[3];
+                        }
+                    } else
+#endif
                     if (node->src[1]->type != vec_dot_type) {
                         cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
                     }

@@ -22022,7 +22166,7 @@ int ggml_cpu_has_wasm_simd(void) {
 }

 int ggml_cpu_has_blas(void) {
-#if defined(GGML_USE_BLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_BLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
     return 1;
 #else
     return 0;

@@ -22037,6 +22181,14 @@ int ggml_cpu_has_cuda(void) {
 #endif
 }

+int ggml_cpu_has_clblast(void) {
+#if defined(GGML_USE_CLBLAST)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_vulkan(void) {
 #if defined(GGML_USE_VULKAN)
     return 1;

@@ -22086,7 +22238,8 @@ int ggml_cpu_has_llamafile(void) {
 }

 int ggml_cpu_has_gpublas(void) {
-    return ggml_cpu_has_cuda() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() || ggml_cpu_has_sycl();
+    return ggml_cpu_has_cuda() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
+           ggml_cpu_has_sycl();
 }

 int ggml_cpu_has_sse3(void) {
@@ -19,17 +19,17 @@ logger = logging.getLogger("compare-llama-bench")

 # Properties by which to differentiate results per commit:
 KEY_PROPERTIES = [
-    "cpu_info", "gpu_info", "n_gpu_layers", "cuda", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas",
+    "cpu_info", "gpu_info", "n_gpu_layers", "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas",
     "blas", "model_filename", "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "embeddings", "n_threads",
     "type_k", "type_v", "use_mmap", "no_kv_offload", "split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen"
 ]

 # Properties that are boolean and are converted to Yes/No for the table:
-BOOL_PROPERTIES = ["cuda", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas", "embeddings", "use_mmap", "no_kv_offload", "flash_attn"]
+BOOL_PROPERTIES = ["cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas", "embeddings", "use_mmap", "no_kv_offload", "flash_attn"]

 # Header names for the table:
 PRETTY_NAMES = {
-    "cuda": "CUDA", "vulkan": "Vulkan", "kompute": "Kompute", "metal": "Metal", "sycl": "SYCL", "rpc": "RPC",
+    "cuda": "CUDA", "opencl": "OpenCL", "vulkan": "Vulkan", "kompute": "Kompute", "metal": "Metal", "sycl": "SYCL", "rpc": "RPC",
     "gpu_blas": "GPU BLAS", "blas": "BLAS", "cpu_info": "CPU", "gpu_info": "GPU", "model_filename": "File", "model_type": "Model",
     "model_size": "Model Size [GiB]", "model_n_params": "Num. of Par.", "n_batch": "Batch size", "n_ubatch": "Microbatch size",
     "n_threads": "Threads", "type_k": "K type", "type_v": "V type", "n_gpu_layers": "GPU layers", "split_mode": "Split mode",
@@ -3,7 +3,7 @@
 # Helper script for deploying llama.cpp server with a single Bash command
 #
 # - Works on Linux and macOS
-# - Supports: CPU, CUDA, Metal
+# - Supports: CPU, CUDA, Metal, OpenCL
 # - Can run all GGUF models from HuggingFace
 # - Can serve requests in parallel
 # - Always builds latest llama.cpp from GitHub

@@ -19,7 +19,7 @@
 # --port: port number, default is 8888
 # --repo: path to a repo containing GGUF model files
 # --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input
-# --backend: cpu, cuda, metal, depends on the OS
+# --backend: cpu, cuda, metal, opencl, depends on the OS
 # --gpu-id: gpu id, default is 0
 # --n-parallel: number of parallel requests, default is 8
 # --n-kv: KV cache size, default is 4096

@@ -72,7 +72,7 @@ function print_usage {
     printf "  --port: port number, default is 8888\n"
     printf "  --repo: path to a repo containing GGUF model files\n"
     printf "  --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input\n"
-    printf "  --backend: cpu, cuda, metal, depends on the OS\n"
+    printf "  --backend: cpu, cuda, metal, opencl, depends on the OS\n"
     printf "  --gpu-id: gpu id, default is 0\n"
     printf "  --n-parallel: number of parallel requests, default is 8\n"
     printf "  --n-kv: KV cache size, default is 4096\n"

@@ -387,6 +387,9 @@ elif [[ "$backend" == "cpu" ]]; then
 elif [[ "$backend" == "metal" ]]; then
     printf "[+] Building with Metal backend\n"
     make -j llama-server $log
+elif [[ "$backend" == "opencl" ]]; then
+    printf "[+] Building with OpenCL backend\n"
+    LLAMA_CLBLAST=1 make -j llama-server $log
 else
     printf "[-] Unknown backend: %s\n" "$backend"
     exit 1

@@ -404,6 +407,8 @@ elif [[ "$backend" == "cpu" ]]; then
     args="-ngl 0"
 elif [[ "$backend" == "metal" ]]; then
     args="-ngl 999"
+elif [[ "$backend" == "opencl" ]]; then
+    args="-ngl 999"
 else
     printf "[-] Unknown backend: %s\n" "$backend"
     exit 1
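A usage sketch for the new backend value (the script path is an assumption; the flag names and defaults come from the hunks above):

    # deploy the server using the OpenCL/CLBlast build path added above
    ./scripts/server-llm.sh --backend opencl --port 8888 --n-parallel 8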
@@ -117,6 +117,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
     # src/ggml-impl.h -> ggml/src/ggml-impl.h
     # src/ggml-kompute.cpp -> ggml/src/ggml-kompute.cpp
     # src/ggml-metal.m -> ggml/src/ggml-metal.m
+    # src/ggml-opencl.cpp -> ggml/src/ggml-opencl.cpp
+    # src/ggml-opencl.h -> ggml/src/ggml-opencl.h
     # src/ggml-quants.c -> ggml/src/ggml-quants.c
     # src/ggml-quants.h -> ggml/src/ggml-quants.h
     # src/ggml-rpc.cpp -> ggml/src/ggml-rpc.cpp

@@ -164,6 +166,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
         -e 's/([[:space:]]|[ab]\/)src\/ggml-impl\.h/\1ggml\/src\/ggml-impl.h/g' \
         -e 's/([[:space:]]|[ab]\/)src\/ggml-kompute\.cpp/\1ggml\/src\/ggml-kompute.cpp/g' \
         -e 's/([[:space:]]|[ab]\/)src\/ggml-metal\.m/\1ggml\/src\/ggml-metal.m/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-opencl\.cpp/\1ggml\/src\/ggml-opencl.cpp/g' \
         -e 's/([[:space:]]|[ab]\/)src\/ggml-quants\.c/\1ggml\/src\/ggml-quants.c/g' \
         -e 's/([[:space:]]|[ab]\/)src\/ggml-quants\.h/\1ggml\/src\/ggml-quants.h/g' \
         -e 's/([[:space:]]|[ab]\/)src\/ggml-rpc\.cpp/\1ggml\/src\/ggml-rpc.cpp/g' \

@@ -179,6 +182,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
         -e 's/([[:space:]]|[ab]\/)include\/ggml-cuda\.h/\1ggml\/include\/ggml-cuda.h/g' \
         -e 's/([[:space:]]|[ab]\/)include\/ggml-kompute\.h/\1ggml\/include\/ggml-kompute.h/g' \
         -e 's/([[:space:]]|[ab]\/)include\/ggml-metal\.h/\1ggml\/include\/ggml-metal.h/g' \
+        -e 's/([[:space:]]|[ab]\/)include\/ggml-opencl\.h/\1ggml\/include\/ggml-opencl.h/g' \
         -e 's/([[:space:]]|[ab]\/)include\/ggml-rpc\.h/\1ggml\/include\/ggml-rpc.h/g' \
         -e 's/([[:space:]]|[ab]\/)include\/ggml-sycl\.h/\1ggml\/include\/ggml-sycl.h/g' \
         -e 's/([[:space:]]|[ab]\/)include\/ggml-vulkan\.h/\1ggml\/include\/ggml-vulkan.h/g' \
@@ -19,6 +19,7 @@ cp -rpv ../ggml/src/ggml-impl.h ./ggml/src/ggml-impl.h
 cp -rpv ../ggml/src/ggml-kompute.cpp ./ggml/src/ggml-kompute.cpp
 cp -rpv ../ggml/src/ggml-metal.m ./ggml/src/ggml-metal.m
 cp -rpv ../ggml/src/ggml-metal.metal ./ggml/src/ggml-metal.metal
+cp -rpv ../ggml/src/ggml-opencl.cpp ./ggml/src/ggml-opencl.cpp
 cp -rpv ../ggml/src/ggml-quants.c ./ggml/src/ggml-quants.c
 cp -rpv ../ggml/src/ggml-quants.h ./ggml/src/ggml-quants.h
 cp -rpv ../ggml/src/ggml-rpc.cpp ./ggml/src/ggml-rpc.cpp

@@ -35,6 +36,7 @@ cp -rpv ../ggml/include/ggml-cann.h ./ggml/include/ggml-cann.h
 cp -rpv ../ggml/include/ggml-cuda.h ./ggml/include/ggml-cuda.h
 cp -rpv ../ggml/include/ggml-kompute.h ./ggml/include/ggml-kompute.h
 cp -rpv ../ggml/include/ggml-metal.h ./ggml/include/ggml-metal.h
+cp -rpv ../ggml/include/ggml-opencl.h ./ggml/include/ggml-opencl.h
 cp -rpv ../ggml/include/ggml-rpc.h ./ggml/include/ggml-rpc.h
 cp -rpv ../ggml/include/ggml-sycl.h ./ggml/include/ggml-sycl.h
 cp -rpv ../ggml/include/ggml-vulkan.h ./ggml/include/ggml-vulkan.h
@@ -15,6 +15,8 @@

 #ifdef GGML_USE_CUDA
 #  include "ggml-cuda.h"
+#elif defined(GGML_USE_CLBLAST)
+#  include "ggml-opencl.h"
 #elif defined(GGML_USE_VULKAN)
 #  include "ggml-vulkan.h"
 #elif defined(GGML_USE_SYCL)

@@ -2840,6 +2842,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
         buft = ggml_backend_vk_buffer_type(gpu);
 #elif defined(GGML_USE_SYCL)
         buft = ggml_backend_sycl_buffer_type(gpu);
+#elif defined(GGML_USE_CLBLAST)
+        buft = ggml_backend_opencl_buffer_type();
 #elif defined(GGML_USE_KOMPUTE)
         buft = ggml_backend_kompute_buffer_type(gpu);
         if (buft == nullptr) {

@@ -2960,6 +2964,10 @@ static bool llama_kv_cache_init(
         }
     }

+#ifdef GGML_USE_CLBLAST
+    offload = false;
+#endif
+
     // count used buffer types
     std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
     if (offload) {

@@ -16507,7 +16515,7 @@ bool llama_supports_mlock(void) {
 }

 bool llama_supports_gpu_offload(void) {
-#if defined(GGML_USE_CUDA) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
     defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
     // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
     return true;