Manually adjusted.

This reverts commit 554c247caf.
This reverts commit 257f8e41e2.

Signed-off-by: David Heidelberg <david@ixit.cz>

parent: 8cd1bcfd3f
commit: 346f64f0d8

23 changed files with 2645 additions and 25 deletions
@@ -18,16 +18,19 @@
   vulkan-headers,
   vulkan-loader,
   curl,
+  clblast,
   shaderc,
   useBlas ? builtins.all (x: !x) [
     useCuda
     useMetalKit
+    useOpenCL
     useRocm
     useVulkan
   ] && blas.meta.available,
   useCuda ? config.cudaSupport,
-  useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin,
+  useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
   useMpi ? false, # Increases the runtime closure size by ~700M
+  useOpenCL ? false,
   useRocm ? config.rocmSupport,
   enableCurl ? true,
   useVulkan ? false,

@@ -56,6 +59,7 @@ let
     ++ lib.optionals useCuda [ "CUDA" ]
     ++ lib.optionals useMetalKit [ "MetalKit" ]
     ++ lib.optionals useMpi [ "MPI" ]
+    ++ lib.optionals useOpenCL [ "OpenCL" ]
     ++ lib.optionals useRocm [ "ROCm" ]
     ++ lib.optionals useVulkan [ "Vulkan" ];

@@ -207,6 +211,7 @@ effectiveStdenv.mkDerivation (
       optionals effectiveStdenv.isDarwin darwinBuildInputs
       ++ optionals useCuda cudaBuildInputs
       ++ optionals useMpi [ mpi ]
+      ++ optionals useOpenCL [ clblast ]
       ++ optionals useRocm rocmBuildInputs
       ++ optionals useBlas [ blas ]
       ++ optionals useVulkan vulkanBuildInputs

@@ -220,6 +225,7 @@ effectiveStdenv.mkDerivation (
       (cmakeBool "LLAMA_CURL" enableCurl)
       (cmakeBool "GGML_NATIVE" false)
       (cmakeBool "GGML_BLAS" useBlas)
+      (cmakeBool "GGML_CLBLAST" useOpenCL)
       (cmakeBool "GGML_CUDA" useCuda)
       (cmakeBool "GGML_HIPBLAS" useRocm)
       (cmakeBool "GGML_METAL" useMetalKit)

@@ -263,6 +269,7 @@ effectiveStdenv.mkDerivation (
         useCuda
         useMetalKit
         useMpi
+        useOpenCL
         useRocm
         useVulkan
         ;

@@ -289,7 +296,7 @@ effectiveStdenv.mkDerivation (
       # Configurations we don't want even the CI to evaluate. Results in the
       # "unsupported platform" messages. This is mostly a no-op, because
      # cudaPackages would've refused to evaluate anyway.
-      badPlatforms = optionals useCuda lib.platforms.darwin;
+      badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin;

       # Configurations that are known to result in build failures. Can be
       # overridden by importing Nixpkgs with `allowBroken = true`.
.github/workflows/build.yml (vendored): 36 changes

@@ -690,6 +690,8 @@ jobs:

    env:
      OPENBLAS_VERSION: 0.3.23
+     OPENCL_VERSION: 2023.04.17
+     CLBLAST_VERSION: 1.6.0
      SDE_VERSION: 9.33.0-2024-01-07
      VULKAN_VERSION: 1.3.261.1

@@ -706,6 +708,8 @@ jobs:
            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
          - build: 'avx512-x64'
            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON'
+         - build: 'clblast-x64'
+           defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
          - build: 'openblas-x64'
            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
          - build: 'kompute-x64'

@@ -730,6 +734,27 @@ jobs:
        run: |
          git submodule update --init ggml/src/kompute

+     - name: Download OpenCL SDK
+       id: get_opencl
+       if: ${{ matrix.build == 'clblast-x64' }}
+       run: |
+         curl.exe -o $env:RUNNER_TEMP/opencl.zip -L "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/v${env:OPENCL_VERSION}/OpenCL-SDK-v${env:OPENCL_VERSION}-Win-x64.zip"
+         mkdir $env:RUNNER_TEMP/opencl
+         tar.exe -xvf $env:RUNNER_TEMP/opencl.zip --strip-components=1 -C $env:RUNNER_TEMP/opencl
+
+     - name: Download CLBlast
+       id: get_clblast
+       if: ${{ matrix.build == 'clblast-x64' }}
+       run: |
+         curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
+         curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
+         7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/clblast.7z
+         rename-item $env:RUNNER_TEMP/CLBlast-${env:CLBLAST_VERSION}-windows-x64 clblast
+         foreach ($f in (gci -Recurse -Path "$env:RUNNER_TEMP/clblast" -Filter '*.cmake')) {
+           $txt = Get-Content -Path $f -Raw
+           $txt.Replace('C:/vcpkg/packages/opencl_x64-windows/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8
+         }
+
      - name: Download OpenBLAS
        id: get_openblas
        if: ${{ matrix.build == 'openblas-x64' }}

@@ -763,6 +788,13 @@ jobs:
          cmake -S . -B build ${{ matrix.defines }}
          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}

+     - name: Add clblast.dll
+       id: add_clblast_dll
+       if: ${{ matrix.build == 'clblast-x64' }}
+       run: |
+         cp $env:RUNNER_TEMP/clblast/lib/clblast.dll ./build/bin/Release
+         cp $env:RUNNER_TEMP/CLBlast.LICENSE.txt ./build/bin/Release/CLBlast-${env:CLBLAST_VERSION}.txt
+
      - name: Add libopenblas.dll
        id: add_libopenblas_dll
        if: ${{ matrix.build == 'openblas-x64' }}

@@ -786,7 +818,7 @@ jobs:
      - name: Test
        id: cmake_test
        # not all machines have native AVX-512
-       if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
+       if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'clblast-x64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
        run: |
          cd build
          ctest -L main -C Release --verbose --timeout 900

@@ -1044,7 +1076,7 @@ jobs:
  #     hypervisor: 'qemu'
  #     run: |
  #       sudo pkg update
- #       sudo pkg install -y gmake automake autoconf pkgconf llvm15 openblas
+ #       sudo pkg install -y gmake automake autoconf pkgconf llvm15 clinfo clover opencl clblast openblas
  #       gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j `sysctl -n hw.ncpu`

  release:
@@ -42,6 +42,10 @@ endif()

 option(BUILD_SHARED_LIBS "build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})

+set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
+option(GGML_CLBLAST "llama: use CLBlast" OFF)
+
 if (WIN32)
     add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
 endif()

@@ -102,6 +106,7 @@ llama_option_depr(WARNING LLAMA_CUDA GGML_CUDA)
 llama_option_depr(WARNING LLAMA_KOMPUTE GGML_KOMPUTE)
 llama_option_depr(WARNING LLAMA_METAL GGML_METAL)
 llama_option_depr(WARNING LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
+llama_option_depr(WARNING LLAMA_CLBLAST GGML_CLBLAST)
 llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE)
 llama_option_depr(WARNING LLAMA_RPC GGML_RPC)
 llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL)

@@ -165,6 +170,16 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake
              ${CMAKE_CURRENT_BINARY_DIR}/llama-version.cmake
        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/llama)

+set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h"
+    "${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
+    "${GGML_HEADERS_METAL}" "${GGML_HEADERS_EXTRA}")
+
+set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
+install(TARGETS ggml PUBLIC_HEADER)
+
+set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/llama.h)
+install(TARGETS llama LIBRARY PUBLIC_HEADER)
+
 install(
    FILES convert_hf_to_gguf.py
    PERMISSIONS
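For a local build outside CI, the reinstated option can be exercised directly. A minimal sketch, assuming CLBlast and an OpenCL ICD loader are installed where CMake can find them; the `GGML_CLBLAST` option and the deprecated `LLAMA_CLBLAST` spelling both come from the hunks above:

    # configure and build with the restored CLBlast/OpenCL path
    cmake -B build -DGGML_CLBLAST=ON
    cmake --build build --config Release
    # the old flag still works but is routed through llama_option_depr() with a warning:
    # cmake -B build -DLLAMA_CLBLAST=ON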
Makefile: 17 changes

@@ -746,6 +746,23 @@ ggml/src/ggml-cuda.o: \
	$(NVCC_COMPILE)
 endif # GGML_CUDA

+ifdef LLAMA_CLBLAST
+    MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
+    MK_CFLAGS   += $(shell pkg-config --cflags-only-other clblast OpenCL)
+    MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
+
+    # Mac provides OpenCL as a framework
+    ifeq ($(UNAME_S),Darwin)
+        MK_LDFLAGS += -lclblast -framework OpenCL
+    else
+        MK_LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
+    endif
+    OBJS += ggml-opencl.o
+
+ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+endif # LLAMA_CLBLAST
+
 ifdef GGML_VULKAN
     MK_CPPFLAGS += -DGGML_USE_VULKAN
     MK_LDFLAGS += $(shell pkg-config --libs vulkan)
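As a usage sketch of the Makefile path just restored (the flag name and pkg-config package names are taken from the hunk; the presence of clblast and OpenCL .pc files on the system is assumed):

    # check the dependencies the Makefile queries via pkg-config
    pkg-config --exists clblast OpenCL && echo "CLBlast + OpenCL found"
    # build the server binary with the CLBlast backend enabled
    LLAMA_CLBLAST=1 make -j llama-server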
@@ -49,7 +49,7 @@ variety of hardware - locally and in the cloud.
 - AVX, AVX2 and AVX512 support for x86 architectures
 - 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
 - Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP)
-- Vulkan and SYCL backend support
+- Vulkan, SYCL, and (partial) OpenCL backend support
 - CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity

 Since its [inception](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022), the project has
@@ -6,6 +6,7 @@ set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
 set(GGML_BLAS @GGML_BLAS@)
 set(GGML_CUDA @GGML_CUDA@)
 set(GGML_METAL @GGML_METAL@)
+set(GGML_CLBLAST @GGML_CLBLAST@)
 set(GGML_HIPBLAS @GGML_HIPBLAS@)
 set(GGML_ACCELERATE @GGML_ACCELERATE@)
 set(GGML_VULKAN @GGML_VULKAN@)

@@ -44,6 +45,11 @@ if (GGML_METAL)
     find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
 endif()

+if (GGML_CLBLAST)
+    find_package(CLBlast REQUIRED)
+endif()
+
 if (GGML_VULKAN)
     find_package(Vulkan REQUIRED)
 endif()
@@ -3128,6 +3128,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
     fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false");
     fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false");
+    fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
     fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false");
     fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
     fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
@@ -30,7 +30,7 @@ The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based o

 When targeting **Intel CPU**, it is recommended to use llama.cpp for [Intel oneMKL](README.md#intel-onemkl) backend.

-It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.
+It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, CLBlast etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.

 ## Recommended Release
@@ -162,7 +162,7 @@ $ ./llama-bench -o csv
 ```

 ```csv
-build_commit,build_number,cuda,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
+build_commit,build_number,cuda,opencl,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
 "3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","512","0","2023-09-23T12:09:01Z","212155977","732372","2413.341687","8.305961"
 "3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","0","128","2023-09-23T12:09:02Z","969320879","2728399","132.052051","0.371342"
 ```

@@ -179,6 +179,7 @@ $ ./llama-bench -o json
   "build_commit": "3469684",
   "build_number": 1275,
   "cuda": true,
+  "opencl": false,
   "metal": false,
   "gpu_blas": true,
   "blas": true,

@@ -209,6 +210,7 @@ $ ./llama-bench -o json
   "build_commit": "3469684",
   "build_number": 1275,
   "cuda": true,
+  "opencl": false,
   "metal": false,
   "gpu_blas": true,
   "blas": true,

@@ -251,6 +253,7 @@ CREATE TABLE IF NOT EXISTS test (
   build_commit TEXT,
   build_number INTEGER,
   cuda INTEGER,
+  opencl INTEGER,
   metal INTEGER,
   gpu_blas INTEGER,
   blas INTEGER,

@@ -276,6 +279,6 @@ CREATE TABLE IF NOT EXISTS test (
   stddev_ts REAL
 );

-INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634');
+INSERT INTO test (build_commit, build_number, cuda, opencl, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634');
-INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692');
+INSERT INTO test (build_commit, build_number, cuda, opencl, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692');
 ```
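A small usage sketch for the SQL output documented above. The `-o sql` flag is an assumption, inferred by analogy with the `-o csv` and `-o json` sections, and the database file name is made up; the emitted statements are plain CREATE TABLE/INSERT, so they can be piped into sqlite3 if it is installed:

    # collect results into a database, then inspect the opencl column added by this change
    ./llama-bench -o sql | sqlite3 llama-bench.sqlite
    sqlite3 llama-bench.sqlite "SELECT build_commit, cuda, opencl, avg_ts FROM test;"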
@@ -754,6 +754,7 @@ struct test {
     static const std::string build_commit;
     static const int build_number;
     static const bool cuda;
+    static const bool opencl;
     static const bool vulkan;
     static const bool kompute;
     static const bool metal;

@@ -843,6 +844,9 @@ struct test {
         if (cuda) {
             return GGML_CUDA_NAME;
         }
+        if (opencl) {
+            return "OpenCL";
+        }
         if (vulkan) {
             return "Vulkan";
         }

@@ -868,7 +872,7 @@ struct test {
     static const std::vector<std::string> & get_fields() {
         static const std::vector<std::string> fields = {
             "build_commit", "build_number",
-            "cuda", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas",
+            "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas",
             "cpu_info", "gpu_info",
             "model_filename", "model_type", "model_size", "model_n_params",
             "n_batch", "n_ubatch",

@@ -894,7 +898,7 @@ struct test {
             field == "avg_ns" || field == "stddev_ns") {
             return INT;
         }
-        if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
+        if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" ||
             field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
             field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
             return BOOL;

@@ -924,7 +928,7 @@ struct test {
         std::vector<std::string> values = {
             build_commit, std::to_string(build_number),
             std::to_string(cuda), std::to_string(vulkan), std::to_string(vulkan),
-            std::to_string(metal), std::to_string(sycl), std::to_string(has_rpc), std::to_string(gpu_blas), std::to_string(blas),
+            std::to_string(metal), std::to_string(opencl), std::to_string(sycl), std::to_string(has_rpc), std::to_string(gpu_blas), std::to_string(blas),
             cpu_info, gpu_info,
             model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
             std::to_string(n_batch), std::to_string(n_ubatch),

@@ -952,6 +956,7 @@ struct test {
 const std::string test::build_commit = LLAMA_COMMIT;
 const int test::build_number = LLAMA_BUILD_NUMBER;
 const bool test::cuda = !!ggml_cpu_has_cuda();
+const bool test::opencl = !!ggml_cpu_has_clblast();
 const bool test::vulkan = !!ggml_cpu_has_vulkan();
 const bool test::kompute = !!ggml_cpu_has_kompute();
 const bool test::metal = !!ggml_cpu_has_metal();
@@ -8,14 +8,16 @@ Because this example is "outside of the source tree", it is important to first b

 ### Considerations

-When hardware acceleration libraries are used (e.g. CUDA, Metal, etc.), CMake must be able to locate the associated CMake package.
+When hardware acceleration libraries are used (e.g. CUDA, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_.

 ### Build llama.cpp and install to C:\LlamaCPP directory

+In this case, CLBlast was already installed so the CMake package is referenced in `CMAKE_PREFIX_PATH`.
+
 ```cmd
 git clone https://github.com/ggerganov/llama.cpp
 cd llama.cpp
-cmake -B build -DBUILD_SHARED_LIBS=OFF -G "Visual Studio 17 2022" -A x64
+cmake -B build -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=C:/CLBlast/lib/cmake/CLBlast -G "Visual Studio 17 2022" -A x64
 cmake --build build --config Release
 cmake --install build --prefix C:/LlamaCPP
 ```

@@ -25,7 +27,7 @@ cmake --install build --prefix C:/LlamaCPP

 ```cmd
 cd ..\examples\main-cmake-pkg
-cmake -B build -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64
+cmake -B build -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/CLBlast/lib/cmake/CLBlast;C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64
 cmake --build build --config Release
 cmake --install build --prefix C:/MyLlamaApp
 ```
@@ -159,6 +159,7 @@
           windows = config.legacyPackages.llamaPackagesWindows.llama-cpp;
         }
         // lib.optionalAttrs pkgs.stdenv.isLinux {
+          opencl = config.packages.default.override { useOpenCL = true; };
           cuda = config.legacyPackages.llamaPackagesCuda.llama-cpp;

           mpi-cpu = config.packages.default.override { useMpi = true; };
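A build sketch for the Nix side of the change. The command below evaluates the same override that the hunk registers under the `opencl` attribute; the exact invocation is an assumption (it relies only on `packages.default` exposing `.override { useOpenCL = true; }`, which the hunk itself uses):

    # ad-hoc equivalent of the `opencl` attribute added above (Linux)
    nix build --impure --expr \
      '(builtins.getFlake (toString ./.)).packages.${builtins.currentSystem}.default.override { useOpenCL = true; }'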
@@ -1,7 +1,7 @@
 // An interface allowing to compute ggml_cgraph with Metal
 //
 // This is a fully functional interface that extends ggml with GPU support for Apple devices.
-// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, etc.)
+// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, OpenCL, etc.)
 //
 // How it works?
 //
ggml/include/ggml-opencl.h (new file, 36 lines)

@@ -0,0 +1,36 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+GGML_API void ggml_cl_init(void);
+
+GGML_API void   ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+GGML_API void   ggml_cl_add(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+GGML_API bool   ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst);
+GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+GGML_API void   ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
+
+// GGML_API void * ggml_cl_host_malloc(size_t size);
+// GGML_API void   ggml_cl_host_free(void * ptr);
+
+GGML_API void ggml_cl_free_data(const struct ggml_tensor* tensor);
+
+GGML_API void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);
+
+// backend API
+
+// GGML_API ggml_backend_t ggml_backend_opencl_init(void);
+
+// GGML_API bool ggml_backend_is_opencl(ggml_backend_t backend);
+
+GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void);
+// GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void);
+
+#ifdef  __cplusplus
+}
+#endif
@@ -2397,6 +2397,7 @@ extern "C" {
     GGML_API int ggml_cpu_has_wasm_simd (void);
     GGML_API int ggml_cpu_has_blas      (void);
     GGML_API int ggml_cpu_has_cuda      (void);
+    GGML_API int ggml_cpu_has_clblast   (void);
     GGML_API int ggml_cpu_has_vulkan    (void);
     GGML_API int ggml_cpu_has_kompute   (void);
     GGML_API int ggml_cpu_has_gpublas   (void);
@@ -507,6 +507,23 @@ if (GGML_HIPBLAS)
     set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} PUBLIC hip::host roc::rocblas roc::hipblas)
 endif()

+if (GGML_CLBLAST)
+    find_package(CLBlast)
+    if (CLBlast_FOUND)
+        message(STATUS "CLBlast found")
+
+        set(GGML_HEADERS_OPENCL ggml-opencl.h)
+        set(GGML_SOURCES_OPENCL ggml-opencl.cpp)
+
+        add_compile_definitions(GGML_USE_CLBLAST)
+
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} clblast)
+    else()
+        message(WARNING "CLBlast not found")
+    endif()
+endif()
+
 if (GGML_SYCL)
     if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL|NVIDIA)$")
         message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL or NVIDIA")
ggml/src/ggml-opencl.cpp (new file, 2305 lines)

File diff suppressed because it is too large.
ggml/src/ggml.c: 157 changes

@@ -378,6 +378,17 @@ inline static void * ggml_calloc(size_t num, size_t size) {

 #if defined(GGML_USE_ACCELERATE)
 #include <Accelerate/Accelerate.h>
+#if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
+#include "ggml-opencl.h"
+#endif
+#elif defined(GGML_USE_OPENBLAS)
+#if defined(GGML_BLAS_USE_MKL)
+#include <mkl.h>
+#else
+#include <cblas.h>
+#endif
+#elif defined(GGML_USE_CLBLAST)
+#include "ggml-opencl.h"
 #endif

 // floating point type used to accumulate sums

@@ -3516,6 +3527,9 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
             GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
         }

+#if defined(GGML_USE_CLBLAST)
+        ggml_cl_init();
+#endif
         is_first_call = false;
     }

@@ -9126,6 +9140,17 @@ static void ggml_compute_forward_add_f32(
     const int ith = params->ith;
     const int nth = params->nth;

+#ifdef GGML_USE_CLBLAST
+    if (src1->backend == GGML_BACKEND_TYPE_GPU) {
+        // TODO: OpenCL kernel support full broadcast
+        GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
+        if (ith == 0) {
+            ggml_cl_add(src0, src1, dst);
+        }
+        return;
+    }
+#endif
+
     const int nr = ggml_nrows(src0);

     GGML_TENSOR_BINARY_OP_LOCALS

@@ -10190,6 +10215,17 @@ static void ggml_compute_forward_mul_f32(
     const int ith = params->ith;
     const int nth = params->nth;

+#if defined(GGML_USE_CLBLAST)
+    if (src1->backend == GGML_BACKEND_TYPE_GPU) {
+        // TODO: OpenCL kernel support full broadcast
+        GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
+        if (ith == 0) {
+            ggml_cl_mul(src0, src1, dst);
+        }
+        return;
+    }
+#endif
+
     const int64_t nr = ggml_nrows(src0);

     GGML_TENSOR_BINARY_OP_LOCALS

@@ -12315,6 +12351,82 @@ static void ggml_compute_forward_mul_mat(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows

+#if defined(GGML_USE_CLBLAST)
+    if (ggml_cl_can_mul_mat(src0, src1, dst)) {
+        if (params->ith == 0 && params->type == GGML_TASK_TYPE_COMPUTE) {
+            ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
+        }
+        return;
+    }
+#endif
+
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+    if (ggml_compute_forward_mul_mat_use_blas(dst)) {
+        const int64_t ne_plane = ne01*ne00;
+        const size_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
+        UNUSED(desired_wsize);
+
+        if (params->type == GGML_TASK_TYPE_INIT) {
+            if (type != GGML_TYPE_F32) {
+                assert(params->wsize >= desired_wsize);
+                // parallelize by src0 rows
+                for (int64_t i13 = 0; i13 < ne13; i13++) {
+                    for (int64_t i12 = 0; i12 < ne12; i12++) {
+                        // broadcast src0 into src1 across 2nd,3rd dimension
+                        const int64_t i03 = i13/r3;
+                        const int64_t i02 = i12/r2;
+
+                        const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
+                        float * const wdata = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
+                        ggml_to_float_t const to_float = type_traits[type].to_float;
+
+                        for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
+                            to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
+                        }
+                    }
+                }
+            }
+            return;
+        }
+
+        if (params->type == GGML_TASK_TYPE_FINALIZE) {
+            return;
+        }
+
+        // perform sgemm, parallelization controlled by blas lib
+        if (ith != 0) {
+            return;
+        }
+
+        //const int64_t tgemm0 = ggml_perf_time_us();
+        for (int64_t i13 = 0; i13 < ne13; i13++) {
+            for (int64_t i12 = 0; i12 < ne12; i12++) {
+                const int64_t i03 = i13/r3;
+                const int64_t i02 = i12/r2;
+
+                const void  * x = (char *) src0->data + i02*nb02 + i03*nb03;
+                const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
+                float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+
+                if (type != GGML_TYPE_F32) {
+                    x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
+                }
+
+                cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
+                        ne1, ne01, ne10,
+                        1.0f, y, ne10,
+                        x, ne00,
+                        0.0f, d, ne01);
+            }
+        }
+        //printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2);
+
+        //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
+
+        return;
+    }
+#endif
+
 #if GGML_USE_LLAMAFILE
     // broadcast factors
     const int64_t r2 = ne12 / ne02;

@@ -12731,6 +12843,22 @@ static void ggml_compute_forward_out_prod_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows

+    // TODO: #if defined(GGML_USE_CLBLAST)
+
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+    bool use_blas = ggml_is_matrix(src0) &&
+        ggml_is_matrix(src1) &&
+        ggml_is_contiguous(src0) &&
+        (ggml_is_contiguous(src1) || ggml_is_transposed(src1));
+#endif
+
+    if (params->type == GGML_TASK_TYPE_INIT) {
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst
+        if (use_blas) {
+            return;
+        }
+#endif
+
         if (ith == 0) {
             ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
         }

@@ -18733,6 +18861,22 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
                 {
                     const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;

+#if defined(GGML_USE_CLBLAST)
+                    if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
+                        cur = ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node);
+                    } else
+#endif
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+                    if (ggml_compute_forward_mul_mat_use_blas(node)) {
+                        if (node->src[0]->type != GGML_TYPE_F32) {
+                            // here we need memory for fully dequantized matrix from src0
+                            // take into account that src0 can be broadcasted into src1[2,3]
+                            cur = ggml_type_size(GGML_TYPE_F32)
+                                * node->src[0]->ne[0]*node->src[0]->ne[1]
+                                * node->src[1]->ne[2]*node->src[1]->ne[3];
+                        }
+                    } else
+#endif
                     if (node->src[1]->type != vec_dot_type) {
                         cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
                     }

@@ -22022,7 +22166,7 @@ int ggml_cpu_has_wasm_simd(void) {
 }

 int ggml_cpu_has_blas(void) {
-#if defined(GGML_USE_BLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_BLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
     return 1;
 #else
     return 0;

@@ -22037,6 +22181,14 @@ int ggml_cpu_has_cuda(void) {
 #endif
 }

+int ggml_cpu_has_clblast(void) {
+#if defined(GGML_USE_CLBLAST)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_vulkan(void) {
 #if defined(GGML_USE_VULKAN)
     return 1;

@@ -22086,7 +22238,8 @@ int ggml_cpu_has_llamafile(void) {
 }

 int ggml_cpu_has_gpublas(void) {
-    return ggml_cpu_has_cuda() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() || ggml_cpu_has_sycl();
+    return ggml_cpu_has_cuda() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
+           ggml_cpu_has_sycl();
 }

 int ggml_cpu_has_sse3(void) {
@@ -19,17 +19,17 @@ logger = logging.getLogger("compare-llama-bench")

 # Properties by which to differentiate results per commit:
 KEY_PROPERTIES = [
-    "cpu_info", "gpu_info", "n_gpu_layers", "cuda", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas",
+    "cpu_info", "gpu_info", "n_gpu_layers", "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas",
     "blas", "model_filename", "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "embeddings", "n_threads",
     "type_k", "type_v", "use_mmap", "no_kv_offload", "split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen"
 ]

 # Properties that are boolean and are converted to Yes/No for the table:
-BOOL_PROPERTIES = ["cuda", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas", "embeddings", "use_mmap", "no_kv_offload", "flash_attn"]
+BOOL_PROPERTIES = ["cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas", "embeddings", "use_mmap", "no_kv_offload", "flash_attn"]

 # Header names for the table:
 PRETTY_NAMES = {
-    "cuda": "CUDA", "vulkan": "Vulkan", "kompute": "Kompute", "metal": "Metal", "sycl": "SYCL", "rpc": "RPC",
+    "cuda": "CUDA", "opencl": "OpenCL", "vulkan": "Vulkan", "kompute": "Kompute", "metal": "Metal", "sycl": "SYCL", "rpc": "RPC",
     "gpu_blas": "GPU BLAS", "blas": "BLAS", "cpu_info": "CPU", "gpu_info": "GPU", "model_filename": "File", "model_type": "Model",
     "model_size": "Model Size [GiB]", "model_n_params": "Num. of Par.", "n_batch": "Batch size", "n_ubatch": "Microbatch size",
     "n_threads": "Threads", "type_k": "K type", "type_v": "V type", "n_gpu_layers": "GPU layers", "split_mode": "Split mode",
@@ -3,7 +3,7 @@
 # Helper script for deploying llama.cpp server with a single Bash command
 #
 # - Works on Linux and macOS
-# - Supports: CPU, CUDA, Metal
+# - Supports: CPU, CUDA, Metal, OpenCL
 # - Can run all GGUF models from HuggingFace
 # - Can serve requests in parallel
 # - Always builds latest llama.cpp from GitHub

@@ -19,7 +19,7 @@
 # --port: port number, default is 8888
 # --repo: path to a repo containing GGUF model files
 # --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input
-# --backend: cpu, cuda, metal, depends on the OS
+# --backend: cpu, cuda, metal, opencl, depends on the OS
 # --gpu-id: gpu id, default is 0
 # --n-parallel: number of parallel requests, default is 8
 # --n-kv: KV cache size, default is 4096

@@ -72,7 +72,7 @@ function print_usage {
     printf "  --port: port number, default is 8888\n"
     printf "  --repo: path to a repo containing GGUF model files\n"
     printf "  --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input\n"
-    printf "  --backend: cpu, cuda, metal, depends on the OS\n"
+    printf "  --backend: cpu, cuda, metal, opencl, depends on the OS\n"
     printf "  --gpu-id: gpu id, default is 0\n"
     printf "  --n-parallel: number of parallel requests, default is 8\n"
     printf "  --n-kv: KV cache size, default is 4096\n"

@@ -387,6 +387,9 @@ elif [[ "$backend" == "cpu" ]]; then
 elif [[ "$backend" == "metal" ]]; then
     printf "[+] Building with Metal backend\n"
     make -j llama-server $log
+elif [[ "$backend" == "opencl" ]]; then
+    printf "[+] Building with OpenCL backend\n"
+    LLAMA_CLBLAST=1 make -j llama-server $log
 else
     printf "[-] Unknown backend: %s\n" "$backend"
     exit 1

@@ -404,6 +407,8 @@ elif [[ "$backend" == "cpu" ]]; then
     args="-ngl 0"
 elif [[ "$backend" == "metal" ]]; then
     args="-ngl 999"
+elif [[ "$backend" == "opencl" ]]; then
+    args="-ngl 999"
 else
     printf "[-] Unknown backend: %s\n" "$backend"
     exit 1
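A usage sketch for the new backend value (the script path is an assumption; the flag names and defaults come from the hunks above):

    # deploy the server using the OpenCL/CLBlast build path added above
    ./scripts/server-llm.sh --backend opencl --port 8888 --n-parallel 8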
@@ -117,6 +117,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
     # src/ggml-impl.h -> ggml/src/ggml-impl.h
     # src/ggml-kompute.cpp -> ggml/src/ggml-kompute.cpp
     # src/ggml-metal.m -> ggml/src/ggml-metal.m
+    # src/ggml-opencl.cpp -> ggml/src/ggml-opencl.cpp
+    # src/ggml-opencl.h -> ggml/src/ggml-opencl.h
     # src/ggml-quants.c -> ggml/src/ggml-quants.c
     # src/ggml-quants.h -> ggml/src/ggml-quants.h
     # src/ggml-rpc.cpp -> ggml/src/ggml-rpc.cpp

@@ -164,6 +166,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
         -e 's/([[:space:]]|[ab]\/)src\/ggml-impl\.h/\1ggml\/src\/ggml-impl.h/g' \
         -e 's/([[:space:]]|[ab]\/)src\/ggml-kompute\.cpp/\1ggml\/src\/ggml-kompute.cpp/g' \
         -e 's/([[:space:]]|[ab]\/)src\/ggml-metal\.m/\1ggml\/src\/ggml-metal.m/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-opencl\.cpp/\1ggml\/src\/ggml-opencl.cpp/g' \
         -e 's/([[:space:]]|[ab]\/)src\/ggml-quants\.c/\1ggml\/src\/ggml-quants.c/g' \
         -e 's/([[:space:]]|[ab]\/)src\/ggml-quants\.h/\1ggml\/src\/ggml-quants.h/g' \
         -e 's/([[:space:]]|[ab]\/)src\/ggml-rpc\.cpp/\1ggml\/src\/ggml-rpc.cpp/g' \

@@ -179,6 +182,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
         -e 's/([[:space:]]|[ab]\/)include\/ggml-cuda\.h/\1ggml\/include\/ggml-cuda.h/g' \
         -e 's/([[:space:]]|[ab]\/)include\/ggml-kompute\.h/\1ggml\/include\/ggml-kompute.h/g' \
         -e 's/([[:space:]]|[ab]\/)include\/ggml-metal\.h/\1ggml\/include\/ggml-metal.h/g' \
+        -e 's/([[:space:]]|[ab]\/)include\/ggml-opencl\.h/\1ggml\/include\/ggml-opencl.h/g' \
         -e 's/([[:space:]]|[ab]\/)include\/ggml-rpc\.h/\1ggml\/include\/ggml-rpc.h/g' \
         -e 's/([[:space:]]|[ab]\/)include\/ggml-sycl\.h/\1ggml\/include\/ggml-sycl.h/g' \
         -e 's/([[:space:]]|[ab]\/)include\/ggml-vulkan\.h/\1ggml\/include\/ggml-vulkan.h/g' \
@@ -19,6 +19,7 @@ cp -rpv ../ggml/src/ggml-impl.h ./ggml/src/ggml-impl.h
 cp -rpv ../ggml/src/ggml-kompute.cpp ./ggml/src/ggml-kompute.cpp
 cp -rpv ../ggml/src/ggml-metal.m ./ggml/src/ggml-metal.m
 cp -rpv ../ggml/src/ggml-metal.metal ./ggml/src/ggml-metal.metal
+cp -rpv ../ggml/src/ggml-opencl.cpp ./ggml/src/ggml-opencl.cpp
 cp -rpv ../ggml/src/ggml-quants.c ./ggml/src/ggml-quants.c
 cp -rpv ../ggml/src/ggml-quants.h ./ggml/src/ggml-quants.h
 cp -rpv ../ggml/src/ggml-rpc.cpp ./ggml/src/ggml-rpc.cpp

@@ -35,6 +36,7 @@ cp -rpv ../ggml/include/ggml-cann.h ./ggml/include/ggml-cann.h
 cp -rpv ../ggml/include/ggml-cuda.h ./ggml/include/ggml-cuda.h
 cp -rpv ../ggml/include/ggml-kompute.h ./ggml/include/ggml-kompute.h
 cp -rpv ../ggml/include/ggml-metal.h ./ggml/include/ggml-metal.h
+cp -rpv ../ggml/include/ggml-opencl.h ./ggml/include/ggml-opencl.h
 cp -rpv ../ggml/include/ggml-rpc.h ./ggml/include/ggml-rpc.h
 cp -rpv ../ggml/include/ggml-sycl.h ./ggml/include/ggml-sycl.h
 cp -rpv ../ggml/include/ggml-vulkan.h ./ggml/include/ggml-vulkan.h
@@ -15,6 +15,8 @@

 #ifdef GGML_USE_CUDA
 #  include "ggml-cuda.h"
+#elif defined(GGML_USE_CLBLAST)
+#  include "ggml-opencl.h"
 #elif defined(GGML_USE_VULKAN)
 #  include "ggml-vulkan.h"
 #elif defined(GGML_USE_SYCL)

@@ -2840,6 +2842,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
         buft = ggml_backend_vk_buffer_type(gpu);
 #elif defined(GGML_USE_SYCL)
         buft = ggml_backend_sycl_buffer_type(gpu);
+#elif defined(GGML_USE_CLBLAST)
+        buft = ggml_backend_opencl_buffer_type();
 #elif defined(GGML_USE_KOMPUTE)
         buft = ggml_backend_kompute_buffer_type(gpu);
         if (buft == nullptr) {

@@ -2960,6 +2964,10 @@ static bool llama_kv_cache_init(
         }
     }

+#ifdef GGML_USE_CLBLAST
+    offload = false;
+#endif
+
     // count used buffer types
     std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
     if (offload) {

@@ -16507,7 +16515,7 @@ bool llama_supports_mlock(void) {
 }

 bool llama_supports_gpu_offload(void) {
-#if defined(GGML_USE_CUDA) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
     defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
     // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
     return true;