Merge branch 'master' into sycl_fix_non_intel_fp16

2024-04-03 12:57:32 +01:00 · 2024-04-03 12:57:32 +01:00 · c0c4b309d7
commit c0c4b309d7
parent 84ef62e8d5 52604860f9
28 changed files with 26561 additions and 3687 deletions
--- a/.devops/llama-cpp-clblast.srpm.spec
+++ b/.devops/llama-cpp-clblast.srpm.spec
@ -1,5 +1,5 @@
 # SRPM for building from source and packaging an RPM for RPM-based distros.
-# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
+# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
 # Built and maintained by John Boero - boeroboy@gmail.com
 # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal

--- a/.devops/llama-cpp-cuda.srpm.spec
+++ b/.devops/llama-cpp-cuda.srpm.spec
@ -1,5 +1,5 @@
 # SRPM for building from source and packaging an RPM for RPM-based distros.
-# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
+# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
 # Built and maintained by John Boero - boeroboy@gmail.com
 # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal

--- a/.devops/llama-cpp.srpm.spec
+++ b/.devops/llama-cpp.srpm.spec
@ -1,5 +1,5 @@
 # SRPM for building from source and packaging an RPM for RPM-based distros.
-# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
+# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
 # Built and maintained by John Boero - boeroboy@gmail.com
 # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal

--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@ -25,7 +25,7 @@ on:
    branches:
      - master
    paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
-  pull_request:
+  pull_request_target:
    types: [opened, synchronize, reopened]
    paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
  schedule:
@ -143,7 +143,6 @@ jobs:

      - name: Commit status
        uses: Sibz/github-status-action@v1
-        continue-on-error: true # If not authorized on external repo
        with:
          authToken: ${{secrets.GITHUB_TOKEN}}
          sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@ -4,6 +4,10 @@ name: Server
 on:
  workflow_dispatch: # allows manual triggering
    inputs:
+      sha:
+        description: 'Commit SHA1 to build'
+        required: false
+        type: string
      slow_tests:
        description: 'Run slow tests'
        required: true
@ -11,12 +15,12 @@ on:
  push:
    branches:
      - master
-    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*']
-  pull_request:
+    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
+  pull_request_target:
    types: [opened, synchronize, reopened]
-    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*']
+    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
  schedule:
-    -  cron: '0 0 * * *'
+    -  cron: '2 4 * * *'

 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
@ -44,25 +48,45 @@ jobs:
      options: --cpus 4

    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v3
-        with:
-          fetch-depth: 0
-
      - name: Dependencies
        id: depends
        run: |
          apt-get update
          apt-get -y install \
            build-essential \
+            xxd \
            git \
            cmake \
            python3-pip \
+            curl \
            wget \
            language-pack-en \
            libcurl4-openssl-dev

+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+      - name: Verify server deps
+        id: verify_server_deps
+        run: |
+          git config --global --add safe.directory $(realpath .)
+          cd examples/server
+          git ls-files --others --modified
+          git status
+          ./deps.sh
+          git status
+          not_ignored_files="$(git ls-files --others --modified)"
+          echo "Modified files: ${not_ignored_files}"
+          if [ -n "${not_ignored_files}" ]; then
+            echo "Repository is dirty or server deps are not built as expected"
+            echo "${not_ignored_files}"
+            exit 1
+          fi
+
      - name: Build
        id: cmake_build
        run: |
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -113,6 +113,9 @@ option(LLAMA_METAL                           "llama: use Metal"
 option(LLAMA_METAL_NDEBUG                    "llama: disable Metal debugging"                   OFF)
 option(LLAMA_METAL_SHADER_DEBUG              "llama: compile Metal with -fno-fast-math"         OFF)
 option(LLAMA_METAL_EMBED_LIBRARY             "llama: embed Metal library"                       OFF)
+set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
+                                             "llama: metal minimum macOS version")
+set(LLAMA_METAL_STD "" CACHE STRING          "llama: metal standard version (-std flag)")
 option(LLAMA_KOMPUTE                         "llama: use Kompute"                               OFF)
 option(LLAMA_MPI                             "llama: use MPI"                                   OFF)
 option(LLAMA_QKK_64                          "llama: use super-block size of 64 for k-quants"   OFF)
@ -250,6 +253,16 @@ if (LLAMA_METAL)
            set(XC_FLAGS -O3)
        endif()

+        # Append macOS metal versioning flags
+        if (LLAMA_METAL_MACOSX_VERSION_MIN)
+            message(STATUS "Adding -mmacosx-version-min=${LLAMA_METAL_MACOSX_VERSION_MIN} flag to metal compilation")
+            list(APPEND XC_FLAGS -mmacosx-version-min=${LLAMA_METAL_MACOSX_VERSION_MIN})
+        endif()
+        if (LLAMA_METAL_STD)
+            message(STATUS "Adding -std=${LLAMA_METAL_STD} flag to metal compilation")
+            list(APPEND XC_FLAGS -std=${LLAMA_METAL_STD})
+        endif()
+
        add_custom_command(
            OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
            COMMAND xcrun -sdk macosx metal    ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
--- a/README-sycl.md
+++ b/README-sycl.md
@ -3,7 +3,7 @@
 - [Background](#background)
 - [News](#news)
 - [OS](#os)
- [Intel GPU](#intel-gpu)
+- [Supported Devices](#supported-devices)
 - [Docker](#docker)
 - [Linux](#linux)
 - [Windows](#windows)
@ -14,17 +14,25 @@

 ## Background

-SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators—such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.
+**SYCL** is a high-level parallel programming model designed to improve developers productivity writing code across various hardware accelerators such as CPUs, GPUs, and FPGAs. It is a single-source language designed for heterogeneous computing and based on standard C++17.

-oneAPI is a specification that is open and standards-based, supporting multiple architecture types including but not limited to GPU, CPU, and FPGA. The spec has both direct programming and API-based programming paradigms.
+**oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include:

-Intel uses the SYCL as direct programming language to support CPU, GPUs and FPGAs.
+- **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers.
+- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL - Math Kernel Library)*.
+- **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs.
+- **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.

-To avoid to re-invent the wheel, this code refer other code paths in llama.cpp (like OpenBLAS, cuBLAS, CLBlast). We use a open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) migrate to SYCL.
+### Llama.cpp + SYCL
+This SYCL "backend" follows the same design found in other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, CLBlast etc..*. The oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.

-The llama.cpp for SYCL is used to support Intel GPUs.
+The llama.cpp SYCL backend supports:
+- Intel GPUs.
+- Nvidia GPUs.

-For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
+*Upcoming support: AMD GPUs*.
+
+When targetting **Intel CPUs**, it is recommended to  use llama.cpp for [x86_64](README.md#intel-onemkl) approach.

 ## News

@ -51,9 +59,16 @@ For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
 |Windows|Support|Windows 11|


-## Intel GPU
+## Supported devices

-### Verified
+### Intel GPUs
+
+The oneAPI Math Kernel Library, which the oneAPI base-toolkit includes, supports intel GPUs. In order to make it "visible", simply run the following:
+```sh
+source /opt/intel/oneapi/setvars.sh
+```
+
+- **Tested devices**

 |Intel GPU| Status | Verified Model|
 |-|-|-|
@ -63,198 +78,229 @@ For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
 |Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
 |Intel iGPU| Support| iGPU in i5-1250P, i7-1260P, i7-1165G7|

-Note: If the EUs (Execution Unit) in iGPU is less than 80, the inference speed will be too slow to use.
+*Notes:*

-### Memory
+- Device memory can be a limitation when running a large model on an intel GPU. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/main`.

-The memory is a limitation to run LLM on GPUs.
+- Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *llama-2-7b.Q4_0* requires at least 8.0GB for integrated GPUs and 4.0GB for discrete GPUs.

-When run llama.cpp, there is print log to show the applied memory on GPU. You could know how much memory to be used in your case. Like `llm_load_tensors:            buffer size =  3577.56 MiB`.
+- If the iGPU has less than 80  EUs *(Execution Unit)*, the inference speed will likely be too slow for practical use.

-For iGPU, please make sure the shared memory from host memory is enough. For llama-2-7b.Q4_0, recommend the host memory is 8GB+.
+### Nvidia GPUs
+The BLAS acceleration on Nvidia GPUs through oneAPI can be obtained using the Nvidia plugins for oneAPI and the cuBLAS backend of the upstream oneMKL library. Details and instructions on how to setup the runtime and library can be found in [this section](#i-setup-environment)

-For dGPU, please make sure the device memory is enough. For llama-2-7b.Q4_0, recommend the device memory is 4GB+.
+- **Tested devices**

-## Nvidia GPU
-
-### Verified
-
-|Intel GPU| Status | Verified Model|
+|Nvidia GPU| Status | Verified Model|
 |-|-|-|
-|Ampere Series| Support| A100|
+|Ampere Series| Support| A100, A4000|
+|Ampere Series *(Mobile)*| Support| RTX 40 Series|

-### oneMKL for CUDA
+*Notes:*
+  - Support for Nvidia targets through oneAPI is currently limited to Linux platforms.

-The current oneMKL release does not contain the oneMKL cuBlas backend.
-As a result for Nvidia GPU's oneMKL must be built from source.
+  - Please make sure the native oneAPI MKL *(dedicated to intel CPUs and GPUs)* is not "visible" at this stage to properly setup and use the built-from-source oneMKL with cuBLAS backend in llama.cpp for Nvidia GPUs.

-```
-git clone https://github.com/oneapi-src/oneMKL
-cd oneMKL
-mkdir build
-cd build
-cmake -G Ninja .. -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON
-ninja
-// Add paths as necessary
-```

 ## Docker
-
-Note:
- Only docker on Linux is tested. Docker on WSL may not work.
- You may need to install Intel GPU driver on the host machine (See the [Linux](#linux) section to know how to do that)
-
-### Build the image
-
-You can choose between **F16** and **F32** build. F16 is faster for long-prompt inference.
-
-
+The docker build option is currently limited to *intel GPU* targets.
+### Build image
 ```sh
-# For F16:
-#docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/main-intel.Dockerfile .
-
-# Or, for F32:
-docker build -t llama-cpp-sycl -f .devops/main-intel.Dockerfile .
-
-# Note: you can also use the ".devops/server-intel.Dockerfile", which compiles the "server" example
+# Using FP16
+docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/main-intel.Dockerfile .
 ```

-### Run
+*Notes*:
+
+To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="LLAMA_SYCL_F16=ON"` argument from the previous command.
+
+You can also use the `.devops/server-intel.Dockerfile`, which builds the *"server"* alternative.
+
+### Run container

 ```sh
-# Firstly, find all the DRI cards:
+# First, find all the DRI cards
 ls -la /dev/dri
-# Then, pick the card that you want to use.
-
-# For example with "/dev/dri/card1"
+# Then, pick the card that you want to use (here for e.g. /dev/dri/card1).
 docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-sycl -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
 ```

+*Notes:*
+- Docker has been tested successfully on native Linux. WSL support has not been verified yet.
+- You may need to install Intel GPU driver on the **host** machine *(Please refer to the [Linux configuration](#linux) for details)*.
+
 ## Linux

-### Setup Environment
+### I. Setup Environment

-1. Install Intel GPU driver.
+1. **Install GPU drivers**

-a. Please install Intel GPU driver by official guide: [Install GPU Drivers](https://dgpu-docs.intel.com/driver/installation.html).
+  - **Intel GPU**

-Note: for iGPU, please install the client GPU driver.
+Intel data center GPUs drivers installation guide and download page can be found here: [Get intel dGPU Drivers](https://dgpu-docs.intel.com/driver/installation.html#ubuntu-install-steps).

-b. Add user to group: video, render.
+*Note*: for client GPUs *(iGPU & Arc A-Series)*, please refer to the [client iGPU driver installation](https://dgpu-docs.intel.com/driver/client/overview.html).
+
+Once installed, add the user(s) to the `video` and `render` groups.

 ```sh
-sudo usermod -aG render username
-sudo usermod -aG video username
+sudo usermod -aG render $USER
+sudo usermod -aG video $USER
 ```

-Note: re-login to enable it.
+*Note*: logout/re-login for the changes to take effect.

-c. Check
+Verify installation through `clinfo`:

 ```sh
 sudo apt install clinfo
 sudo clinfo -l
 ```

-Output (example):
+Sample output:

-```
+```sh
 Platform #0: Intel(R) OpenCL Graphics
 `-- Device #0: Intel(R) Arc(TM) A770 Graphics

-
 Platform #0: Intel(R) OpenCL HD Graphics
 `-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
 ```

-2. Install Intel® oneAPI Base toolkit.
+- **Nvidia GPU**

-a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
+In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cublas)-* are installed.
+Installation can be verified by running the following:
+```sh
+nvidia-smi
+```
+Please make sure at least one CUDA device is available, which can be displayed like this *(here an A100-40GB Nvidia GPU)*:
+```
+---------------------------------------------------------------------------------------+
+| NVIDIA-SMI 535.54.03              Driver Version: 535.54.03    CUDA Version: 12.2     |
+|-----------------------------------------+----------------------+----------------------+
+| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
+| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
+|                                         |                      |               MIG M. |
+|=========================================+======================+======================|
+|   0  NVIDIA A100-PCIE-40GB          On  | 00000000:8D:00.0 Off |                    0 |
+| N/A   36C    P0              57W / 250W |      4MiB / 40960MiB |      0%      Default |
+|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
+```

-Recommend to install to default folder: **/opt/intel/oneapi**.

-Following guide use the default folder as example. If you use other folder, please modify the following guide info with your folder.
+2. **Install Intel® oneAPI Base toolkit**

-b. Check
+- **Base installation**
+
+The base toolkit can be obtained from the official [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) page.
+
+Please follow the instructions for downloading and installing the Toolkit for Linux, and preferably keep the default installation values unchanged, notably the installation path *(`/opt/intel/oneapi` by default)*.
+
+Following guidelines/code snippets assume the default installation values. Otherwise, please make sure the necessary changes are reflected where applicable.
+
+Upon a successful installation, SYCL is enabled for the available intel devices, along with relevant libraries such as oneAPI MKL for intel GPUs.
+
+- **Adding support to Nvidia GPUs**
+
+**oneAPI**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup.
+
+
+**oneMKL**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the cuBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *cuBLAS* backend enabled is thus required to run it on Nvidia GPUs.

 ```sh
-source /opt/intel/oneapi/setvars.sh
+git clone https://github.com/oneapi-src/oneMKL
+cd oneMKL
+mkdir -p buildWithCublas && cd buildWithCublas
+cmake ../ -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON -DTARGET_DOMAINS=blas
+make
+```

+
+3. **Verify installation and environment**
+
+In order to check the available SYCL devices on the machine, please use the `sycl-ls` command.
+```sh
+source /opt/intel/oneapi/setvars.sh
 sycl-ls
 ```

-There should be one or more level-zero devices. Please confirm that at least one GPU is present, like **[ext_oneapi_level_zero:gpu:0]**.
+- **Intel GPU**
+
+When targeting an intel GPU, the user should expect one or more level-zero devices among the available SYCL devices. Please make sure that at least one GPU is present, for instance [`ext_oneapi_level_zero:gpu:0`] in the sample output below:

-Output (example):
 ```
 [opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2  [2023.16.10.0.17_160000]
 [opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
 [opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO  [23.30.26918.50]
 [ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
-
 ```

-2. Build locally:
+- **Nvidia GPU**

-Note:
- You can choose between **F16** and **F32** build. F16 is faster for long-prompt inference.
- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only.
+Similarly, user targetting Nvidia GPUs should expect at least one SYCL-CUDA device [`ext_oneapi_cuda:gpu`] as bellow:
+```
+[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2  [2023.16.12.0.12_195853.xmain-hotfix]
+[opencl:cpu:1] Intel(R) OpenCL, Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz OpenCL 3.0 (Build 0) [2023.16.12.0.12_195853.xmain-hotfix]
+[ext_oneapi_cuda:gpu:0] NVIDIA CUDA BACKEND, NVIDIA A100-PCIE-40GB 8.0 [CUDA 12.2]
+```

+### II. Build llama.cpp
+
+#### Intel GPU
 ```sh
-mkdir -p build
-cd build
+# Export relevant ENV variables
 source /opt/intel/oneapi/setvars.sh

-# For FP16:
-#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
+# Build LLAMA with MKL BLAS acceleration for intel GPU
+mkdir -p build && cd build

-# Or, for FP32:
+# Option 1: Use FP16 for better performance in long-prompt  inference
+cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
+
+# Option 2: Use FP32 by default
 cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
-
-# For Nvidia GPUs
-cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
-
-# Build example/main only
-#cmake --build . --config Release --target main
-
-# Or, build all binary
-cmake --build . --config Release -v
-
-cd ..
 ```

-or
-
+#### Nvidia GPU
 ```sh
-./examples/sycl/build.sh
+# Export relevant ENV variables
+export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LD_LIBRARY_PATH
+export LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LIBRARY_PATH
+export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_DIR
+export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
+
+# Build LLAMA with Nvidia BLAS acceleration through SYCL
+mkdir -p build && cd build
+
+# Option 1: Use FP16 for better performance in long-prompt  inference
+cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
+
+# Option 2: Use FP32 by default
+cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 ```

-### Run
+### III. Run the inference

-1. Put model file to folder **models**
+1. Retrieve and prepare model

-You could download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) as example.
+You can refer to the general [*Prepare and Quantize*](README#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example.

 2. Enable oneAPI running environment

-```
+```sh
 source /opt/intel/oneapi/setvars.sh
 ```

-3. List device ID
+3. List devices information

-Run without parameter:
+Similar to the native `sycl-ls`, available SYCL devices can be queried as follow:

 ```sh
 ./build/bin/ls-sycl-device
-
-# or running the "main" executable and look at the output log:
-
-./build/bin/main
 ```
-
-Check the ID in startup log, like:
-
+A example of such log in a system with 1 *intel CPU* and 1 *intel GPU* can look like the following:
 ```
 found 6 SYCL devices:
 |  |                  |                                             |Compute   |Max compute|Max work|Max sub|               |
@ -270,15 +316,15 @@ found 6 SYCL devices:

 |Attribute|Note|
 |-|-|
-|compute capability 1.3|Level-zero running time, recommended |
-|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
+|compute capability 1.3|Level-zero driver/runtime, recommended |
+|compute capability 3.0|OpenCL driver/runtime, slower than level-zero in most cases|

-4. Device selection and execution of llama.cpp
+4. Launch inference

 There are two device selection modes:

- Single device: Use one device assigned by user.
- Multiple devices: Automatically choose the devices with the same biggest Max compute units.
+- Single device: Use one device target specified by the user.
+- Multiple devices: Automatically select the devices with the same largest Max compute-units.

 |Device selection|Parameter|
 |-|-|
@ -303,74 +349,64 @@ or run by script:
 ```sh
 ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
 ```
-or run by script:
+
+Otherwise, you can run the script:

 ```sh
 ./examples/sycl/run_llama2.sh
 ```

-Note:
+*Notes:*

- By default, mmap is used to read model file. In some cases, it leads to the hang issue. Recommend to use parameter **--no-mmap** to disable mmap() to skip this issue.
+- By default, `mmap` is used to read the model file. In some cases, it causes runtime hang issues. Please disable it by passing `--no-mmap` to the `/bin/main` if faced with the issue.
+- Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow:

-
-5. Verify the device ID in output
-
-Verify to see if the selected GPU is shown in the output, like:
-
-```
+```sh
 detect 1 SYCL GPUs: [0] with top Max compute units:512
 ```
 Or
-```
+```sh
 use 1 SYCL GPUs: [0] with Max compute units:512
 ```

-
 ## Windows

-### Setup Environment
+### I. Setup Environment

-1. Install Intel GPU driver.
+1. Install GPU driver

-Please install Intel GPU driver by official guide: [Install GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html).
+Intel GPU drivers instructions guide and download page can be found here: [Get intel GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html).

-Note: **The driver is mandatory for compute function**.
+2. Install Visual Studio

-2. Install Visual Studio.
+If you already have a recent version of Microsoft Visual Studio, you can skip this step. Otherwise, please refer to the official download page for [Microsoft Visual Studio](https://visualstudio.microsoft.com/).

-Please install [Visual Studio](https://visualstudio.microsoft.com/) which impact oneAPI environment enabling in Windows.
+3. Install Intel® oneAPI Base toolkit

-3. Install Intel® oneAPI Base toolkit.
+The base toolkit can be obtained from the official [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) page.

-a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
+Please follow the instructions for downloading and installing the Toolkit for Windows, and preferably keep the default installation values unchanged, notably the installation path *(`C:\Program Files (x86)\Intel\oneAPI` by default)*.

-Recommend to install to default folder: **C:\Program Files (x86)\Intel\oneAPI**.
-
-Following guide uses the default folder as example. If you use other folder, please modify the following guide info with your folder.
+Following guidelines/code snippets assume the default installation values. Otherwise, please make sure the necessary changes are reflected where applicable.

 b. Enable oneAPI running environment:

- In Search, input 'oneAPI'.
+- Type "oneAPI" in the search bar, then open the `Intel oneAPI command prompt for Intel 64 for Visual Studio 2022` App.

-Search & open "Intel oneAPI command prompt for Intel 64 for Visual Studio 2022"
-
- In Run:
-
-In CMD:
+- On the command prompt, enable the runtime environment with the following:
 ```
 "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
 ```

-c. Check GPU
+c. Verify installation

-In oneAPI command line:
+In the oneAPI command line, run the following to print the available SYCL devices:

 ```
 sycl-ls
 ```

-There should be one or more level-zero devices. Please confirm that at least one GPU is present, like **[ext_oneapi_level_zero:gpu:0]**.
+There should be one or more *level-zero* GPU devices displayed as **[ext_oneapi_level_zero:gpu]**. Below is example of such output detecting an *intel Iris Xe* GPU as a Level-zero SYCL device:

 Output (example):
 ```
@ -380,7 +416,7 @@ Output (example):
 [ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Iris(R) Xe Graphics 1.3 [1.3.28044]
 ```

-4. Install cmake & make
+4. Install build tools

 a. Download & install cmake for Windows: https://cmake.org/download/

@ -390,76 +426,53 @@ b. Download & install mingw-w64 make for Windows provided by w64devkit

 - Extract `w64devkit` on your pc.

- Add the **bin** folder path in the Windows system PATH environment, like `C:\xxx\w64devkit\bin\`.
+- Add the **bin** folder path in the Windows system PATH environment (for e.g. `C:\xxx\w64devkit\bin\`).

-### Build locally:
+### II. Build llama.cpp

-In oneAPI command line window:
+On the oneAPI command line window, step into the llama.cpp main directory and run the following:

 ```
 mkdir -p build
 cd build
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force

-::  for FP16
-::  faster for long-prompt inference
-::  cmake -G "MinGW Makefiles" ..  -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
+cmake -G "MinGW Makefiles" ..  -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON

-::  for FP32
-cmake -G "MinGW Makefiles" ..  -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release
-
-
-::  build example/main only
-::  make main
-
-::  build all binary
-make -j
-cd ..
+make
 ```

-or
-
-```
+Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former instructions:
+```sh
 .\examples\sycl\win-build-sycl.bat
 ```

-Note:
+*Notes:*

- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only.
+- By default, calling `make` will build all target binary files. In case of a minimal experimental setup, the user can build the inference executable only through `make main`.

-### Run
+### III. Run the inference

-1. Put model file to folder **models**
+1. Retrieve and prepare model

-You could download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) as example.
+You can refer to the general [*Prepare and Quantize*](README#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example.

 2. Enable oneAPI running environment

- In Search, input 'oneAPI'.
-
-Search & open "Intel oneAPI command prompt for Intel 64 for Visual Studio 2022"
-
- In Run:
-
-In CMD:
+On the oneAPI command line window, run the following and step into the llama.cpp directory:
 ```
 "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
 ```

-3. List device ID
+3. List devices information

-Run without parameter:
+Similar to the native `sycl-ls`, available SYCL devices can be queried as follow:

 ```
 build\bin\ls-sycl-device.exe
-
-or
-
-build\bin\main.exe
 ```

-Check the ID in startup log, like:
-
+The output of this command in a system with 1 *intel CPU* and 1 *intel GPU* would look like the following:
 ```
 found 6 SYCL devices:
 |  |                  |                                             |Compute   |Max compute|Max work|Max sub|               |
@ -480,7 +493,7 @@ found 6 SYCL devices:
 |compute capability 3.0|OpenCL running time, slower than level-zero in most cases|


-4. Device selection and execution of llama.cpp
+4. Launch inference

 There are two device selection modes:

@ -505,7 +518,7 @@ build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be
 ```
 build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
 ```
-or run by script:
+Otherwise, run the following wrapper script:

 ```
 .\examples\sycl\win-run-llama2.bat
@ -513,19 +526,14 @@ or run by script:

 Note:

- By default, mmap is used to read model file. In some cases, it leads to the hang issue. Recommend to use parameter **--no-mmap** to disable mmap() to skip this issue.
+- By default, `mmap` is used to read the model file. In some cases, it causes runtime hang issues. Please disable it by passing `--no-mmap` to the `main.exe` if faced with the issue.
+- Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow:

-
-
-5. Verify the device ID in output
-
-Verify to see if the selected GPU is shown in the output, like:
-
-```
+```sh
 detect 1 SYCL GPUs: [0] with top Max compute units:512
 ```
 Or
-```
+```sh
 use 1 SYCL GPUs: [0] with Max compute units:512
 ```

@ -535,64 +543,54 @@ use 1 SYCL GPUs: [0] with Max compute units:512

 |Name|Value|Function|
 |-|-|-|
-|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path. <br>For FP32/FP16, LLAMA_SYCL=ON is mandatory.|
-|LLAMA_SYCL_F16|ON (optional)|Enable FP16 build with SYCL code path. Faster for long-prompt inference. <br>For FP32, not set it.|
-|CMAKE_C_COMPILER|icx|Use icx compiler for SYCL code path|
-|CMAKE_CXX_COMPILER|icpx (Linux), icx (Windows)|use icpx/icx for SYCL code path|
-
-#### Running
+|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path.|
+|LLAMA_SYCL_TARGET | INTEL *(default)* \| NVIDIA|Set the SYCL target device type.|
+|LLAMA_SYCL_F16|OFF *(default)* \|ON *(optional)*|Enable FP16 build with SYCL code path.|
+|CMAKE_C_COMPILER|icx|Set *icx* compiler for SYCL code path.|
+|CMAKE_CXX_COMPILER|icpx *(Linux)*, icx *(Windows)*|Set `icpx/icx` compiler for SYCL code path.|

+#### Runtime

 |Name|Value|Function|
 |-|-|-|
 |GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
 |ZES_ENABLE_SYSMAN| 0 (default) or 1|Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer|

-## Known Issue
+## Known Issues

- Hang during startup
+- Hanging during startup

-  llama.cpp use mmap as default way to read model file and copy to GPU. In some system, memcpy will be abnormal and block.
+  llama.cpp uses *mmap* as the default mode for reading the model file and copying it to the GPU. In some systems, `memcpy` might behave abnormally and therefore hang.

-  Solution: add **--no-mmap** or **--mmap 0**.
+  - **Solution**: add `--no-mmap` or `--mmap 0` flag to the `main` executable.

- Split-mode: [row] is not supported
-
-  It's on developing.
+- `Split-mode:[row]` is not supported.

 ## Q&A

-Note: please add prefix **[SYCL]** in issue title, so that we will check it as soon as possible.
-
-
 - Error:  `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.

-  Miss to enable oneAPI running environment.
+  - Potential cause: Unavailable oneAPI installation or not set ENV variables.
+  - Solution: Install *oneAPI base toolkit* and enable its ENV through: `source /opt/intel/oneapi/setvars.sh`.

-  Install oneAPI base toolkit and enable it by: `source /opt/intel/oneapi/setvars.sh`.
+- General compiler error:

- In Windows, no result, not error.
+  - Remove build folder or try a clean-build.

-  Miss to enable oneAPI running environment.
+- I can **not** see `[ext_oneapi_level_zero:gpu]` afer installing the GPU driver on Linux.

- Meet compile error.
+  Please double-check with `sudo sycl-ls`.

-  Remove folder **build** and try again.
-
- I can **not** see **[ext_oneapi_level_zero:gpu:0]** afer install GPU driver in Linux.
-
-  Please run **sudo sycl-ls**.
-
-  If you see it in result, please add video/render group to your ID:
+  If it's present in the list, please add video/render group to your user then **logout/login** or restart your system:

  ```
-  sudo usermod -aG render username
-  sudo usermod -aG video username
+  sudo usermod -aG render $USER
+  sudo usermod -aG video $USER
  ```
+  Otherwise, please double-check the GPU driver installation steps.

-  Then **relogin**.
-
-  If you do not see it, please check the installation GPU steps again.
+### **GitHub contribution**:
+Please add the **[SYCL]** prefix/tag in issues/PRs titles to help the SYCL-team check/address them without delay.

 ## Todo

--- a/README.md
+++ b/README.md
@ -18,12 +18,12 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ### Hot topics

+- Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404
 - Fix major bug in Metal batched inference https://github.com/ggerganov/llama.cpp/pull/6225
 - Multi-GPU pipeline parallelizm support https://github.com/ggerganov/llama.cpp/pull/6017
 - Looking for contributions to add Deepseek support: https://github.com/ggerganov/llama.cpp/issues/5981
 - Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962
 - Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328
- Support loading sharded model, using `gguf-split` CLI https://github.com/ggerganov/llama.cpp/pull/6187

 ----

@ -115,6 +115,7 @@ Typically finetunes of the base models below are supported as well.
 - [x] [CodeShell](https://github.com/WisdomShell/codeshell)
 - [x] [Gemma](https://ai.google.dev/gemma)
 - [x] [Mamba](https://github.com/state-spaces/mamba)
+- [x] [Xverse](https://huggingface.co/models?search=xverse)
 - [x] [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-v01)

 **Multimodal models:**
@ -175,6 +176,9 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
 - [Msty](https://msty.app) (proprietary)
 - [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
+- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file)(Apachev2.0 or later)
+
+*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*

 ---

@ -632,15 +636,6 @@ Building the program with BLAS support may lead to some performance improvements

 - #### Vulkan

-> [!WARNING]
->
-> Vulkan support has been broken in https://github.com/ggerganov/llama.cpp/pull/6122
-> due to relying on `GGML_OP_GET_ROWS` which is not yet properly supported by the Vulkan backend,
-> but should be fixed relatively soon (possibly in https://github.com/ggerganov/llama.cpp/pull/6155
-> (ref: https://github.com/ggerganov/llama.cpp/pull/6122#issuecomment-2015327635)).
->
-> Meanwhile, if you want to use the Vulkan backend, you should use the commit right before the breaking change, https://github.com/ggerganov/llama.cpp/commit/55c1b2a3bbd470e9e2a3a0618b92cf64a885f806
-
  **With docker**:

  You don't need to install Vulkan SDK. It will be installed inside the container.
--- a/ci/run.sh
+++ b/ci/run.sh
@ -575,7 +575,7 @@ function gg_run_embd_bge_small {
    cd ${SRC}

    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json
-    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/tokenizer.model
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer_config.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/special_tokens_map.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/pytorch_model.bin
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@ -23,7 +23,7 @@ if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf

-from convert import LlamaHfVocab
+from convert import LlamaHfVocab, permute


 ###### MODEL DEFINITIONS ######
@ -773,6 +773,148 @@ class BaichuanModel(Model):
        return weights[r * n_part:r * n_part + r, ...]


+@Model.register("XverseForCausalLM")
+class XverseModel(Model):
+    model_arch = gguf.MODEL_ARCH.XVERSE
+
+    def set_vocab(self):
+        assert (self.dir_model / "tokenizer.json").is_file()
+        dir_model = self.dir_model
+        hparams = self.hparams
+
+        tokens: list[bytearray] = []
+        toktypes: list[int] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(dir_model)
+        vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
+        assert max(tokenizer.vocab.values()) < vocab_size
+
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
+        added_vocab = tokenizer.get_added_vocab()
+
+        for token_id in range(vocab_size):
+            token_text = reverse_vocab[token_id].encode('utf-8')
+            # replace "\x00" to string with length > 0
+            if token_text == b"\x00":
+                toktype = gguf.TokenType.BYTE  # special
+                token_text = f"<{token_text}>".encode('utf-8')
+            elif re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
+                toktype = gguf.TokenType.BYTE  # special
+            elif reverse_vocab[token_id] in added_vocab:
+                if tokenizer.added_tokens_decoder[token_id].special:
+                    toktype = gguf.TokenType.CONTROL
+                else:
+                    toktype = gguf.TokenType.USER_DEFINED
+            else:
+                toktype = gguf.TokenType.NORMAL
+
+            tokens.append(token_text)
+            toktypes.append(toktype)
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
+        head_count = self.hparams["num_attention_heads"]
+        head_count_kv = self.hparams.get("num_key_value_heads", head_count)
+        hf_repo = self.hparams.get("_name_or_path", "")
+
+        ctx_length = 0
+        if "max_sequence_length" in self.hparams:
+            ctx_length = self.hparams["max_sequence_length"]
+        elif "max_position_embeddings" in self.hparams:
+            ctx_length = self.hparams["max_position_embeddings"]
+        elif "model_max_length" in self.hparams:
+            ctx_length = self.hparams["model_max_length"]
+        else:
+            print("gguf: can not find ctx length parameter.")
+            sys.exit()
+
+        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_source_hf_repo(hf_repo)
+        self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
+        self.gguf_writer.add_context_length(ctx_length)
+        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
+        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+        self.gguf_writer.add_head_count(head_count)
+        self.gguf_writer.add_head_count_kv(head_count_kv)
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+
+    def write_tensors(self):
+        # Collect tensors from generator object
+        model_kv = dict(self.get_tensors())
+        block_count = self.hparams["num_hidden_layers"]
+        head_count = self.hparams["num_attention_heads"]
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+        head_count_kv = self.hparams.get("num_key_value_heads", head_count)
+
+        for name, data_torch in model_kv.items():
+            # we don't need these
+            if name.endswith(".rotary_emb.inv_freq"):
+                continue
+
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            # HF models permute some of the tensors, so we need to undo that
+            if name.endswith(("q_proj.weight")):
+                data_torch = self._reverse_hf_permute(data_torch, head_count, head_count)
+            if name.endswith(("k_proj.weight")):
+                data_torch = self._reverse_hf_permute(data_torch, head_count, head_count_kv)
+
+            data = data_torch.squeeze().numpy()
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # if f32 desired, convert any float16 to float32
+            if self.ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+            self.gguf_writer.add_tensor(new_name, data)
+
+    def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
+        if n_kv_head is not None and n_head != n_kv_head:
+            n_head //= n_kv_head
+
+        return (
+            weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+            .swapaxes(1, 2)
+            .reshape(weights.shape)
+        )
+
+
@Model.register("FalconForCausalLM", "RWForCausalLM")
 class FalconModel(Model):
    model_arch = gguf.MODEL_ARCH.FALCON
@ -1052,12 +1194,72 @@ class StableLMModel(Model):
        self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))


-@Model.register("MixtralForCausalLM")
-class MixtralModel(Model):
+@Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
+class LlamaModel(Model):
    model_arch = gguf.MODEL_ARCH.LLAMA

    def set_vocab(self):
+        try:
            self. _set_vocab_sentencepiece()
+        except FileNotFoundError:
+            self._set_vocab_llama_hf()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
+
+    # Same as super class, but permuting q_proj, k_proj
+    def write_tensors(self):
+        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+        n_head = self.hparams.get("num_attention_heads")
+        n_kv_head = self.hparams.get("num_key_value_heads")
+        for name, data_torch in self.get_tensors():
+            # we don't need these
+            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
+                continue
+
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            data = data_torch.numpy()
+
+            if name.endswith("q_proj.weight"):
+                data = permute(data, n_head, n_head)
+            if name.endswith("k_proj.weight"):
+                data = permute(data, n_head, n_kv_head)
+
+            data = data.squeeze()
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # if f32 desired, convert any float16 to float32
+            if self.ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+
+            self.gguf_writer.add_tensor(new_name, data)


@Model.register("GrokForCausalLM")
--- a/examples/gguf-split/gguf-split.cpp
+++ b/examples/gguf-split/gguf-split.cpp
@ -28,9 +28,11 @@ enum split_operation : uint8_t {

 struct split_params {
    split_operation operation = SPLIT_OP_SPLIT;
+    size_t n_bytes_split = 0;
    int n_split_tensors = 128;
    std::string input;
    std::string output;
+    bool dry_run = false;
 };

 static void split_print_usage(const char * executable) {
@ -43,13 +45,34 @@ static void split_print_usage(const char * executable) {
    printf("options:\n");
    printf("  -h, --help              show this help message and exit\n");
    printf("  --version               show version and build info\n");
-    printf("  --split               split GGUF to multiple GGUF (default)\n");
-    printf("  --split-max-tensors   max tensors in each split: default(%d)\n", default_params.n_split_tensors);
+    printf("  --split                 split GGUF to multiple GGUF (enabled by default)\n");
    printf("  --merge                 merge multiple GGUF to a single GGUF\n");
+    printf("  --split-max-tensors     max tensors in each split (default: %d)\n", default_params.n_split_tensors);
+    printf("  --split-max-size N(M|G) max size per split\n");
+    printf("  --dry-run               only print out a split plan and exit, without writing any new files\n");
    printf("\n");
 }

-static bool split_params_parse_ex(int argc, const char ** argv, split_params & params) {
+// return convert string, for example "128M" or "4G" to number of bytes
+static size_t split_str_to_n_bytes(std::string str) {
+    size_t n_bytes = 0;
+    int n;
+    if (str.back() == 'M') {
+        sscanf(str.c_str(), "%d", &n);
+        n_bytes = n * 1024 * 1024; // megabytes
+    } else if (str.back() == 'G') {
+        sscanf(str.c_str(), "%d", &n);
+        n_bytes = n * 1024 * 1024 * 1024; // gigabytes
+    } else {
+        throw std::invalid_argument("error: supported units are M (megabytes) or G (gigabytes), but got: " + std::string(1, str.back()));
+    }
+    if (n <= 0) {
+        throw std::invalid_argument("error: size must be a positive value");
+    }
+    return n_bytes;
+}
+
+static void split_params_parse_ex(int argc, const char ** argv, split_params & params) {
    std::string arg;
    const std::string arg_prefix = "--";
    bool invalid_param = false;
@ -62,6 +85,8 @@ static bool split_params_parse_ex(int argc, const char ** argv, split_params & p
        }

        bool arg_found = false;
+        bool is_op_set = false;
+        bool is_mode_set = false;
        if (arg == "-h" || arg == "--help") {
            split_print_usage(argv[0]);
            exit(0);
@ -71,23 +96,46 @@ static bool split_params_parse_ex(int argc, const char ** argv, split_params & p
            fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
            exit(0);
        }
+        if (arg == "--dry-run") {
+            arg_found = true;
+            params.dry_run = true;
+        }

+        if (is_op_set) {
+            throw std::invalid_argument("error: either --split or --merge can be specified, but not both");
+        }
        if (arg == "--merge") {
            arg_found = true;
+            is_op_set = true;
            params.operation = SPLIT_OP_MERGE;
        }
        if (arg == "--split") {
            arg_found = true;
+            is_op_set = true;
            params.operation = SPLIT_OP_SPLIT;
        }
+
+        if (is_mode_set) {
+            throw std::invalid_argument("error: either --split-max-tensors or --split-max-size can be specified, but not both");
+        }
        if (arg == "--split-max-tensors") {
            if (++arg_idx >= argc) {
                invalid_param = true;
                break;
            }
            arg_found = true;
+            is_mode_set = true;
            params.n_split_tensors = atoi(argv[arg_idx]);
        }
+        if (arg == "--split-max-size") {
+            if (++arg_idx >= argc) {
+                invalid_param = true;
+                break;
+            }
+            arg_found = true;
+            is_mode_set = true;
+            params.n_bytes_split = split_str_to_n_bytes(argv[arg_idx]);
+        }

        if (!arg_found) {
            throw std::invalid_argument("error: unknown argument: " + arg);
@ -99,24 +147,17 @@ static bool split_params_parse_ex(int argc, const char ** argv, split_params & p
    }

    if (argc - arg_idx < 2) {
-        printf("%s: bad arguments\n", argv[0]);
-        split_print_usage(argv[0]);
-        return false;
+        throw std::invalid_argument("error: bad arguments");
    }

    params.input = argv[arg_idx++];
    params.output = argv[arg_idx++];
-
-    return true;
 }

 static bool split_params_parse(int argc, const char ** argv, split_params & params) {
    bool result = true;
    try {
-        if (!split_params_parse_ex(argc, argv, params)) {
-            split_print_usage(argv[0]);
-            exit(EXIT_FAILURE);
-        }
+        split_params_parse_ex(argc, argv, params);
    }
    catch (const std::invalid_argument & ex) {
        fprintf(stderr, "%s\n", ex.what());
@ -140,15 +181,11 @@ struct split_strategy {
    struct ggml_context * ctx_meta = NULL;
    const int n_tensors;

-    const int n_split;
-    int i_split = 0;
+    // one ctx_out per one output file
+    std::vector<struct gguf_context *> ctx_outs;

-    int i_tensor = 0;
-
-    std::vector<uint8_t> read_data;
-
-    struct gguf_context * ctx_out;
-    std::ofstream fout;
+    // temporary buffer for reading in tensor data
+    std::vector<uint8_t> read_buf;

    split_strategy(const split_params & params,
            std::ifstream & f_input,
@ -158,79 +195,141 @@ struct split_strategy {
        f_input(f_input),
        ctx_gguf(ctx_gguf),
        ctx_meta(ctx_meta),
-        n_tensors(gguf_get_n_tensors(ctx_gguf)),
-        n_split(std::ceil(1. * n_tensors / params.n_split_tensors)) {
-        }
+        n_tensors(gguf_get_n_tensors(ctx_gguf)) {

-    bool should_split() const {
-        return i_tensor < n_tensors && i_tensor % params.n_split_tensors == 0;
+        // because we need to know list of tensors for each file in advance, we will build all the ctx_out for all output splits
+        int i_split = -1;
+        struct gguf_context * ctx_out = NULL;
+        auto new_ctx_out = [&]() {
+            i_split++;
+            if (ctx_out != NULL) {
+                if (gguf_get_n_tensors(ctx_out) == 0) {
+                    fprintf(stderr, "error: one of splits have 0 tensors. Maybe size or tensors limit is too small\n");
+                    exit(EXIT_FAILURE);
+                }
+                ctx_outs.push_back(ctx_out);
            }
-
-    void split_start() {
            ctx_out = gguf_init_empty();
-
            // Save all metadata in first split only
            if (i_split == 0) {
                gguf_set_kv(ctx_out, ctx_gguf);
            }
            gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_NO, i_split);
-        gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_COUNT, n_split);
+            gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_COUNT, 0); // placeholder
            gguf_set_val_i32(ctx_out, LLM_KV_SPLIT_TENSORS_COUNT, n_tensors);
+        };

-        // populate the original tensors, so we get an initial metadata
-        for (int i = i_split * params.n_split_tensors; i < n_tensors && i < (i_split + 1) * params.n_split_tensors; ++i) {
-            struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
-            gguf_add_tensor(ctx_out, meta);
+        // initialize ctx_out for the first split
+        new_ctx_out();
+
+        // process tensors one by one
+        size_t curr_tensors_size = 0; // current size by counting only tensors size (without metadata)
+        for (int i = 0; i < n_tensors; ++i) {
+            struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
+            // calculate the "imaginary" size = the current size + next tensor size
+            size_t n_bytes = GGML_PAD(ggml_nbytes(t), GGUF_DEFAULT_ALIGNMENT);
+            size_t next_tensors_size = curr_tensors_size + n_bytes;
+            if (should_split(i, next_tensors_size)) {
+                new_ctx_out();
+                curr_tensors_size = n_bytes;
+            } else {
+                curr_tensors_size = next_tensors_size;
+            }
+            gguf_add_tensor(ctx_out, t);
        }

+        // push the last ctx_out
+        ctx_outs.push_back(ctx_out);
+
+        // set the correct n_split for all ctx_out
+        for (auto & ctx : ctx_outs) {
+            gguf_set_val_u16(ctx, LLM_KV_SPLIT_COUNT, ctx_outs.size());
+        }
+    }
+
+    ~split_strategy() {
+        for (auto & ctx_out : ctx_outs) {
+            gguf_free(ctx_out);
+        }
+    }
+
+    bool should_split(int i_tensor, size_t next_size) {
+        if (params.n_bytes_split > 0) {
+            // split by max size per file
+            return next_size > params.n_bytes_split;
+        } else {
+            // split by number of tensors per file
+            return i_tensor > 0 && i_tensor < n_tensors && i_tensor % params.n_split_tensors == 0;
+        }
+    }
+
+    void print_info() {
+        printf("n_split: %ld\n", ctx_outs.size());
+        int i_split = 0;
+        for (auto & ctx_out : ctx_outs) {
+            // re-calculate the real gguf size for each split (= metadata size + total size of all tensors)
+            size_t total_size = gguf_get_meta_size(ctx_out);
+            for (int i = 0; i < gguf_get_n_tensors(ctx_out); ++i) {
+                struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_out, i));
+                total_size += ggml_nbytes(t);
+            }
+            total_size = total_size / 1024 / 1024; // convert to megabytes
+            printf("split %05d: n_tensors = %d, total_size = %ldM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
+            i_split++;
+        }
+    }
+
+    void write() {
+        int i_split = 0;
+        int n_split = ctx_outs.size();
+        for (auto & ctx_out : ctx_outs) {
+            // construct file path
            char split_path[PATH_MAX] = {0};
            llama_split_path(split_path, sizeof(split_path), params.output.c_str(), i_split, n_split);

-        fprintf(stderr, "%s: %s ...", __func__, split_path);
-        fout = std::ofstream(split_path, std::ios::binary);
+            // open the output file
+            printf("Writing file %s ... ", split_path);
+            fflush(stdout);
+            std::ofstream fout = std::ofstream(split_path, std::ios::binary);
            fout.exceptions(std::ofstream::failbit); // fail fast on write errors

-        auto meta_size = gguf_get_meta_size(ctx_out);
-
-        // placeholder for the meta data
-        ::zeros(fout, meta_size);
-
-        i_split++;
-    }
-
-    void next_tensor() {
-        const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor);
-        struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name);
-        auto n_bytes = ggml_nbytes(t);
-
-        if (read_data.size() < n_bytes) {
-            read_data.resize(n_bytes);
-        }
-
-        auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor);
-        f_input.seekg(offset);
-        f_input.read((char *)read_data.data(), n_bytes);
-
-        t->data = read_data.data();
-
-        // write tensor data + padding
-        fout.write((const char *)t->data, n_bytes);
-        zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes);
-
-        i_tensor++;
-    }
-
-    void split_end() {
-        // go back to beginning of file and write the updated metadata
-        fout.seekp(0);
+            // write metadata
            std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
            gguf_get_meta_data(ctx_out, data.data());
            fout.write((const char *)data.data(), data.size());

-        fout.close();
-        gguf_free(ctx_out);
+            // write tensors
+            for (int i = 0; i < gguf_get_n_tensors(ctx_out); ++i) {
+                // read tensor meta and prepare buffer
+                const char * t_name = gguf_get_tensor_name(ctx_out, i);
+                struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name);
+                auto n_bytes = ggml_nbytes(t);
+                read_buf.resize(n_bytes);

-        fprintf(stderr, "\033[3Ddone\n");
+                // calculate offset
+                auto i_tensor_in = gguf_find_tensor(ctx_gguf, t_name); // idx of tensor in the input file
+                auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor_in);
+
+                // copy tensor from input to output file
+                copy_file_to_file(f_input, fout, offset, n_bytes);
+                zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes);
+            }
+
+            printf("done\n");
+            // close the file
+            fout.close();
+            i_split++;
+        }
+    }
+
+    void copy_file_to_file(std::ifstream & f_in, std::ofstream & f_out, const size_t in_offset, const size_t len) {
+        // TODO: detect OS and use copy_file_range() here for better performance
+        if (read_buf.size() < len) {
+            read_buf.resize(len);
+        }
+        f_in.seekg(in_offset);
+        f_in.read((char *)read_buf.data(), len);
+        f_out.write((const char *)read_buf.data(), len);
    }
 };

@ -254,32 +353,22 @@ static void gguf_split(const split_params & split_params) {
        exit(EXIT_FAILURE);
    }

+    // prepare the strategy
    split_strategy strategy(split_params, f_input, ctx_gguf, ctx_meta);
+    int n_split = strategy.ctx_outs.size();
+    strategy.print_info();

-    char first_split_path[PATH_MAX] = {0};
-    llama_split_path(first_split_path, sizeof(first_split_path),
-                     split_params.output.c_str(), strategy.i_split, strategy.n_split);
-    fprintf(stderr, "%s: %s -> %s (%d tensors per file)\n",
-            __func__, split_params.input.c_str(),
-            first_split_path,
-            split_params.n_split_tensors);
-
-    strategy.split_start();
-
-    while (strategy.i_tensor < strategy.n_tensors) {
-        strategy.next_tensor();
-        if (strategy.should_split()) {
-            strategy.split_end();
-            strategy.split_start();
+    if (!split_params.dry_run) {
+        // write all output splits
+        strategy.write();
    }
-    }
-    strategy.split_end();

+    // done, clean up
    gguf_free(ctx_gguf);
    f_input.close();

    fprintf(stderr, "%s: %d gguf split written with a total of %d tensors.\n",
-            __func__, strategy.n_split, strategy.n_tensors);
+            __func__, n_split, strategy.n_tensors);
 }

 static void gguf_merge(const split_params & split_params) {
@ -448,10 +537,6 @@ static void gguf_merge(const split_params & split_params) {
 }

 int main(int argc, const char ** argv) {
-    if (argc < 3) {
-        split_print_usage(argv[0]);
-    }
-
    split_params params;
    split_params_parse(argc, argv, params);

--- a/flake.lock
+++ b/flake.lock
@ -20,11 +20,11 @@
    },
    "nixpkgs": {
      "locked": {
-        "lastModified": 1711163522,
-        "narHash": "sha256-YN/Ciidm+A0fmJPWlHBGvVkcarYWSC+s3NTPk/P+q3c=",
+        "lastModified": 1711703276,
+        "narHash": "sha256-iMUFArF0WCatKK6RzfUJknjem0H9m4KgorO/p3Dopkk=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "44d0940ea560dee511026a53f0e2e2cde489b4d4",
+        "rev": "d8fe5e6c92d0d190646fb9f1056741a229980089",
        "type": "github"
      },
      "original": {
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@ -705,9 +705,14 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
        struct ggml_tensor * leaf = graph->leafs[i];
        struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
        galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
+        if (leaf->view_src || leaf->data) {
+            galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
+            galloc->leaf_allocs[i].leaf.size_max = 0;
+        } else {
            galloc->leaf_allocs[i].leaf.offset = hn->offset;
            galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
        }
+    }

    // reallocate buffers if needed
    for (int i = 0; i < galloc->n_buffers; i++) {
--- a/ggml-common.h
+++ b/ggml-common.h
@ -447,10 +447,11 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_

 #define GGML_COMMON_IMPL
 #elif defined(GGML_COMMON_IMPL_SYCL)
+
 #include <cstdint>

-#define GGML_TABLE_BEGIN(type, name, size) static dpct::global_memory<const type, 1> name(sycl::range<1>(size), {
-#define GGML_TABLE_END() });
+#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
+#define GGML_TABLE_END() };

 #define GGML_COMMON_IMPL
 #endif
--- a/ggml-cuda/common.cuh
+++ b/ggml-cuda/common.cuh
@ -1,7 +1,8 @@
 #pragma once

-#include "../ggml.h"
-#include "../ggml-cuda.h"
+#include "ggml.h"
+#include "ggml-cuda.h"
+
 #include <memory>

 #if defined(GGML_USE_HIPBLAS)
@ -11,7 +12,7 @@
 #define GGML_COMMON_DECL_CUDA
 #define GGML_COMMON_IMPL_CUDA
 #endif
-#include "../ggml-common.h"
+#include "ggml-common.h"

 #include <cstdio>
 #include <array>
--- a/ggml-cuda/dmmv.cu
+++ b/ggml-cuda/dmmv.cu
@ -2,14 +2,6 @@
 #include "dequantize.cuh"
 #include "convert.cuh"

-// dmmv = dequantize_mul_mat_vec
-#ifndef GGML_CUDA_DMMV_X
-#define GGML_CUDA_DMMV_X 32
-#endif
-#ifndef GGML_CUDA_MMV_Y
-#define GGML_CUDA_MMV_Y 1
-#endif
-
 #ifndef K_QUANTS_PER_ITERATION
 #define K_QUANTS_PER_ITERATION 2
 #else
--- a/ggml-cuda/dmmv.cuh
+++ b/ggml-cuda/dmmv.cuh
@ -1,5 +1,16 @@
 #include "common.cuh"

+// dmmv = dequantize_mul_mat_vec
+
+// TODO: remove this?
+#ifndef GGML_CUDA_DMMV_X
+#define GGML_CUDA_DMMV_X 32
+#endif
+
+#ifndef GGML_CUDA_MMV_Y
+#define GGML_CUDA_MMV_Y 1
+#endif
+
 void ggml_cuda_op_dequantize_mul_mat_vec(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@ -8062,7 +8062,7 @@ template <bool need_check> static void
 template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
 static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
                          const sycl::nd_item<3> &item_ct1,
-                          const uint32_t *iq3xxs_grid_ptr, const uint64_t *ksigns64_ptr) {
+                          const uint32_t *iq3xxs_grid_ptr=nullptr, const uint64_t *ksigns64_ptr=nullptr) {
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
                    item_ct1.get_local_id(1);

@ -9939,17 +9939,14 @@ static void dequantize_row_iq2_xxs_sycl(const void *vx, dst_t *y, const int k,
                                        dpct::queue_ptr stream) {
    const int nb = k / QK_K;
    {
-        iq2xxs_grid.init(*stream);
-        ksigns_iq2xs.init(*stream);
-        kmask_iq2xs.init(*stream);

        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});

        stream->submit([&](sycl::handler &cgh) {
-            auto iq2xxs_grid_ptr_ct1 = iq2xxs_grid.get_ptr();
-            auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
-            auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
+            auto iq2xxs_grid_ptr_ct1 = &iq2xxs_grid[0];
+            auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
+            auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];

            cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
                                                   sycl::range<3>(1, 1, 32),
@ -9968,17 +9965,14 @@ static void dequantize_row_iq2_xs_sycl(const void *vx, dst_t *y, const int k,
                                       dpct::queue_ptr stream) {
    const int nb = k / QK_K;
    {
-        iq2xs_grid.init(*stream);
-        ksigns_iq2xs.init(*stream);
-        kmask_iq2xs.init(*stream);

        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});

        stream->submit([&](sycl::handler &cgh) {
-            auto iq2xs_grid_ptr_ct1 = iq2xs_grid.get_ptr();
-            auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
-            auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
+            auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
+            auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
+            auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];

            cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
                                                   sycl::range<3>(1, 1, 32),
@ -9997,17 +9991,14 @@ static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int k,
                                        dpct::queue_ptr stream) {
    const int nb = k / QK_K;
    {
-        iq3xxs_grid.init(*stream);
-        ksigns_iq2xs.init(*stream);
-        kmask_iq2xs.init(*stream);

        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});

        stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
-            auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
+            auto iq3xxs_grid_ptr_ct1 = &iq3xxs_grid[0];
+            auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
+            auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];

            cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
                                                   sycl::range<3>(1, 1, 32),
@ -10026,17 +10017,14 @@ static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int k,
                                        dpct::queue_ptr stream) {
    const int nb = k / QK_K;
    {
-        iq3s_grid.init(*stream);
-        ksigns_iq2xs.init(*stream);
-        kmask_iq2xs.init(*stream);

        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});

        stream->submit([&](sycl::handler &cgh) {
-            auto iq3s_grid_ptr_ct1 = iq3s_grid.get_ptr();
-            auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
-            auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
+            auto iq3s_grid_ptr_ct1 = &iq3s_grid[0];
+            auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
+            auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];

            cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
                                                   sycl::range<3>(1, 1, 32),
@ -10055,17 +10043,14 @@ static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int k,
                                        dpct::queue_ptr stream) {
    const int nb = k / QK_K;
    {
-        iq1s_grid_gpu.init(*stream);
-        ksigns_iq2xs.init(*stream);
-        kmask_iq2xs.init(*stream);

        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});

        stream->submit([&](sycl::handler &cgh) {
-            auto iq1s_grid_ptr_ct1 = iq1s_grid_gpu.get_ptr();
-            auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
-            auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
+            auto iq1s_grid_ptr_ct1 = &iq1s_grid_gpu[0];
+            auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
+            auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];

            cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
                                                   sycl::range<3>(1, 1, 32),
@ -10398,12 +10383,8 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);

        stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();

            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
@ -10411,8 +10392,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
                    [[intel::reqd_sub_group_size(32)]] {
                        mul_mat_vec_q<QK4_0, QI4_0, block_q4_0,
                                      VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
-                            vx, vy, dst, ncols, nrows, item_ct1,
-                            iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+                            vx, vy, dst, ncols, nrows, item_ct1);
                    });
        });
    }
@ -10427,12 +10407,8 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);

        stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();

            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
@ -10440,8 +10416,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
                    [[intel::reqd_sub_group_size(32)]] {
                        mul_mat_vec_q<QK4_0, QI4_1, block_q4_1,
                                      VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
-                            vx, vy, dst, ncols, nrows, item_ct1,
-                            iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+                            vx, vy, dst, ncols, nrows, item_ct1);
                    });
        });
    }
@ -10456,12 +10431,8 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);

        stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();

            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
@ -10469,8 +10440,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
                    [[intel::reqd_sub_group_size(32)]] {
                        mul_mat_vec_q<QK5_0, QI5_0, block_q5_0,
                                      VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
-                            vx, vy, dst, ncols, nrows, item_ct1,
-                            iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+                            vx, vy, dst, ncols, nrows, item_ct1);
                    });
        });
    }
@ -10485,12 +10455,8 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);

        stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();

            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
@ -10498,8 +10464,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
                    [[intel::reqd_sub_group_size(32)]] {
                        mul_mat_vec_q<QK5_1, QI5_1, block_q5_1,
                                      VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
-                            vx, vy, dst, ncols, nrows, item_ct1,
-                            iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+                            vx, vy, dst, ncols, nrows, item_ct1);
                    });
        });
    }
@ -10514,12 +10479,8 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);

        stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();

            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
@ -10527,8 +10488,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
                    [[intel::reqd_sub_group_size(32)]] {
                        mul_mat_vec_q<QK8_0, QI8_0, block_q8_0,
                                      VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
-                            vx, vy, dst, ncols, nrows, item_ct1,
-                            iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+                            vx, vy, dst, ncols, nrows, item_ct1);
                    });
        });
    }
@ -10543,12 +10503,8 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);

        stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();

            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
@ -10556,8 +10512,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
                    [[intel::reqd_sub_group_size(32)]] {
                        mul_mat_vec_q<QK_K, QI2_K, block_q2_K,
                                      VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
-                            vx, vy, dst, ncols, nrows, item_ct1,
-                            iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+                            vx, vy, dst, ncols, nrows, item_ct1);
                    });
        });
    }
@ -10572,12 +10527,8 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);

        stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();

            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
@ -10585,8 +10536,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
                    [[intel::reqd_sub_group_size(32)]] {
                        mul_mat_vec_q<QK_K, QI3_K, block_q3_K,
                                      VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
-                            vx, vy, dst, ncols, nrows, item_ct1,
-                            iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+                            vx, vy, dst, ncols, nrows, item_ct1);
                    });
        });
    }
@ -10601,12 +10551,8 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);

        stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();

            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
@ -10614,8 +10560,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
                    [[intel::reqd_sub_group_size(32)]] {
                        mul_mat_vec_q<QK_K, QI4_K, block_q4_K,
                                      VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
-                            vx, vy, dst, ncols, nrows, item_ct1,
-                            iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+                            vx, vy, dst, ncols, nrows, item_ct1);
                    });
        });
    }
@ -10630,12 +10575,8 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);

        stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();

            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
@ -10643,8 +10584,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
                    [[intel::reqd_sub_group_size(32)]] {
                        mul_mat_vec_q<QK_K, QI5_K, block_q5_K,
                                      VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
-                            vx, vy, dst, ncols, nrows, item_ct1,
-                            iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+                            vx, vy, dst, ncols, nrows, item_ct1);
                    });
        });
    }
@ -10659,12 +10599,8 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);

        stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();

            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
@ -10672,13 +10608,13 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
                    [[intel::reqd_sub_group_size(32)]] {
                        mul_mat_vec_q<QK_K, QI6_K, block_q6_K,
                                      VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
-                            vx, vy, dst, ncols, nrows, item_ct1,
-                            iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1);
+                            vx, vy, dst, ncols, nrows, item_ct1);
                    });
        });
    }
 }

+
 static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
                                          float *dst, const int ncols,
                                          const int nrows,
@ -10688,15 +10624,11 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        iq2xxs_grid.init(*stream);
-        ksigns_iq2xs.init(*stream);
-        kmask_iq2xs.init(*stream);
-

        stream->submit([&](sycl::handler &cgh) {
-            auto iq2xxs_grid_ptr_ct1 = iq2xxs_grid.get_ptr();
-            auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
-            auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
+            auto iq2xxs_grid_ptr_ct1 = &iq2xxs_grid[0];
+            auto ksigns_iq2xs_ptr_ct1 = &ksigns_iq2xs[0];
+            auto kmask_iq2xs_ptr_ct1 = &kmask_iq2xs[0];

            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
@ -10719,12 +10651,10 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        iq2xs_grid.init(*stream);
-        ksigns64.init(*stream);

        stream->submit([&](sycl::handler &cgh) {
-            auto iq2xs_grid_ptr_ct1 = iq2xs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
+            auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
+            auto ksigns64_ptr_ct1 = &ksigns64[0];

            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
@ -10747,12 +10677,10 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        iq3xxs_grid.init(*stream);
-        ksigns64.init(*stream);

        stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
+            auto iq3xxs_grid_ptr_ct1 = &iq3xxs_grid[0];
+            auto ksigns64_ptr_ct1 = &ksigns64[0];

            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
@ -10775,12 +10703,10 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        iq3s_grid.init(*stream);
-        ksigns64.init(*stream);

        stream->submit([&](sycl::handler &cgh) {
-            auto iq3s_grid_ptr_ct1 = iq3s_grid.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
+            auto iq3s_grid_ptr_ct1 = &iq3s_grid[0];
+            auto ksigns64_ptr_ct1 = &ksigns64[0];

            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
@ -10803,12 +10729,10 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        iq1s_grid_gpu.init(*stream);
-        ksigns64.init(*stream);

        stream->submit([&](sycl::handler &cgh) {
-            auto iq1s_grid_ptr_ct1 = iq1s_grid_gpu.get_ptr();
-            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
+            auto iq1s_grid_ptr_ct1 = &iq1s_grid_gpu[0];
+            auto ksigns64_ptr_ct1 = &ksigns64[0];

            cgh.parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
--- a/ggml-vulkan-shaders.hpp
+++ b/ggml-vulkan-shaders.hpp
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
--- a/ggml-vulkan.h
+++ b/ggml-vulkan.h
@ -11,17 +11,6 @@ extern "C" {
 #define GGML_VK_MAX_DEVICES 16

 GGML_API void ggml_vk_instance_init(void);
-GGML_API void ggml_vk_init_cpu_assist(void);
-
-GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node);
-GGML_API void ggml_vk_preallocate_buffers_cpu_assist(void);
-GGML_API void ggml_vk_build_graph_cpu_assist(struct ggml_tensor * node, bool last_node);
-GGML_API bool ggml_vk_compute_forward_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
-#ifdef GGML_VULKAN_CHECK_RESULTS
-void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
-#endif
-GGML_API void ggml_vk_graph_cleanup_cpu_assist(void);
-GGML_API void ggml_vk_free_cpu_assist(void);

 // backend API
 GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);
--- a/ggml.c
+++ b/ggml.c
@ -278,8 +278,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
 #include <Accelerate/Accelerate.h>
 #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
 #include "ggml-opencl.h"
-#elif defined(GGML_USE_VULKAN)
-#include "ggml-vulkan.h"
 #endif
 #elif defined(GGML_USE_OPENBLAS)
 #if defined(GGML_BLAS_USE_MKL)
@ -289,8 +287,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
 #endif
 #elif defined(GGML_USE_CLBLAST)
 #include "ggml-opencl.h"
-#elif defined(GGML_USE_VULKAN)
-#include "ggml-vulkan.h"
 #endif

 // floating point type used to accumulate sums
@ -2717,8 +2713,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {

 #if defined(GGML_USE_CLBLAST)
        ggml_cl_init();
-#elif defined(GGML_USE_VULKAN)
-        ggml_vk_init_cpu_assist();
 #endif

        ggml_setup_op_has_task_pass();
@ -16128,20 +16122,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
        return;
    }

-#if defined(GGML_USE_VULKAN)
-    const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
-#ifdef GGML_VULKAN_CHECK_RESULTS
-    if (skip_cpu) {
-        ggml_vk_check_results_1_cpu_assist(params, tensor);
-    }
-#endif
-    if (skip_cpu) {
-        return;
-    }
-    GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
-    GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
-#endif // GGML_USE_VULKAN
-
    switch (tensor->op) {
        case GGML_OP_DUP:
            {
@ -18617,17 +18597,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
        }
    }

-#ifdef GGML_USE_VULKAN
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_preallocate_buffers_graph_cpu_assist(cgraph->nodes[i]);
-    }
-    ggml_vk_preallocate_buffers_cpu_assist();
-
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_build_graph_cpu_assist(cgraph->nodes[i], i == cgraph->n_nodes - 1);
-    }
-#endif
-
    const int n_threads = cplan->n_threads;

    struct ggml_compute_state_shared state_shared = {
@ -18684,10 +18653,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
        }
    }

-#ifdef GGML_USE_VULKAN
-    ggml_vk_graph_cleanup_cpu_assist();
-#endif
-
    // performance stats (graph)
    {
        int64_t perf_cycles_cur  = ggml_perf_cycles()  - perf_start_cycles;
--- a/ggml_vk_generate_shaders.py
+++ b/ggml_vk_generate_shaders.py
@ -18,6 +18,12 @@ shader_int8_ext = """
 """

 # Type-specific defines
+shader_f32_defines = """
+#define QUANT_K 1
+#define QUANT_R 1
+
+#define A_TYPE float
+"""
 shader_f16_defines = """
 #define QUANT_K 1
 #define QUANT_R 1
@ -157,8 +163,8 @@ struct block_q6_K
 """

 # Dequant functions
-shader_f16_dequant_func = """
-#define DEQUANT_FUNC vec2 v = vec2(data_a[ib + 0], data_a[ib + 1]);
+shader_float_dequant_func = """
+#define DEQUANT_FUNC vec2 v = vec2(ib, ib);  // data_a[ib], data_a[ib + 1]);
 """

 shader_q4_0_dequant_func = """
@ -410,6 +416,133 @@ mulmat_load_q8_0 = """
            buf_a[buf_idx    ] = FLOAT_TYPE(v.x);
            buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);"""

+
+mulmat_load_q2_K = """
+            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
+            const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A;
+
+            const uint ib = idx / 128;                         // 2 values per idx
+            const uint iqs = idx % 128;                        // 0..127
+
+            const uint qsi = (iqs / 64) * 32 + (iqs % 16) * 2; // 0,2,4..30
+            const uint scalesi = iqs / 8;                      // 0..15
+            const uint qsshift = ((iqs % 64) / 16) * 2;        // 0,2,4,6
+
+            const uvec2 qs = uvec2(data_a[ib].qs[qsi], data_a[ib].qs[qsi + 1]);
+            const uint scales = data_a[ib].scales[scalesi];
+            const vec2 d = vec2(data_a[ib].d);
+
+            const vec2 v = d.x * float(scales & 0xF) * vec2((qs >> qsshift) & 3) - d.y * float(scales >> 4);
+
+            buf_a[buf_idx    ] = FLOAT_TYPE(v.x);
+            buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);"""
+
+mulmat_load_q3_K = """
+            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
+            const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A;
+
+            const uint ib = idx / 128;                   // 2 values per idx
+            const uint iqs = idx % 128;                  // 0..127
+
+            const uint n = iqs / 64;                     // 0,1
+            const uint qsi = n * 32 + (iqs % 16) * 2;    // 0,2,4..62
+            const uint hmi =          (iqs % 16) * 2;    // 0,2,4..30
+            const uint j = (iqs % 64) / 4;               // 0..3
+            const uint is = iqs / 8;                     // 0..15
+            const uint halfsplit = ((iqs % 64) / 16);    // 0,1,2,3
+            const uint qsshift = halfsplit * 2;          // 0,2,4,6
+            const uint m = 1 << (4 * n + halfsplit);     // 1,2,4,8,16,32,64,128
+
+            const int8_t us = int8_t(is <  4 ? (data_a[ib].scales[is-0] & 0xF) | (((data_a[ib].scales[is+8] >> 0) & 3) << 4) :
+                                    is <  8 ? (data_a[ib].scales[is-0] & 0xF) | (((data_a[ib].scales[is+4] >> 2) & 3) << 4) :
+                                    is < 12 ? (data_a[ib].scales[is-8] >>  4) | (((data_a[ib].scales[is+0] >> 4) & 3) << 4) :
+                                            (data_a[ib].scales[is-8] >>  4) | (((data_a[ib].scales[is-4] >> 6) & 3) << 4));
+            const float dl = float(data_a[ib].d) * float(us - 32);
+
+            buf_a[buf_idx    ] = FLOAT_TYPE(dl * float(int8_t((data_a[ib].qs[qsi    ] >> qsshift) & 3) - (((data_a[ib].hmask[hmi    ] & m) != 0) ? 0 : 4)));
+            buf_a[buf_idx + 1] = FLOAT_TYPE(dl * float(int8_t((data_a[ib].qs[qsi + 1] >> qsshift) & 3) - (((data_a[ib].hmask[hmi + 1] & m) != 0) ? 0 : 4)));"""
+
+mulmat_load_q4_K = """
+            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
+            const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A;
+
+            const uint ib = idx / 128;                 // 2 values per idx
+            const uint iqs = idx % 128;                // 0..127
+
+            const uint n = iqs / 32;                   // 0,1,2,3
+            const uint b = (iqs % 32) / 16;            // 0,1
+            const uint is = 2 * n + b;                 // 0..7
+            const uint qsi = n * 32 + (iqs % 16) * 2;  // 0,2,4..126
+
+            const vec2 loadd = vec2(data_a[ib].d);
+
+            uint8_t sc;
+            uint8_t mbyte;
+            if (is < 4) {
+                sc    = uint8_t(data_a[ib].scales[is    ] & 63);
+                mbyte = uint8_t(data_a[ib].scales[is + 4] & 63);
+            } else {
+                sc    = uint8_t((data_a[ib].scales[is + 4] & 0xF) | ((data_a[ib].scales[is - 4] >> 6) << 4));
+                mbyte = uint8_t((data_a[ib].scales[is + 4] >>  4) | ((data_a[ib].scales[is    ] >> 6) << 4));
+            }
+            const float d = loadd.x * sc;
+            const float m = loadd.y * mbyte;
+
+            buf_a[buf_idx    ] = FLOAT_TYPE(d * float((data_a[ib].qs[qsi    ] >> (b * 4)) & 0xF) - m);
+            buf_a[buf_idx + 1] = FLOAT_TYPE(d * float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF) - m);"""
+
+mulmat_load_q5_K = """
+            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
+            const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A;
+
+            const uint ib = idx / 128;                 // 2 values per idx
+            const uint iqs = idx % 128;                // 0..127
+
+            const uint n = iqs / 32;                   // 0,1,2,3
+            const uint b = (iqs % 32) / 16;            // 0,1
+            const uint is = 2 * n + b;                 // 0..7
+            const uint qsi = n * 32 + (iqs % 16) * 2;  // 0,2,4..126
+            const uint qhi = (iqs % 16) * 2;           // 0,2,4..30
+
+            const uint8_t hm = uint8_t(1 << (iqs / 16));
+
+            const vec2 loadd = vec2(data_a[ib].d);
+
+            uint8_t sc;
+            uint8_t mbyte;
+            if (is < 4) {
+                sc    = uint8_t(data_a[ib].scales[is    ] & 63);
+                mbyte = uint8_t(data_a[ib].scales[is + 4] & 63);
+            } else {
+                sc    = uint8_t((data_a[ib].scales[is + 4] & 0xF) | ((data_a[ib].scales[is - 4] >> 6) << 4));
+                mbyte = uint8_t((data_a[ib].scales[is + 4] >>  4) | ((data_a[ib].scales[is    ] >> 6) << 4));
+            }
+            const float d = loadd.x * sc;
+            const float m = loadd.y * mbyte;
+
+            buf_a[buf_idx    ] = FLOAT_TYPE(d * (float((data_a[ib].qs[qsi    ] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi    ] & hm) != 0 ? 16 : 0)) - m);
+            buf_a[buf_idx + 1] = FLOAT_TYPE(d * (float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi + 1] & hm) != 0 ? 16 : 0)) - m);"""
+
+mulmat_load_q6_K = """
+            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
+            const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A;
+
+            const uint ib = idx / 128;                  // 2 values per idx
+            const uint iqs = idx % 128;                 // 0..127
+
+            const uint n = iqs / 64;                    // 0,1
+            const uint b = (iqs % 64) / 32;             // 0,1
+            const uint is_b = (iqs % 16) / 8;           // 0,1
+            const uint qhshift = ((iqs % 64) / 16) * 2; // 0,2,4,6
+            const uint is = 8 * n + qhshift + is_b;     // 0..15
+            const uint qsi = n * 64 + (iqs % 32) * 2;   // 0,2,4..126
+            const uint qhi = n * 32 + (iqs % 16) * 2;   // 0,2,4..62
+
+            const float dscale = float(data_a[ib].d) * float(data_a[ib].scales[is]);
+
+            buf_a[buf_idx    ] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi    ] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi    ] >> qhshift) & 3) << 4)) - 32));
+            buf_a[buf_idx + 1] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi + 1] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi + 1] >> qhshift) & 3) << 4)) - 32));"""
+
 mulmat_body2 = """
        }
        [[unroll]] for (uint l = 0; l < BN; l += loadstride_b) {
@ -1611,8 +1744,9 @@ layout (push_constant) uniform parameter
    uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
    uint d_offset;
    float param1; float param2;
-} p;
+} p;"""

+generic_unary_op_funcs = """
 layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;

 layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
@ -1636,14 +1770,17 @@ uint dst_idx(uint idx) {
    const uint i11 = (idx - i13_offset - i12_offset) / p.ne10;
    const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10;
    return i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + i10*p.nb10;
-}
+}"""

+generic_unary_op_main = """
 void main() {
    if (gl_GlobalInvocationID.x >= p.ne) {
        return;
    }
 """

+generic_unary_op_combined = f"{generic_unary_op_head}\n{generic_unary_op_funcs}\n{generic_unary_op_main}"
+
 generic_binary_op_head = """#version 450

 #extension GL_EXT_shader_16bit_storage : require
@ -1655,13 +1792,14 @@ layout (push_constant) uniform parameter
    uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
    uint ne20; uint ne21; uint ne22; uint ne23; uint nb20; uint nb21; uint nb22; uint nb23;
    uint d_offset;
-    uint param1; uint param2;
-} p;
+    float param1; float param2;
+} p;"""

+generic_binary_op_funcs = """
 layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;

 layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
-layout (binding = 1) readonly buffer B {A_TYPE data_b[];};
+layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
 layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};

 uint src0_idx(uint idx) {
@ -1693,14 +1831,17 @@ uint dst_idx(uint idx) {
    const uint i21 = (idx - i23_offset - i22_offset) / p.ne20;
    const uint i20 = idx - i23_offset - i22_offset - i21*p.ne20;
    return i23*p.nb23 + i22*p.nb22 + i21*p.nb21 + i20*p.nb20;
-}
+}"""

+generic_binary_op_main = """
 void main() {
    if (gl_GlobalInvocationID.x >= p.ne) {
        return;
    }
 """

+generic_binary_op_combined = f"{generic_binary_op_head}\n{generic_binary_op_funcs}\n{generic_binary_op_main}"
+
 # MUL F32
 mul_body = """
    data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) * FLOAT_TYPE(data_b[src1_idx(gl_GlobalInvocationID.x)]));
@ -1745,39 +1886,55 @@ cpy_f16_f16_end = """
 """

 # GET_ROWS
-get_rows_body = """
-#extension GL_EXT_control_flow_attributes : enable
-#extension GL_EXT_shader_8bit_storage : require
-
-layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
-
-layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
-layout (binding = 1) readonly buffer Y {int data_b[];};
-layout (binding = 2) writeonly buffer D {D_TYPE dst[];};
-
+get_rows_float_body = """
 void main() {
-    const uint col = int(gl_GlobalInvocationID.x) * 2;
-    const uint row = int(gl_GlobalInvocationID.y);
+    const uint i00 = gl_GlobalInvocationID.x;
+    const uint i10 = gl_GlobalInvocationID.y;
+    const uint i11 = (gl_GlobalInvocationID.z)/p.ne12;
+    const uint i12 = (gl_GlobalInvocationID.z)%p.ne12;

-    if (col >= p.KY) {
+    if (i00 >= p.ne00) {
        return;
    }

-    const uint r = uint(data_b[row]);
+    const uint i01 = data_b[i10*p.nb10 + i11*p.nb11 + i12*p.nb12];

-    // copy data_a[r*p.KY + col] to dst[row*p.KX + col]
-    const uint xi = r*p.KY + col;
-    const uint di = row*p.KY + col;
+    const uint a_offset = i01*p.nb01 + i11*p.nb02 + i12*p.nb03;
+    const uint d_offset = i10*p.nb21 + i11*p.nb22 + i12*p.nb23;

-    const uint ib = xi/QUANT_K; // block index
-    const uint iqs = (xi%QUANT_K)/QUANT_R; // quant index
-    const uint iybs = di - di%QUANT_K; // y block start index
+#ifndef OPTIMIZATION_ERROR_WORKAROUND
+    data_d[d_offset + i00] = D_TYPE(data_a[a_offset + i00]);
+#else
+    data_d[d_offset + i00] = data_a[a_offset + i00];
+#endif
+}
+"""
+
+get_rows_body = """
+void main() {
+    const uint i00 = (gl_GlobalInvocationID.x)*2;
+    const uint i10 = gl_GlobalInvocationID.y;
+    const uint i11 = (gl_GlobalInvocationID.z)/p.ne12;
+    const uint i12 = (gl_GlobalInvocationID.z)%p.ne12;
+
+    if (i00 >= p.ne00) {
+        return;
+    }
+
+    const uint i01 = data_b[i10*p.nb10 + i11*p.nb11 + i12*p.nb12];
+
+    const uint a_offset = i01*p.nb01 + i11*p.nb02 + i12*p.nb03;
+    const uint d_offset = i10*p.nb21 + i11*p.nb22 + i12*p.nb23;
+
+    const uint ib = a_offset + i00/QUANT_K; // block index
+    const uint iqs = (i00%QUANT_K)/QUANT_R; // quant index
+    const uint iybs = i00 - i00%QUANT_K; // dst block start index
    const uint y_offset = QUANT_R == 1 ? 1 : QUANT_K/2;

    DEQUANT_FUNC

-    dst[iybs + iqs + 0]        = D_TYPE(v.x);
-    dst[iybs + iqs + y_offset] = D_TYPE(v.y);
+    data_d[d_offset + iybs + iqs           ] = D_TYPE(v.x);
+    data_d[d_offset + iybs + iqs + y_offset] = D_TYPE(v.y);
 }
 """

@ -2418,6 +2575,31 @@ async def main():
        tasks.append(string_to_spv("matmul_q8_0_f32", "".join(stream), {"LOAD_VEC_A": 2, "A_TYPE": "block_q8_0", "B_TYPE": "float", "D_TYPE": "float"}, fp16))
        tasks.append(string_to_spv("matmul_q8_0_f32_aligned", "".join(stream), {"LOAD_VEC_A": 2, "LOAD_VEC_B": load_vec, "A_TYPE": "block_q8_0", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))

+        stream.clear()
+        stream.extend((mulmat_head, shader_int8_ext, shader_float_type, shader_q2_K_defines, mulmat_body1, mulmat_load_q2_K, mulmat_body2))
+        tasks.append(string_to_spv("matmul_q2_k_f32", "".join(stream), {"LOAD_VEC_A": 2, "A_TYPE": "block_q2_K", "B_TYPE": "float", "D_TYPE": "float"}, fp16))
+        tasks.append(string_to_spv("matmul_q2_k_f32_aligned", "".join(stream), {"LOAD_VEC_A": 2, "LOAD_VEC_B": load_vec, "A_TYPE": "block_q2_K", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
+
+        stream.clear()
+        stream.extend((mulmat_head, shader_int8_ext, shader_float_type, shader_q3_K_defines, mulmat_body1, mulmat_load_q3_K, mulmat_body2))
+        tasks.append(string_to_spv("matmul_q3_k_f32", "".join(stream), {"LOAD_VEC_A": 2, "A_TYPE": "block_q3_K", "B_TYPE": "float", "D_TYPE": "float"}, fp16))
+        tasks.append(string_to_spv("matmul_q3_k_f32_aligned", "".join(stream), {"LOAD_VEC_A": 2, "LOAD_VEC_B": load_vec, "A_TYPE": "block_q3_K", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
+
+        stream.clear()
+        stream.extend((mulmat_head, shader_int8_ext, shader_float_type, shader_q4_K_defines, mulmat_body1, mulmat_load_q4_K, mulmat_body2))
+        tasks.append(string_to_spv("matmul_q4_k_f32", "".join(stream), {"LOAD_VEC_A": 2, "A_TYPE": "block_q4_K", "B_TYPE": "float", "D_TYPE": "float"}, fp16))
+        tasks.append(string_to_spv("matmul_q4_k_f32_aligned", "".join(stream), {"LOAD_VEC_A": 2, "LOAD_VEC_B": load_vec, "A_TYPE": "block_q4_K", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
+
+        stream.clear()
+        stream.extend((mulmat_head, shader_int8_ext, shader_float_type, shader_q5_K_defines, mulmat_body1, mulmat_load_q5_K, mulmat_body2))
+        tasks.append(string_to_spv("matmul_q5_k_f32", "".join(stream), {"LOAD_VEC_A": 2, "A_TYPE": "block_q5_K", "B_TYPE": "float", "D_TYPE": "float"}, fp16))
+        tasks.append(string_to_spv("matmul_q5_k_f32_aligned", "".join(stream), {"LOAD_VEC_A": 2, "LOAD_VEC_B": load_vec, "A_TYPE": "block_q5_K", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
+
+        stream.clear()
+        stream.extend((mulmat_head, shader_int8_ext, shader_float_type, shader_q6_K_defines, mulmat_body1, mulmat_load_q6_K, mulmat_body2))
+        tasks.append(string_to_spv("matmul_q6_k_f32", "".join(stream), {"LOAD_VEC_A": 2, "A_TYPE": "block_q6_K", "B_TYPE": "float", "D_TYPE": "float"}, fp16))
+        tasks.append(string_to_spv("matmul_q6_k_f32_aligned", "".join(stream), {"LOAD_VEC_A": 2, "LOAD_VEC_B": load_vec, "A_TYPE": "block_q6_K", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
+
    # Shaders where precision is needed, so no fp16 version

    # mul mat vec
@ -2426,7 +2608,7 @@ async def main():
        stream.extend((mul_mat_vec_head, shader_int8_ext, shader_f32))

        if i == GGML_TYPE_F16:
-            stream.extend((shader_f16_defines, shader_f16_dequant_func, mul_mat_vec_body))
+            stream.extend((shader_f16_defines, shader_float_dequant_func, mul_mat_vec_body))
        elif i == GGML_TYPE_Q4_0:
            stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func, mul_mat_vec_body))
        elif i == GGML_TYPE_Q4_1:
@ -2488,25 +2670,32 @@ async def main():
    # get_rows
    for i in range(0, VK_NUM_TYPES):
        stream.clear()
-        stream.extend((generic_head, shader_int8_ext, shader_f32))
+        stream.extend((generic_binary_op_head, shader_int8_ext, shader_f32))
+        optimization_workaround = False

-        if i == GGML_TYPE_F16:
-            stream.extend((shader_f16_defines,  shader_f16_dequant_func,  get_rows_body))
+        if i == GGML_TYPE_F32:
+            stream.extend((shader_f32_defines, generic_binary_op_funcs, get_rows_float_body))
+        elif i == GGML_TYPE_F16:
+            stream.extend((shader_f16_defines, generic_binary_op_funcs, get_rows_float_body))
+            optimization_workaround = True
        elif i == GGML_TYPE_Q4_0:
-            stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func, get_rows_body))
+            stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func, generic_binary_op_funcs, get_rows_body))
        elif i == GGML_TYPE_Q4_1:
-            stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func, get_rows_body))
+            stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func, generic_binary_op_funcs, get_rows_body))
        elif i == GGML_TYPE_Q5_0:
-            stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func, get_rows_body))
+            stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func, generic_binary_op_funcs, get_rows_body))
        elif i == GGML_TYPE_Q5_1:
-            stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func, get_rows_body))
+            stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func, generic_binary_op_funcs, get_rows_body))
        elif i == GGML_TYPE_Q8_0:
-            stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func, get_rows_body))
+            stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func, generic_binary_op_funcs, get_rows_body))
        else:
            continue

-        tasks.append(string_to_spv(f"get_rows_{type_names[i]}", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float16_t"}))
-        tasks.append(string_to_spv(f"get_rows_{type_names[i]}_f32", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float"}))
+        if optimization_workaround:
+            tasks.append(string_to_spv(f"get_rows_{type_names[i]}", "".join(stream), {"B_TYPE": "int", "D_TYPE": "float16_t", "OPTIMIZATION_ERROR_WORKAROUND": "1"}))
+        else:
+            tasks.append(string_to_spv(f"get_rows_{type_names[i]}", "".join(stream), {"B_TYPE": "int", "D_TYPE": "float16_t"}))
+        tasks.append(string_to_spv(f"get_rows_{type_names[i]}_f32", "".join(stream), {"B_TYPE": "int", "D_TYPE": "float"}))

    tasks.append(string_to_spv("mul_mat_vec_p021_f16_f32", mul_mat_p021_src, {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}))
    tasks.append(string_to_spv("mul_mat_vec_nc_f16_f32", mul_mat_nc_src, {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}))
@ -2515,20 +2704,20 @@ async def main():
    tasks.append(string_to_spv("norm_f32", f"{generic_head}\n{shader_f32}\n{norm_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
    tasks.append(string_to_spv("rms_norm_f32", f"{generic_head}\n{shader_f32}\n{rms_norm_body}", {"A_TYPE": "float", "D_TYPE": "float"}))

-    tasks.append(string_to_spv("cpy_f32_f32", f"{generic_unary_op_head}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float"}))
-    tasks.append(string_to_spv("cpy_f32_f16", f"{generic_unary_op_head}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float16_t"}))
-    tasks.append(string_to_spv("cpy_f16_f16", f"{generic_unary_op_head}\n{cpy_f16_f16_end}", {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
+    tasks.append(string_to_spv("cpy_f32_f32", f"{generic_unary_op_combined}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float"}))
+    tasks.append(string_to_spv("cpy_f32_f16", f"{generic_unary_op_combined}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float16_t"}))
+    tasks.append(string_to_spv("cpy_f16_f16", f"{generic_unary_op_combined}\n{cpy_f16_f16_end}", {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))

-    tasks.append(string_to_spv("add_f32", f"{generic_binary_op_head}\n{add_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
+    tasks.append(string_to_spv("add_f32", f"{generic_binary_op_combined}\n{add_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))

    tasks.append(string_to_spv("split_k_reduce", mulmat_split_k_reduce_src, {}))
-    tasks.append(string_to_spv("mul_f32", f"{generic_binary_op_head}\n{mul_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
+    tasks.append(string_to_spv("mul_f32", f"{generic_binary_op_combined}\n{mul_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))

-    tasks.append(string_to_spv("scale_f32", f"{generic_unary_op_head}\n{scale_body}", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
+    tasks.append(string_to_spv("scale_f32", f"{generic_unary_op_combined}\n{scale_body}", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))

-    tasks.append(string_to_spv("sqr_f32", f"{generic_unary_op_head}\n{sqr_body}", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
+    tasks.append(string_to_spv("sqr_f32", f"{generic_unary_op_combined}\n{sqr_body}", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))

-    tasks.append(string_to_spv("clamp_f32", f"{generic_unary_op_head}\n{clamp_body}", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
+    tasks.append(string_to_spv("clamp_f32", f"{generic_unary_op_combined}\n{clamp_body}", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))

    tasks.append(string_to_spv("gelu_f32", f"{generic_head}\n{shader_f32}\n{gelu_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
    tasks.append(string_to_spv("silu_f32", f"{generic_head}\n{shader_f32}\n{silu_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@ -123,6 +123,7 @@ class MODEL_ARCH(IntEnum):
    GEMMA      = auto()
    STARCODER2 = auto()
    MAMBA      = auto()
+    XVERSE     = auto()
    COMMAND_R  = auto()


@ -191,6 +192,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    MODEL_ARCH.GEMMA:          "gemma",
    MODEL_ARCH.STARCODER2:     "starcoder2",
    MODEL_ARCH.MAMBA:          "mamba",
+    MODEL_ARCH.XVERSE:         "xverse",
    MODEL_ARCH.COMMAND_R:      "command-r",
 }

@ -606,6 +608,22 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.SSM_D,
        MODEL_TENSOR.SSM_OUT,
    ],
+    MODEL_ARCH.XVERSE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
    MODEL_ARCH.COMMAND_R: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
@ -650,6 +668,10 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
+    MODEL_ARCH.XVERSE: [
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+    ],
 }

 #
--- a/llama.cpp
+++ b/llama.cpp
@ -218,6 +218,7 @@ enum llm_arch {
    LLM_ARCH_GEMMA,
    LLM_ARCH_STARCODER2,
    LLM_ARCH_MAMBA,
+    LLM_ARCH_XVERSE,
    LLM_ARCH_COMMAND_R,
    LLM_ARCH_UNKNOWN,
 };
@ -249,6 +250,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_GEMMA,           "gemma"      },
    { LLM_ARCH_STARCODER2,      "starcoder2" },
    { LLM_ARCH_MAMBA,           "mamba"      },
+    { LLM_ARCH_XVERSE,          "xverse"     },
    { LLM_ARCH_COMMAND_R,       "command-r"  },
    { LLM_ARCH_UNKNOWN,         "(unknown)"  },
 };
@ -878,6 +880,25 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
            { LLM_TENSOR_SSM_OUT,         "blk.%d.ssm_out" },
        },
    },
+    {
+        LLM_ARCH_XVERSE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
    {
        LLM_ARCH_COMMAND_R,
        {
@ -2100,10 +2121,6 @@ struct llama_context {
            ggml_backend_free(backend);
        }

-#ifdef GGML_USE_VULKAN
-        ggml_vk_free_cpu_assist();
-#endif
-
        ggml_backend_buffer_free(buf_output);
    }

@ -3847,6 +3864,16 @@ static void llm_load_hparams(
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
+        case LLM_ARCH_XVERSE:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_7B; break;
+                    case 40: model.type = e_model::MODEL_13B; break;
+                    case 80: model.type = e_model::MODEL_65B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
        case LLM_ARCH_COMMAND_R:
            {
                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
@ -5200,6 +5227,28 @@ static bool llm_load_tensors(
                        layer.ssm_out = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd});
                    }
                } break;
+            case LLM_ARCH_XVERSE:
+                {
+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    {
+                        model.output_norm = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                        model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
+                    }
+                    for (int i = 0; i < n_layer; ++i) {
+                        ggml_context * ctx_layer = ctx_for_layer(i);
+                        ggml_context * ctx_split = ctx_for_layer_split(i);
+                        auto & layer = model.layers[i];
+                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
+                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
+                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
+                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
+                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
+                    }
+                } break;
            case LLM_ARCH_COMMAND_R:
                {
                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@ -5238,7 +5287,7 @@ static bool llm_load_tensors(

    ml.done_getting_tensors();

-    ml.init_mappings(true, &model.mlock_mmaps);
+    ml.init_mappings(true, use_mlock ? &model.mlock_mmaps : nullptr);
    model.mappings.reserve(ml.mappings.size());

    // create the backend buffers
@ -5523,8 +5572,8 @@ static void llm_build_kv_store(
    GGML_ASSERT(kv.size == n_ctx);

    // compute the transposed [n_tokens, n_embd] V matrix
-    struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens));
-    //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed
+    assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
+    struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur);
    cb(v_cur_t, "v_cur_t", il);

    struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
@ -6411,6 +6460,111 @@ struct llm_build_context {
        return gf;
    }

+    struct ggml_cgraph * build_xverse() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        // positions of the tokens in the KV cache
+        struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = ggml_get_rows(ctx0,      cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up,   NULL,
+                        model.layers[il].ffn_gate, NULL,
+                        model.layers[il].ffn_down, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
    struct ggml_cgraph * build_falcon() {
        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

@ -9389,6 +9543,10 @@ static struct ggml_cgraph * llama_build_graph(
            {
                result = llm.build_mamba();
            } break;
+        case LLM_ARCH_XVERSE:
+            {
+                result = llm.build_xverse();
+            } break;
        case LLM_ARCH_COMMAND_R:
            {
                result = llm.build_command_r();
@ -13969,7 +14127,20 @@ struct llama_context * llama_new_context_with_model(
            }
        }
 #elif defined(GGML_USE_VULKAN)
-        if (model->n_gpu_layers > 0) {
+        if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+            LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);
+            llama_free(ctx);
+            return nullptr;
+        }
+        if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
+            ggml_backend_t backend = ggml_backend_vk_init(0);
+            if (backend == nullptr) {
+                LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
+                llama_free(ctx);
+                return nullptr;
+            }
+            ctx->backends.push_back(backend);
+        } else {
            for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
                ggml_backend_t backend = ggml_backend_vk_init(device);
                if (backend == nullptr) {
@ -14188,6 +14359,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
        case LLM_ARCH_ORION:
        case LLM_ARCH_INTERNLM2:
        case LLM_ARCH_MINICPM:
+        case LLM_ARCH_XVERSE:
        case LLM_ARCH_COMMAND_R:
            return LLAMA_ROPE_TYPE_NORM;

--- a/scripts/compare-llama-bench.py
+++ b/scripts/compare-llama-bench.py
@ -178,6 +178,9 @@ def get_commit_hexsha8(name):
    for t in repo.tags:
        if t.name == name:
            return t.commit.hexsha[:8]
+    for c in repo.iter_commits("--all"):
+        if c.hexsha[:8] == name[:8]:
+            return c.hexsha[:8]
    return None


@ -224,7 +227,7 @@ if known_args.compare is not None:
        hexsha8_compare = get_commit_hexsha8(known_args.compare)
        name_compare = known_args.compare
    if hexsha8_compare is None:
-        print(f"ERROR: cannot find data for baseline={known_args.compare}.")
+        print(f"ERROR: cannot find data for compare={known_args.compare}.")
        sys.exit(1)
 # Otherwise, search for the commit for llama-bench was most recently run
 # and that is not a parent of master:
--- a/scripts/sync-ggml-am.sh
+++ b/scripts/sync-ggml-am.sh
@ -95,6 +95,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
    # src/ggml-backend-impl.h     -> ggml-backend-impl.h
    # src/ggml-backend.c          -> ggml-backend.c
    # src/ggml-common.h           -> ggml-common.h
+    # src/ggml-cuda/*             -> ggml-cuda/
    # src/ggml-cuda.cu            -> ggml-cuda.cu
    # src/ggml-cuda.h             -> ggml-cuda.h
    # src/ggml-impl.h             -> ggml-impl.h
@ -128,6 +129,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
        -e 's/src\/ggml-backend-impl\.h/ggml-backend-impl.h/g' \
        -e 's/src\/ggml-backend\.c/ggml-backend.c/g' \
        -e 's/src\/ggml-common\.h/ggml-common.h/g' \
+        -e 's/src\/ggml-cuda\//ggml-cuda\//g' \
        -e 's/src\/ggml-cuda\.cu/ggml-cuda.cu/g' \
        -e 's/src\/ggml-cuda\.h/ggml-cuda.h/g' \
        -e 's/src\/ggml-impl\.h/ggml-impl.h/g' \
--- a/scripts/sync-ggml.sh
+++ b/scripts/sync-ggml.sh
@ -5,6 +5,7 @@ cp -rpv ../ggml/src/ggml-alloc.c            ./ggml-alloc.c
 cp -rpv ../ggml/src/ggml-backend-impl.h     ./ggml-backend-impl.h
 cp -rpv ../ggml/src/ggml-backend.c          ./ggml-backend.c
 cp -rpv ../ggml/src/ggml-common.h           ./ggml-common.h
+cp -rpv ../ggml/src/ggml-cuda/*             ./ggml-cuda/
 cp -rpv ../ggml/src/ggml-cuda.cu            ./ggml-cuda.cu
 cp -rpv ../ggml/src/ggml-cuda.h             ./ggml-cuda.h
 cp -rpv ../ggml/src/ggml-impl.h             ./ggml-impl.h