Merge upstream changes, fix conflicts

Commit 93cdea1d7b by 0cc4m, 2024-02-25 17:35:05 +01:00
132 changed files with 13710 additions and 4764 deletions

.devops/nix/docker.nix (new file)

@@ -0,0 +1,37 @@
{
lib,
dockerTools,
buildEnv,
llama-cpp,
interactive ? true,
coreutils,
}:
# A tar that can be fed into `docker load`:
#
# $ nix build .#llamaPackages.docker
# $ docker load < result
# For details and variations cf.
# - https://nixos.org/manual/nixpkgs/unstable/#ssec-pkgs-dockerTools-buildLayeredImage
# - https://discourse.nixos.org/t/a-faster-dockertools-buildimage-prototype/16922
# - https://nixery.dev/
# Approximate (compressed) sizes, at the time of writing, are:
#
# .#llamaPackages.docker: 125M;
# .#llamaPackagesCuda.docker: 537M;
# .#legacyPackages.aarch64-linux.llamaPackagesXavier.docker: 415M.
dockerTools.buildLayeredImage {
name = llama-cpp.pname;
tag = "latest";
contents =
[ llama-cpp ]
++ lib.optionals interactive [
coreutils
dockerTools.binSh
dockerTools.caCertificates
];
}
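
A usage sketch for the resulting image (the nix/docker-load commands come from the comments above; the run line is an assumption, since the image is tagged llama-cpp:latest per name/tag here but no entrypoint is configured):

    nix build .#llamaPackages.docker
    docker load < result
    docker run --rm -it llama-cpp:latest sh   # the interactive variant ships binSh and coreutils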

.devops/nix/package.nix

@@ -255,11 +255,11 @@ effectiveStdenv.mkDerivation (
     # Configurations we don't want even the CI to evaluate. Results in the
     # "unsupported platform" messages. This is mostly a no-op, because
     # cudaPackages would've refused to evaluate anyway.
-    badPlatforms = optionals (useCuda || useOpenCL || useVulkan) lib.platforms.darwin;
+    badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin;
     # Configurations that are known to result in build failures. Can be
     # overridden by importing Nixpkgs with `allowBroken = true`.
-    broken = (useMetalKit && !effectiveStdenv.isDarwin) || (useVulkan && effectiveStdenv.isDarwin);
+    broken = (useMetalKit && !effectiveStdenv.isDarwin);
     description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
     homepage = "https://github.com/ggerganov/llama.cpp/";

.devops/nix/scope.nix

@@ -12,5 +12,8 @@ lib.makeScope newScope (
   self: {
     inherit llamaVersion;
     llama-cpp = self.callPackage ./package.nix { };
+    docker = self.callPackage ./docker.nix { };
+    docker-min = self.callPackage ./docker.nix { interactive = false; };
+    sif = self.callPackage ./sif.nix { };
   }
 )
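
With these attributes in scope, each container flavour becomes a one-line build (a sketch; the attribute paths assume the flake exposes this scope as llamaPackages, as the docker.nix comments suggest):

    nix build .#llamaPackages.docker       # interactive image
    nix build .#llamaPackages.docker-min   # minimal image, no shell utilities
    nix build .#llamaPackages.sif          # Singularity/Apptainer image (sif.nix below)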

.devops/nix/sif.nix (new file)

@@ -0,0 +1,27 @@
{
lib,
singularity-tools,
llama-cpp,
bashInteractive,
interactive ? false,
}:
let
optionalInt = cond: x: if cond then x else 0;
in
singularity-tools.buildImage rec {
inherit (llama-cpp) name;
contents = [ llama-cpp ] ++ lib.optionals interactive [ bashInteractive ];
# These are excessive (but safe) for most variants. Building singularity
# images requires superuser privileges, so we build them inside a VM in a
# writable image of pre-determined size.
#
# ROCm is currently affected by https://github.com/NixOS/nixpkgs/issues/276846
#
# Expected image sizes:
# - cpu/blas: 150M,
# - cuda, all gencodes: 560M,
diskSize = 4096 + optionalInt llama-cpp.useRocm 16384;
memSize = diskSize;
}
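
A build-and-run sketch for the SIF image (the apptainer invocation is an assumption; only the nix attribute comes from scope.nix above):

    nix build .#llamaPackages.sif
    apptainer exec ./result <command>   # pass interactive = true to get bashInteractive inside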

.flake8

@@ -1,2 +1,3 @@
 [flake8]
 max-line-length = 125
+ignore = W503

.github/ISSUE_TEMPLATE/bug.md

@@ -7,3 +7,5 @@ assignees: ''
 ---
 
 Please include information about your system, the steps to reproduce the bug, and the version of llama.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug.
+
+If the bug concerns the server, please try to reproduce it first using the [server test scenario framework](https://github.com/ggerganov/llama.cpp/tree/master/examples/server/tests).

.github/workflows/build.yml

@@ -37,6 +37,8 @@ jobs:
       - name: Build
         id: make_build
+        env:
+          LLAMA_FATAL_WARNINGS: 1
         run: |
           CC=gcc-8 make -j $(nproc)
@@ -65,7 +67,7 @@ jobs:
         run: |
           mkdir build
           cd build
-          cmake ..
+          cmake .. -DLLAMA_FATAL_WARNINGS=ON
           cmake --build . --config Release -j $(nproc)
 
       - name: Test
@@ -100,7 +102,7 @@ jobs:
         run: |
           mkdir build
           cd build
-          cmake .. -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
           cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
 
       - name: Test
@@ -244,6 +246,8 @@ jobs:
       - name: Build
         id: make_build
+        env:
+          LLAMA_FATAL_WARNINGS: 1
         run: |
           LLAMA_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu)
@@ -277,7 +281,7 @@ jobs:
           sysctl -a
           mkdir build
           cd build
-          cmake -DLLAMA_METAL=OFF ..
+          cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF ..
           cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
 
       - name: Test
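
The warnings-as-errors builds can be reproduced locally with the same switches the workflow uses:

    LLAMA_FATAL_WARNINGS=1 make -j $(nproc)
    # or with CMake:
    cmake .. -DLLAMA_FATAL_WARNINGS=ON && cmake --build . --config Release -j $(nproc)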

.github/workflows/nix-ci-aarch64.yml

@@ -19,7 +19,6 @@ on:
 jobs:
   nix-build-aarch64:
-    if: ${{ vars.CACHIX_NAME != '' }}
     runs-on: ubuntu-latest
     steps:
       - name: Checkout repository
@@ -37,8 +36,8 @@ jobs:
           extra-conf: |
             extra-platforms = aarch64-linux
             extra-system-features = nixos-test kvm
-            extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
-            extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
+            extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
+            extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
       - uses: DeterminateSystems/magic-nix-cache-action@v2
         with:
           upstream-cache: https://${{ matrix.cachixName }}.cachix.org
@@ -46,7 +45,7 @@ jobs:
         uses: cachix/cachix-action@v13
         with:
           authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
-          name: ${{ vars.CACHIX_NAME }}
+          name: llama-cpp
       - name: Show all output paths
         run: >
           nix run github:nix-community/nix-eval-jobs

.github/workflows/nix-ci.yml

@@ -23,8 +23,8 @@ jobs:
         with:
           github-token: ${{ secrets.GITHUB_TOKEN }}
           extra-conf: |
-            extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
-            extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
+            extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
+            extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
       - uses: DeterminateSystems/magic-nix-cache-action@v2
         with:
           upstream-cache: https://${{ matrix.cachixName }}.cachix.org
@@ -37,7 +37,6 @@ jobs:
           --flake
           ".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)"
   nix-build:
-    if: ${{ vars.CACHIX_NAME != '' }}
     strategy:
       fail-fast: false
       matrix:
@@ -51,8 +50,8 @@ jobs:
         with:
           github-token: ${{ secrets.GITHUB_TOKEN }}
           extra-conf: |
-            extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
-            extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
+            extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
+            extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
       - uses: DeterminateSystems/magic-nix-cache-action@v2
         with:
           upstream-cache: https://${{ matrix.cachixName }}.cachix.org
@@ -60,7 +59,7 @@ jobs:
         uses: cachix/cachix-action@v13
         with:
           authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
-          name: ${{ vars.CACHIX_NAME }}
+          name: llama-cpp
       - name: Build
         run: >
           nix run github:Mic92/nix-fast-build
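
Because the substituter and its public key are now hardcoded instead of coming from repository variables, local builds can reuse the CI cache too (values copied verbatim from the workflow above):

    nix build . \
      --extra-substituters https://llama-cpp.cachix.org \
      --extra-trusted-public-keys llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc=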

.github/workflows/python-lint.yml

@@ -16,5 +16,5 @@ jobs:
       - name: flake8 Lint
         uses: py-actions/flake8@v2
         with:
-          ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704"
+          ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503"
           exclude: "examples/*,examples/*/**,*/**/__init__.py"

.github/workflows/server.yml (new file)

@@ -0,0 +1,127 @@
# Server build and tests
name: Server
on:
workflow_dispatch: # allows manual triggering
push:
branches:
- master
- test/server-add-ci-test # FIXME remove
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
pull_request:
types: [opened, synchronize, reopened]
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
jobs:
server:
runs-on: ubuntu-latest
strategy:
matrix:
build: [noavx, avx2, avx, avx512, cublas, clblast, openblas, kompute, vulkan]
sanitizer: [ADDRESS, THREAD, UNDEFINED]
build_type: [Debug, Release]
include:
- build: 'noavx'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF'
image: ubuntu:latest
- build: 'avx2'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
image: ubuntu:latest
- build: 'avx'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF'
image: ubuntu:latest
- build: 'avx512'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON'
image: ubuntu:latest
experimental: true
- build: 'cublas'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON'
image: nvidia/cuda:12.3.1-devel-ubuntu22.04
arch_not_available: true # require nvidia docker engine
- build: 'clblast'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON'
image: ubuntu:latest
arch_not_available: true
- build: 'openblas'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS'
image: ubuntu:latest
- build: 'kompute'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON'
image: ubuntu:latest
arch_not_available: true
- build: 'vulkan'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON'
image: ubuntu:latest
arch_not_available: true
container:
image: ${{ matrix.image }}
ports:
- 8888
options: --cpus 4
steps:
- name: Clone
id: checkout
uses: actions/checkout@v3
- name: Dependencies
id: depends
run: |
apt-get update
apt-get -y install \
build-essential \
pkg-config \
git \
cmake \
python3-pip \
wget \
psmisc
- name: Download CLBlast
id: get_clblast
if: ${{ matrix.build == 'clblast' }}
run: |
apt install -y libclblast-dev
- name: Download OpenBLAS
id: get_openblas
if: ${{ matrix.build == 'openblas' }}
run: |
apt-get -y install libopenblas-dev
- name: Install Vulkan SDK
id: get_vulkan
if: ${{ matrix.build == 'kompute' || matrix.build == 'vulkan' }}
run: |
wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | tee /etc/apt/trusted.gpg.d/lunarg.asc
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list http://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
apt-get update
apt-get -y install vulkan-sdk
- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake .. -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ${{ matrix.defines }}
cmake --build . --config ${{ matrix.build_type }} -j $(nproc) --target server
- name: Tests dependencies
id: test_dependencies
run: |
pip install -r examples/server/tests/requirements.txt
- name: Download models
id: download_models
run: |
cd examples/server/tests
../../../scripts/hf.sh --repo ggml-org/models --file tinyllamas/stories260K.gguf
- name: Tests
id: server_integration_test
continue-on-error: ${{ matrix.experimental || matrix.arch_not_available }}
run: |
cd examples/server/tests
PORT=8888 ./tests.sh
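
The suite can also be run outside CI; a local sketch condensed from the job steps above (paths as in the workflow):

    mkdir build && cd build
    cmake .. -DLLAMA_BUILD_SERVER=ON
    cmake --build . --config Release -j $(nproc) --target server
    cd ../examples/server/tests
    pip install -r requirements.txt
    PORT=8888 ./tests.sh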

.gitignore

@@ -23,11 +23,13 @@
 .clang-tidy
 .vs/
 .vscode/
+.idea/
 
 lcov-report/
 gcovr-report/
 
 build*
+cmake-build-*
 out/
 tmp/

CMakeLists.txt

@@ -55,6 +55,9 @@ option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings"
 option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
 option(LLAMA_GPROF "llama: enable gprof" OFF)
 
+# build
+option(LLAMA_FATAL_WARNINGS "llama: enable -Werror flag" OFF)
+
 # sanitizers
 option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF)
 option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF)
@@ -107,22 +110,20 @@ option(LLAMA_VULKAN_RUN_TESTS "llama: run Vulkan tests" OFF)
 option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
 option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
 option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF)
+option(LLAMA_METAL_EMBED_LIBRARY "llama: embed Metal library" OFF)
 option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
 option(LLAMA_MPI "llama: use MPI" OFF)
 option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
 option(LLAMA_SYCL "llama: use SYCL" OFF)
 option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
+option(LLAMA_CPU_HBM "llama: use memkind for CPU HBM" OFF)
 option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER "llama: build server example" ON)
 
 # add perf arguments
 option(LLAMA_PERF "llama: enable perf" OFF)
-if (LLAMA_PERF)
-    add_definitions(-DGGML_PERF)
-endif()
 
 # Required for relocatable CMake package
 include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
@@ -130,6 +131,7 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
 #
 # Compile flags
 #
+
 if (LLAMA_SYCL)
     set(CMAKE_CXX_STANDARD 17)
 else()
@@ -140,6 +142,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED true)
 set(CMAKE_C_STANDARD 11)
 set(CMAKE_C_STANDARD_REQUIRED true)
 set(THREADS_PREFER_PTHREAD_FLAG ON)
+
 find_package(Threads REQUIRED)
 include(CheckCXXCompilerFlag)
@@ -151,17 +154,17 @@ endif()
 if (NOT MSVC)
     if (LLAMA_SANITIZE_THREAD)
         add_compile_options(-fsanitize=thread)
-        link_libraries(-fsanitize=thread)
+        link_libraries (-fsanitize=thread)
     endif()
 
     if (LLAMA_SANITIZE_ADDRESS)
         add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
-        link_libraries(-fsanitize=address)
+        link_libraries (-fsanitize=address)
     endif()
 
     if (LLAMA_SANITIZE_UNDEFINED)
         add_compile_options(-fsanitize=undefined)
-        link_libraries(-fsanitize=undefined)
+        link_libraries (-fsanitize=undefined)
     endif()
 endif()
@@ -199,6 +202,29 @@ if (LLAMA_METAL)
     # copy ggml-metal.metal to bin directory
     configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
 
+    if (LLAMA_METAL_EMBED_LIBRARY)
+        enable_language(ASM)
+        add_compile_definitions(GGML_METAL_EMBED_LIBRARY)
+
+        set(METALLIB_SOURCE "${CMAKE_SOURCE_DIR}/ggml-metal.metal")
+        file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")
+        set(EMBED_METALLIB_ASSEMBLY "${CMAKE_BINARY_DIR}/autogenerated/ggml-embed-metallib.s")
+
+        add_custom_command(
+            OUTPUT ${EMBED_METALLIB_ASSEMBLY}
+            COMMAND echo ".section __DATA,__ggml_metallib" > ${EMBED_METALLIB_ASSEMBLY}
+            COMMAND echo ".globl _ggml_metallib_start" >> ${EMBED_METALLIB_ASSEMBLY}
+            COMMAND echo "_ggml_metallib_start:" >> ${EMBED_METALLIB_ASSEMBLY}
+            COMMAND echo ".incbin \\\"${METALLIB_SOURCE}\\\"" >> ${EMBED_METALLIB_ASSEMBLY}
+            COMMAND echo ".globl _ggml_metallib_end" >> ${EMBED_METALLIB_ASSEMBLY}
+            COMMAND echo "_ggml_metallib_end:" >> ${EMBED_METALLIB_ASSEMBLY}
+            DEPENDS ${METALLIB_SOURCE}
+            COMMENT "Generate assembly for embedded Metal library"
+        )
+
+        set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${EMBED_METALLIB_ASSEMBLY})
+    endif()
+
     if (LLAMA_METAL_SHADER_DEBUG)
         # custom command to do the following:
         #   xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
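
LLAMA_METAL_EMBED_LIBRARY assembles the Metal shader source into a __DATA section of the binary, so the runtime no longer has to locate ggml-metal.metal on disk. A configure sketch (the Makefile hunk further down adds the equivalent LLAMA_METAL_EMBED_LIBRARY=1 switch):

    cmake .. -DLLAMA_METAL=ON -DLLAMA_METAL_EMBED_LIBRARY=ON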
@@ -298,14 +324,17 @@ if (LLAMA_BLAS)
         endif()
 
         message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
+
         add_compile_options(${BLAS_LINKER_FLAGS})
+
         add_compile_definitions(GGML_USE_OPENBLAS)
+
         if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel"))
             add_compile_definitions(GGML_BLAS_USE_MKL)
         endif()
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES})
-        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
+
+        set(LLAMA_EXTRA_LIBS     ${LLAMA_EXTRA_LIBS}     ${BLAS_LIBRARIES})
+        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
     else()
         message(WARNING "BLAS not found, please refer to "
         "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
@@ -330,9 +359,6 @@ if (LLAMA_CUBLAS)
         set(GGML_SOURCES_CUDA ggml-cuda.cu)
 
         add_compile_definitions(GGML_USE_CUBLAS)
-#        if (LLAMA_CUDA_CUBLAS)
-#            add_compile_definitions(GGML_CUDA_CUBLAS)
-#        endif()
         if (LLAMA_CUDA_FORCE_DMMV)
             add_compile_definitions(GGML_CUDA_FORCE_DMMV)
         endif()
@@ -387,15 +413,20 @@ if (LLAMA_MPI)
     find_package(MPI)
     if (MPI_C_FOUND)
         message(STATUS "MPI found")
+
         set(GGML_HEADERS_MPI ggml-mpi.h)
-        set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h)
+        set(GGML_SOURCES_MPI ggml-mpi.c)
+
         add_compile_definitions(GGML_USE_MPI)
         add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
+
         if (NOT MSVC)
             add_compile_options(-Wno-cast-qual)
         endif()
+
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_C_LIBRARIES})
         set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS})
+
         # Even if you're only using the C header, C++ programs may bring in MPI
         # C++ functions, so more linkage is needed
         if (MPI_CXX_FOUND)
@@ -427,31 +458,28 @@ if (LLAMA_VULKAN)
     if (Vulkan_FOUND)
         message(STATUS "Vulkan found")
 
-        add_library(ggml-vulkan OBJECT ggml-vulkan.cpp ggml-vulkan.h)
-        if (BUILD_SHARED_LIBS)
-            set_target_properties(ggml-vulkan PROPERTIES POSITION_INDEPENDENT_CODE ON)
-        endif()
-        target_link_libraries(ggml-vulkan PRIVATE Vulkan::Vulkan)
+        set(GGML_HEADERS_VULKAN ggml-vulkan.h)
+        set(GGML_SOURCES_VULKAN ggml-vulkan.cpp)
 
         add_compile_definitions(GGML_USE_VULKAN)
 
         if (LLAMA_VULKAN_CHECK_RESULTS)
-            target_compile_definitions(ggml-vulkan PRIVATE GGML_VULKAN_CHECK_RESULTS)
+            add_compile_definitions(GGML_VULKAN_CHECK_RESULTS)
         endif()
 
         if (LLAMA_VULKAN_DEBUG)
-            target_compile_definitions(ggml-vulkan PRIVATE GGML_VULKAN_DEBUG)
+            add_compile_definitions(GGML_VULKAN_DEBUG)
         endif()
 
         if (LLAMA_VULKAN_VALIDATE)
-            target_compile_definitions(ggml-vulkan PRIVATE GGML_VULKAN_VALIDATE)
+            add_compile_definitions(GGML_VULKAN_VALIDATE)
         endif()
 
         if (LLAMA_VULKAN_RUN_TESTS)
-            target_compile_definitions(ggml-vulkan PRIVATE GGML_VULKAN_RUN_TESTS)
+            add_compile_definitions(GGML_VULKAN_RUN_TESTS)
         endif()
 
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ggml-vulkan)
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} Vulkan::Vulkan)
     else()
         message(WARNING "Vulkan not found")
     endif()
@@ -463,43 +491,45 @@ if (LLAMA_HIPBLAS)
     if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
         message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang")
     endif()
 
     if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
         message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
     endif()
 
-    find_package(hip)
-    find_package(hipblas)
-    find_package(rocblas)
+    find_package(hip     REQUIRED)
+    find_package(hipblas REQUIRED)
+    find_package(rocblas REQUIRED)
 
-    if (${hipblas_FOUND} AND ${hip_FOUND})
-        message(STATUS "HIP and hipBLAS found")
-        add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
-        if (LLAMA_HIP_UMA)
-            add_compile_definitions(GGML_HIP_UMA)
-        endif()
-        add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
-        if (BUILD_SHARED_LIBS)
-            set_target_properties(ggml-rocm PROPERTIES POSITION_INDEPENDENT_CODE ON)
-        endif()
-        if (LLAMA_CUDA_FORCE_DMMV)
-            target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV)
-        endif()
-        if (LLAMA_CUDA_FORCE_MMQ)
-            target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_MMQ)
-        endif()
-        target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
-        target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
-        target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
-        set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
-        target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
+    message(STATUS "HIP and hipBLAS found")
 
-        if (LLAMA_STATIC)
-            message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
-        endif()
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ggml-rocm)
-    else()
-        message(WARNING "hipBLAS or HIP not found. Try setting CMAKE_PREFIX_PATH=/opt/rocm")
+    set(GGML_HEADERS_ROCM ggml-cuda.h)
+    set(GGML_SOURCES_ROCM ggml-cuda.cu)
+
+    add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
+
+    if (LLAMA_HIP_UMA)
+        add_compile_definitions(GGML_HIP_UMA)
     endif()
+
+    if (LLAMA_CUDA_FORCE_DMMV)
+        add_compile_definitions(GGML_CUDA_FORCE_DMMV)
+    endif()
+
+    if (LLAMA_CUDA_FORCE_MMQ)
+        add_compile_definitions(GGML_CUDA_FORCE_MMQ)
+    endif()
+
+    add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
+    add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
+    add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
+
+    set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
+
+    if (LLAMA_STATIC)
+        message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
+    endif()
+
+    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
 endif()
 
 if (LLAMA_SYCL)
@@ -509,10 +539,14 @@ if (LLAMA_SYCL)
     #todo: AOT
 
     find_package(IntelSYCL REQUIRED)
+
+    message(STATUS "SYCL found")
+
+    add_compile_definitions(GGML_USE_SYCL)
+
     if (LLAMA_SYCL_F16)
         add_compile_definitions(GGML_SYCL_F16)
     endif()
-    add_compile_definitions(GGML_USE_SYCL)
 
     add_compile_options(-I./) #include DPCT
     add_compile_options(-I/${SYCL_INCLUDE_DIR})
@@ -521,7 +555,7 @@ if (LLAMA_SYCL)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
 
-    set(GGML_HEADERS_SYCL ggml.h ggml-sycl.h)
+    set(GGML_HEADERS_SYCL ggml-sycl.h)
     set(GGML_SOURCES_SYCL ggml-sycl.cpp)
 
     if (WIN32)
@@ -540,61 +574,61 @@ if (LLAMA_KOMPUTE)
     endif()
 
     function(compile_shader)
         set(options)
         set(oneValueArgs)
         set(multiValueArgs SOURCES)
         cmake_parse_arguments(compile_shader "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
         foreach(source ${compile_shader_SOURCES})
             get_filename_component(filename ${source} NAME)
             set(spv_file ${filename}.spv)
             add_custom_command(
                 OUTPUT ${spv_file}
                 DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source}
                     ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/common.comp
                     ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_getrows.comp
                     ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n_pre.comp
                     ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n.comp
                 COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source}
                 COMMENT "Compiling ${source} to ${spv_file}"
             )
             get_filename_component(RAW_FILE_NAME ${spv_file} NAME)
             set(FILE_NAME "shader${RAW_FILE_NAME}")
             string(REPLACE ".comp.spv" ".h" HEADER_FILE ${FILE_NAME})
             string(TOUPPER ${HEADER_FILE} HEADER_FILE_DEFINE)
             string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}")
             set(OUTPUT_HEADER_FILE "${HEADER_FILE}")
             message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}")
             if(CMAKE_GENERATOR MATCHES "Visual Studio")
                 add_custom_command(
                     OUTPUT ${OUTPUT_HEADER_FILE}
                     COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
                     COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
                     COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
                     COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
                     COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
                     COMMAND ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
                     COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
                     COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
                     DEPENDS ${spv_file} xxd
                     COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd"
                 )
             else()
                 add_custom_command(
                     OUTPUT ${OUTPUT_HEADER_FILE}
                     COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
                     COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
                     COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
                     COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
                     COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
                     COMMAND ${CMAKE_BINARY_DIR}/bin/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
                     COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
                     COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
                     DEPENDS ${spv_file} xxd
                     COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/xxd"
                 )
             endif()
         endforeach()
     endfunction()
 
     if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt")
@@ -604,66 +638,66 @@ if (LLAMA_KOMPUTE)
         # Compile our shaders
         compile_shader(SOURCES
             kompute-shaders/op_scale.comp
             kompute-shaders/op_scale_8.comp
             kompute-shaders/op_add.comp
             kompute-shaders/op_addrow.comp
             kompute-shaders/op_mul.comp
             kompute-shaders/op_silu.comp
             kompute-shaders/op_relu.comp
             kompute-shaders/op_gelu.comp
             kompute-shaders/op_softmax.comp
             kompute-shaders/op_norm.comp
             kompute-shaders/op_rmsnorm.comp
             kompute-shaders/op_diagmask.comp
             kompute-shaders/op_mul_mat_mat_f32.comp
             kompute-shaders/op_mul_mat_f16.comp
             kompute-shaders/op_mul_mat_q8_0.comp
             kompute-shaders/op_mul_mat_q4_0.comp
             kompute-shaders/op_mul_mat_q4_1.comp
             kompute-shaders/op_mul_mat_q6_k.comp
             kompute-shaders/op_getrows_f16.comp
             kompute-shaders/op_getrows_q4_0.comp
             kompute-shaders/op_getrows_q4_1.comp
             kompute-shaders/op_getrows_q6_k.comp
             kompute-shaders/op_rope_f16.comp
             kompute-shaders/op_rope_f32.comp
             kompute-shaders/op_cpy_f16_f16.comp
             kompute-shaders/op_cpy_f16_f32.comp
             kompute-shaders/op_cpy_f32_f16.comp
             kompute-shaders/op_cpy_f32_f32.comp
         )
 
         # Create a custom target for our generated shaders
         add_custom_target(generated_shaders DEPENDS
             shaderop_scale.h
             shaderop_scale_8.h
             shaderop_add.h
             shaderop_addrow.h
             shaderop_mul.h
             shaderop_silu.h
             shaderop_relu.h
             shaderop_gelu.h
             shaderop_softmax.h
             shaderop_norm.h
             shaderop_rmsnorm.h
             shaderop_diagmask.h
             shaderop_mul_mat_mat_f32.h
             shaderop_mul_mat_f16.h
             shaderop_mul_mat_q8_0.h
             shaderop_mul_mat_q4_0.h
             shaderop_mul_mat_q4_1.h
             shaderop_mul_mat_q6_k.h
             shaderop_getrows_f16.h
             shaderop_getrows_q4_0.h
             shaderop_getrows_q4_1.h
             shaderop_getrows_q6_k.h
             shaderop_rope_f16.h
             shaderop_rope_f32.h
             shaderop_cpy_f16_f16.h
             shaderop_cpy_f16_f32.h
             shaderop_cpy_f32_f16.h
             shaderop_cpy_f32_f32.h
         )
 
         # Create a custom command that depends on the generated_shaders
@@ -676,8 +710,10 @@ if (LLAMA_KOMPUTE)
         # Add the stamp to the main sources to ensure dependency tracking
         set(GGML_SOURCES_KOMPUTE ggml-kompute.cpp ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
         set(GGML_HEADERS_KOMPUTE ggml-kompute.h   ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
+
         add_compile_definitions(GGML_USE_KOMPUTE)
+
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} kompute)
         set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${CMAKE_BINARY_DIR})
     else()
@@ -685,6 +721,18 @@ if (LLAMA_KOMPUTE)
     endif()
 endif()
 
+if (LLAMA_CPU_HBM)
+    find_library(memkind memkind REQUIRED)
+
+    add_compile_definitions(GGML_USE_CPU_HBM)
+
+    target_link_libraries(ggml PUBLIC memkind)
+endif()
+
+if (LLAMA_PERF)
+    add_compile_definitions(GGML_PERF)
+endif()
+
 function(get_flags CCID CCVER)
     set(C_FLAGS "")
     set(CXX_FLAGS "")
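
Both relocated blocks stay plain toggles; for example, the CPU high-bandwidth-memory path (which needs the memkind library, per the find_library call above) is enabled with:

    cmake .. -DLLAMA_CPU_HBM=ON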
@@ -709,28 +757,30 @@ function(get_flags CCID CCVER)
         if (CCVER VERSION_GREATER_EQUAL 8.1.0)
             list(APPEND CXX_FLAGS -Wextra-semi)
         endif()
-    elseif (CCID MATCHES "Intel")
-        if (NOT LLAMA_SYCL)
-            # enable max optimization level when using Intel compiler
-            set(C_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
-            set(CXX_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
-            add_link_options(-fuse-ld=lld -static-intel)
-        endif()
     endif()
 
     set(GF_C_FLAGS   ${C_FLAGS}   PARENT_SCOPE)
     set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
 endfunction()
 
+if (LLAMA_FATAL_WARNINGS)
+    if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+        list(APPEND C_FLAGS   -Werror)
+        list(APPEND CXX_FLAGS -Werror)
+    elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+        add_compile_options(/WX)
+    endif()
+endif()
+
 if (LLAMA_ALL_WARNINGS)
     if (NOT MSVC)
-        set(WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
-        set(C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
-                    -Werror=implicit-int -Werror=implicit-function-declaration)
-        set(CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn)
-        set(C_FLAGS ${WARNING_FLAGS} ${C_FLAGS})
-        set(CXX_FLAGS ${WARNING_FLAGS} ${CXX_FLAGS})
+        list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
+        list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
+                            -Werror=implicit-int -Werror=implicit-function-declaration)
+        list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn)
+
+        list(APPEND C_FLAGS   ${WARNING_FLAGS})
+        list(APPEND CXX_FLAGS ${WARNING_FLAGS})
 
         get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})
@@ -746,9 +796,10 @@ endif()
 set(CUDA_CXX_FLAGS "")
 
 if (LLAMA_CUBLAS)
-    set(CUDA_FLAGS ${CXX_FLAGS} -use_fast_math)
-    if (NOT MSVC)
-        list(APPEND CUDA_FLAGS -Wno-pedantic)
+    set(CUDA_FLAGS -use_fast_math)
+
+    if (LLAMA_FATAL_WARNINGS)
+        list(APPEND CUDA_FLAGS -Werror all-warnings)
     endif()
 
     if (LLAMA_ALL_WARNINGS AND NOT MSVC)
@@ -782,7 +833,11 @@ if (LLAMA_CUBLAS)
         message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
 
         get_flags(${CUDA_CCID} ${CUDA_CCVER})
-        list(APPEND CUDA_CXX_FLAGS ${GF_CXX_FLAGS}) # This is passed to -Xcompiler later
+        list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} ${GF_CXX_FLAGS}) # This is passed to -Xcompiler later
+    endif()
+
+    if (NOT MSVC)
+        list(APPEND CUDA_CXX_FLAGS -Wno-pedantic)
     endif()
 endif()
@@ -821,6 +876,7 @@ execute_process(
     ERROR_VARIABLE output
     OUTPUT_QUIET
 )
+
 if (output MATCHES "dyld-1015\.7")
     add_compile_definitions(HAVE_BUGGY_APPLE_LINKER)
 endif()
@@ -830,10 +886,10 @@ endif()
 # feel free to update the Makefile for your architecture and send a pull request or issue
 
 message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
 
 if (MSVC)
     string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR)
     message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}")
 else ()
     set(CMAKE_GENERATOR_PLATFORM_LWR "")
 endif ()
 
 if (NOT MSVC)
@@ -855,11 +911,21 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR
     CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$")
     message(STATUS "ARM detected")
     if (MSVC)
+        add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
         add_compile_definitions(__ARM_NEON)
         add_compile_definitions(__ARM_FEATURE_FMA)
-        add_compile_definitions(__ARM_FEATURE_DOTPROD)
-        # add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) # MSVC doesn't support vdupq_n_f16, vld1q_f16, vst1q_f16
-        add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
+
+        set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
+        string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")
+        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
+        if (GGML_COMPILER_SUPPORT_DOTPROD)
+            add_compile_definitions(__ARM_FEATURE_DOTPROD)
+        endif ()
+        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
+        if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
+            add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+        endif ()
+        set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
     else()
         check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
         if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
@@ -870,10 +936,16 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR
             list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
         endif()
         if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
-            # Raspberry Pi 2
-            list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
+            if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
+                # Android armeabi-v7a
+                list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
+            else()
+                # Raspberry Pi 2
+                list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
+            endif()
         endif()
         if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
+            # Android arm64-v8a
             # Raspberry Pi 3, 4, Zero 2 (32-bit)
             list(APPEND ARCH_FLAGS -mno-unaligned-access)
         endif()
@@ -1017,11 +1089,6 @@ endif()
 
 # ggml
 
-if (GGML_USE_CPU_HBM)
-    add_definitions(-DGGML_USE_CPU_HBM)
-    find_library(memkind memkind REQUIRED)
-endif()
-
 add_library(ggml OBJECT
             ggml.c
             ggml.h
@@ -1038,16 +1105,17 @@ add_library(ggml OBJECT
             ${GGML_SOURCES_EXTRA}   ${GGML_HEADERS_EXTRA}
             ${GGML_SOURCES_SYCL}    ${GGML_HEADERS_SYCL}
             ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
+            ${GGML_SOURCES_VULKAN}  ${GGML_HEADERS_VULKAN}
+            ${GGML_SOURCES_ROCM}    ${GGML_HEADERS_ROCM}
             )
 
 target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
-target_compile_features(ggml PUBLIC c_std_11) # don't bump
+target_compile_features   (ggml PUBLIC c_std_11) # don't bump
+
 target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
 
-if (GGML_USE_CPU_HBM)
-    target_link_libraries(ggml PUBLIC memkind)
-endif()
-
 add_library(ggml_static STATIC $<TARGET_OBJECTS:ggml>)
 
 if (BUILD_SHARED_LIBS)
     set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
     add_library(ggml_shared SHARED $<TARGET_OBJECTS:ggml>)
@@ -1063,7 +1131,8 @@ add_library(llama
             )
 
 target_include_directories(llama PUBLIC .)
-target_compile_features(llama PUBLIC cxx_std_11) # don't bump
+target_compile_features   (llama PUBLIC cxx_std_11) # don't bump
+
 target_link_libraries(llama PRIVATE
     ggml
     ${LLAMA_EXTRA_LIBS}
@@ -1114,7 +1183,7 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
         DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama)
 
 set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h"
     "${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
    "${GGML_HEADERS_METAL}" "${GGML_HEADERS_MPI}" "${GGML_HEADERS_EXTRA}")
 
 set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")

Makefile

@@ -97,9 +97,10 @@ endif
 #
 # keep standard at C11 and C++11
 MK_CPPFLAGS = -I. -Icommon
-MK_CFLAGS = -std=c11 -fPIC
-MK_CXXFLAGS = -std=c++11 -fPIC
+MK_CFLAGS    = -std=c11   -fPIC
+MK_CXXFLAGS  = -std=c++11 -fPIC
+MK_NVCCFLAGS = -std=c++11
 
 # -Ofast tends to produce faster code, but may not be available for some compilers.
 ifdef LLAMA_FAST
@@ -172,7 +173,7 @@ ifdef LLAMA_DEBUG
 	MK_LDFLAGS += -g
 
 ifeq ($(UNAME_S),Linux)
-	MK_CXXFLAGS += -Wp,-D_GLIBCXX_ASSERTIONS
+	MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS
 endif
 else
 	MK_CPPFLAGS += -DNDEBUG
@@ -215,6 +216,11 @@ MK_CFLAGS += $(WARN_FLAGS) -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmis
 			-Werror=implicit-function-declaration
 MK_CXXFLAGS += $(WARN_FLAGS) -Wmissing-declarations -Wmissing-noreturn
 
+ifeq ($(LLAMA_FATAL_WARNINGS),1)
+	MK_CFLAGS   += -Werror
+	MK_CXXFLAGS += -Werror
+endif
+
 # this version of Apple ld64 is buggy
 ifneq '' '$(findstring dyld-1015.7,$(shell $(CC) $(LDFLAGS) -Wl,-v 2>&1))'
 	MK_CPPFLAGS += -DHAVE_BUGGY_APPLE_LINKER
@@ -379,6 +385,9 @@ ifdef LLAMA_CUBLAS
 	MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib
 	OBJS += ggml-cuda.o
 	MK_NVCCFLAGS += -use_fast_math
+ifdef LLAMA_FATAL_WARNINGS
+	MK_NVCCFLAGS += -Werror all-warnings
+endif # LLAMA_FATAL_WARNINGS
 ifndef JETSON_EOL_MODULE_DETECT
 	MK_NVCCFLAGS += --forward-unknown-to-host-compiler
 endif # JETSON_EOL_MODULE_DETECT
@@ -437,9 +446,9 @@ ifdef LLAMA_CUDA_CCBIN
 endif
 
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 ifdef JETSON_EOL_MODULE_DETECT
-	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 else
-	$(NVCC) $(BASE_CXXFLAGS) $(NVCCFLAGS) -Wno-pedantic -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+	$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 endif # JETSON_EOL_MODULE_DETECT
 endif # LLAMA_CUBLAS
@@ -524,11 +533,29 @@ ifdef LLAMA_METAL
 ifdef LLAMA_METAL_NDEBUG
 	MK_CPPFLAGS += -DGGML_METAL_NDEBUG
 endif
+ifdef LLAMA_METAL_EMBED_LIBRARY
+	MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY
+	OBJS        += ggml-metal-embed.o
+endif
 endif # LLAMA_METAL
 
 ifdef LLAMA_METAL
 ggml-metal.o: ggml-metal.m ggml-metal.h
 	$(CC) $(CFLAGS) -c $< -o $@
+
+ifdef LLAMA_METAL_EMBED_LIBRARY
+ggml-metal-embed.o: ggml-metal.metal
+	@echo "Embedding Metal library"
+	$(eval TEMP_ASSEMBLY=$(shell mktemp))
+	@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
+	@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
+	@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
+	@echo ".incbin \"$<\"" >> $(TEMP_ASSEMBLY)
+	@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
+	@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
+	@$(AS) $(TEMP_ASSEMBLY) -o $@
+	@rm -f ${TEMP_ASSEMBLY}
+endif
 endif # LLAMA_METAL
 
 ifdef LLAMA_MPI
@@ -540,9 +567,10 @@ GF_CC := $(CC)
 include scripts/get-flags.mk
 
 # combine build flags with cmdline overrides
-override CFLAGS    := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS)
-BASE_CXXFLAGS      := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
-override CXXFLAGS  := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS)
+override CPPFLAGS  := $(MK_CPPFLAGS) $(CPPFLAGS)
+override CFLAGS    := $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS)
+BASE_CXXFLAGS      := $(MK_CXXFLAGS) $(CXXFLAGS)
+override CXXFLAGS  := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS) $(CPPFLAGS)
 override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
 override LDFLAGS   := $(MK_LDFLAGS) $(LDFLAGS)
@@ -550,7 +578,7 @@ override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
 ifdef LLAMA_CUBLAS
 GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
 include scripts/get-flags.mk
-CUDA_CXXFLAGS := $(GF_CXXFLAGS)
+CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
 endif
 
 #
@@ -569,6 +597,14 @@ $(info I CC: $(shell $(CC) --version | head -n 1))
 $(info I CXX: $(shell $(CXX) --version | head -n 1))
 ifdef LLAMA_CUBLAS
 $(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
+CUDA_VERSION := $(shell nvcc --version | grep -oP 'release (\K[0-9]+\.[0-9])')
+ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
+ifndef CUDA_DOCKER_ARCH
+ifndef CUDA_POWER_ARCH
+$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH)
+endif # CUDA_POWER_ARCH
+endif # CUDA_DOCKER_ARCH
+endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
 endif # LLAMA_CUBLAS
 $(info )
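
On CUDA toolchains older than 11.7 the target architecture therefore has to be pinned explicitly, e.g. (a sketch; the value is forwarded to nvcc):

    make LLAMA_CUBLAS=1 CUDA_DOCKER_ARCH=sm_70 -j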
@@ -683,7 +719,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h examples/llava/llava.h examples/llava/llava.cpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp $< examples/llava/clip.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) -o $@ $(LDFLAGS) $(LWINSOCK2)
@@ -854,3 +890,7 @@ tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o te
 tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+tests/test-chat-template: tests/test-chat-template.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

--- a/Package.swift
+++ b/Package.swift

@@ -13,17 +13,31 @@ let package = Package(
     products: [
         .library(name: "llama", targets: ["llama"]),
     ],
-    dependencies: [
-        .package(url: "https://github.com/ggerganov/ggml.git", .branch("release"))
-    ],
     targets: [
         .target(
             name: "llama",
-            dependencies: ["ggml"],
             path: ".",
-            exclude: ["ggml-metal.metal"],
+            exclude: [
+               "cmake",
+               "examples",
+               "scripts",
+               "models",
+               "tests",
+               "CMakeLists.txt",
+               "ggml-cuda.cu",
+               "ggml-cuda.h",
+               "Makefile"
+            ],
             sources: [
+                "ggml.c",
                 "llama.cpp",
+                "ggml-alloc.c",
+                "ggml-backend.c",
+                "ggml-quants.c",
+                "ggml-metal.m",
+            ],
+            resources: [
+                .process("ggml-metal.metal")
             ],
             publicHeadersPath: "spm-headers",
             cSettings: [

--- a/README-sycl.md
+++ b/README-sycl.md

@@ -272,7 +272,7 @@ Please install [Visual Studio](https://visualstudio.microsoft.com/) which impact
 a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
 
-Recommend to install to default folder: **/opt/intel/oneapi**.
+Recommend to install to default folder: **C:\Program Files (x86)\Intel\oneAPI**.
 Following guide uses the default folder as example. If you use other folder, please modify the following guide info with your folder.

--- a/README.md
+++ b/README.md

@@ -10,13 +10,9 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
 ### Hot topics
 
-- Remove LLAMA_MAX_DEVICES and LLAMA_SUPPORTS_GPU_OFFLOAD: https://github.com/ggerganov/llama.cpp/pull/5240
-- Incoming backends: https://github.com/ggerganov/llama.cpp/discussions/5138
-- [SYCL backend](README-sycl.md) is ready (1/28/2024), support Linux/Windows in Intel GPUs (iGPU, Arc/Flex/Max series)
-- New SOTA quantized models, including pure 2-bits: https://huggingface.co/ikawrakow
-- Collecting Apple Silicon performance stats:
-  - M-series: https://github.com/ggerganov/llama.cpp/discussions/4167
-  - A-series: https://github.com/ggerganov/llama.cpp/discussions/4508
+- Support for chat templates: [Wiki (contributions welcome)](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
+- Support for Gemma models: https://github.com/ggerganov/llama.cpp/pull/5631
+- Non-linear quantization IQ4_NL: https://github.com/ggerganov/llama.cpp/pull/5590
 - Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216
 
 ----
@@ -61,7 +57,7 @@ variety of hardware - locally and in the cloud.
 - Plain C/C++ implementation without any dependencies
 - Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
 - AVX, AVX2 and AVX512 support for x86 architectures
-- 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
+- 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
 - Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP)
 - Vulkan, SYCL, and (partial) OpenCL backend support
 - CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
@@ -107,6 +103,7 @@ Typically finetunes of the base models below are supported as well.
 - [x] [Orion 14B](https://github.com/ggerganov/llama.cpp/pull/5118)
 - [x] [InternLM2](https://huggingface.co/models?search=internlm2)
 - [x] [CodeShell](https://github.com/WisdomShell/codeshell)
+- [x] [Gemma](https://ai.google.dev/gemma)
 
 **Multimodal models:**
@@ -145,6 +142,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [nat/openplayground](https://github.com/nat/openplayground)
 - [Faraday](https://faraday.dev/) (proprietary)
 - [LMStudio](https://lmstudio.ai/) (proprietary)
+- [LocalAI](https://github.com/mudler/LocalAI) (MIT)
 - [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
 - [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)
 - [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all)
@@ -156,6 +154,8 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
 - [semperai/amica](https://github.com/semperai/amica)
 - [withcatai/catai](https://github.com/withcatai/catai)
+- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
+- [Msty](https://msty.app) (proprietary)
 
 ---
@@ -768,7 +768,7 @@ The time per token is measured on a MacBook M1 Pro 32GB RAM using 4 and 8 threads.
 #### How to run
 
-1. Download/extract: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
+1. Download/extract: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
 2. Run `./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
 3. Output:
 ```
@@ -958,7 +958,7 @@ We have three Docker images available for this project:
 1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
 2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
-3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executabhle file. (platforms: `linux/amd64`, `linux/arm64`)
+3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`)
 
 Additionally, there the following images, similar to the above:

--- a/build.zig
+++ b/build.zig

@@ -123,6 +123,7 @@ pub fn build(b: *std.build.Builder) !void {
     const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
     const train = make.obj("train", "common/train.cpp");
     const clip = make.obj("clip", "examples/llava/clip.cpp");
+    const llava = make.obj("llava", "examples/llava/llava.cpp");
 
     _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, console, grammar_parser });
     _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
@@ -131,7 +132,7 @@ pub fn build(b: *std.build.Builder) !void {
     _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
     _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
 
-    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip });
+    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip, llava });
     if (server.target.isWindows()) {
         server.linkSystemLibrary("ws2_32");
     }

--- a/ci/run.sh
+++ b/ci/run.sh

@@ -33,7 +33,7 @@ sd=`dirname $0`
 cd $sd/../
 SRC=`pwd`
 
-CMAKE_EXTRA=""
+CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON"
 
 if [ ! -z ${GG_BUILD_METAL} ]; then
     CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_METAL_SHADER_DEBUG=ON"
@@ -219,7 +219,7 @@ function gg_run_open_llama_3b_v2 {
     gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin
     gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json
 
-    gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
+    gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
     unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
     head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
@@ -401,7 +401,7 @@ function gg_run_open_llama_7b_v2 {
     gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin
     gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/generation_config.json
 
-    gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
+    gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
     unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
 
     path_models="../models-mnt/open-llama/7B-v2"
@@ -568,6 +568,54 @@ function gg_sum_open_llama_7b_v2 {
     #gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
 }
 
+# bge-small
+
+function gg_run_embd_bge_small {
+    cd ${SRC}
+
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/tokenizer.model
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer_config.json
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/special_tokens_map.json
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/pytorch_model.bin
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/sentence_bert_config.json
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/vocab.txt
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/modules.json
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json
+
+    gg_wget models-mnt/bge-small/1_Pooling https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/1_Pooling/config.json
+
+    path_models="../models-mnt/bge-small"
+
+    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
+
+    set -e
+
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
+
+    python3 ../convert-hf-to-gguf.py ${path_models}
+
+    model_f16="${path_models}/ggml-model-f16.gguf"
+    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
+
+    ./bin/quantize ${model_f16} ${model_q8_0} q8_0
+
+    (time ./bin/embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+
+    set +e
+}
+
+function gg_sum_embd_bge_small {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'BGE Small (BERT):\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
+    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
+}
+
 ## main
 
 if [ -z ${GG_BUILD_LOW_PERF} ]; then
@@ -591,6 +639,8 @@ test $ret -eq 0 && gg_run ctest_debug
 test $ret -eq 0 && gg_run ctest_release
 
 if [ -z ${GG_BUILD_LOW_PERF} ]; then
+    test $ret -eq 0 && gg_run embd_bge_small
+
     if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
         if [ -z ${GG_BUILD_CUDA} ]; then
             test $ret -eq 0 && gg_run open_llama_3b_v2

--- a/common/common.cpp
+++ b/common/common.cpp

@@ -295,9 +295,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             std::string value(argv[i]);
-            /**/ if (value == "none")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; }
-            else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; }
-            else if (value == "yarn")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; }
+            /**/ if (value == "none")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
+            else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
+            else if (value == "yarn")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
             else { invalid_param = true; break; }
         } else if (arg == "--rope-scale") {
             if (++i >= argc) {
@@ -340,13 +340,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-            sparams.samplers_sequence = parse_samplers_input(argv[i]);
+            const auto sampler_names = string_split(argv[i], ';');
+            sparams.samplers_sequence = sampler_types_from_names(sampler_names, true);
         } else if (arg == "--sampling-seq") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            sparams.samplers_sequence = argv[i];
+            sparams.samplers_sequence = sampler_types_from_chars(argv[i]);
         } else if (arg == "--top-p") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -629,11 +630,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             }
             std::string arg_next = argv[i];
             if (arg_next == "none") {
-                params.split_mode = LLAMA_SPLIT_NONE;
+                params.split_mode = LLAMA_SPLIT_MODE_NONE;
             } else if (arg_next == "layer") {
-                params.split_mode = LLAMA_SPLIT_LAYER;
+                params.split_mode = LLAMA_SPLIT_MODE_LAYER;
             } else if (arg_next == "row") {
-                params.split_mode = LLAMA_SPLIT_ROW;
+                params.split_mode = LLAMA_SPLIT_MODE_ROW;
             } else {
                 invalid_param = true;
                 break;
@@ -670,7 +671,15 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
         } else if (arg == "--no-mmap") {
            params.use_mmap = false;
         } else if (arg == "--numa") {
-            params.numa = true;
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            std::string value(argv[i]);
+            /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
+            else if (value == "isolate")                   { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
+            else if (value == "numactl")                   { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
+            else { invalid_param = true; break; }
         } else if (arg == "--verbose-prompt") {
             params.verbose_prompt = true;
         } else if (arg == "--no-display-prompt") {
@@ -828,15 +837,15 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             sep++;
             if (strncmp(sep, "int:", 4) == 0) {
                 sep += 4;
-                kvo.tag = LLAMA_KV_OVERRIDE_INT;
+                kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
                 kvo.int_value = std::atol(sep);
             } else if (strncmp(sep, "float:", 6) == 0) {
                 sep += 6;
-                kvo.tag = LLAMA_KV_OVERRIDE_FLOAT;
+                kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
                 kvo.float_value = std::atof(sep);
             } else if (strncmp(sep, "bool:", 5) == 0) {
                 sep += 5;
-                kvo.tag = LLAMA_KV_OVERRIDE_BOOL;
+                kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
                 if (std::strcmp(sep, "true") == 0) {
                     kvo.bool_value = true;
                 } else if (std::strcmp(sep, "false") == 0) {
@@ -906,6 +915,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
 void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     const llama_sampling_params & sparams = params.sparams;
 
+    std::string sampler_type_chars;
+    std::string sampler_type_names;
+    for (const auto sampler_type : sparams.samplers_sequence) {
+        sampler_type_chars += static_cast<char>(sampler_type);
+        sampler_type_names += sampler_type_to_name_string(sampler_type) + ";";
+    }
+    sampler_type_names.pop_back();
+
     printf("\n");
     printf("usage: %s [options]\n", argv[0]);
     printf("\n");
@@ -926,7 +943,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -tb N, --threads-batch N\n");
     printf("                        number of threads to use during batch and prompt processing (default: same as --threads)\n");
     printf("  -td N, --threads-draft N");
-    printf("                        number of threads to use during generation (default: same as --threads)");
+    printf("                        number of threads to use during generation (default: same as --threads)\n");
     printf("  -tbd N, --threads-batch-draft N\n");
     printf("                        number of threads to use during batch and prompt processing (default: same as --threads-draft)\n");
     printf("  -p PROMPT, --prompt PROMPT\n");
@@ -947,8 +964,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
     printf("  -c N, --ctx-size N    size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
     printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
-    printf("  --samplers            samplers that will be used for generation in the order, separated by \';\', for example: \"top_k;tfs;typical;top_p;min_p;temp\"\n");
-    printf("  --sampling-seq        simplified sequence for samplers that will be used (default: %s)\n", sparams.samplers_sequence.c_str());
+    printf("  --samplers            samplers that will be used for generation in the order, separated by \';\'\n");
+    printf("                        (default: %s)\n", sampler_type_names.c_str());
+    printf("  --sampling-seq        simplified sequence for samplers that will be used (default: %s)\n", sampler_type_chars.c_str());
     printf("  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
     printf("  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
     printf("  --min-p N             min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
@@ -996,7 +1014,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  --winogrande-tasks N  number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks);
     printf("  --multiple-choice     compute multiple choice score over random tasks from datafile supplied with -f\n");
     printf("  --multiple-choice-tasks N number of tasks to use when computing the multiple choice score (default: %zu)\n", params.winogrande_tasks);
-    printf("  --kl-divergence       computes KL-divergence to logits provided via --kl-divergence-base");
+    printf("  --kl-divergence       computes KL-divergence to logits provided via --kl-divergence-base\n");
     printf("  --keep N              number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
     printf("  --draft N             number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
     printf("  --chunks N            max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
@@ -1013,7 +1031,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     if (llama_supports_mmap()) {
         printf("  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
-    printf("  --numa                attempt optimizations that help on some NUMA systems\n");
+    printf("  --numa TYPE           attempt optimizations that help on some NUMA systems\n");
+    printf("                          - distribute: spread execution evenly over all nodes\n");
+    printf("                          - isolate: only spawn threads on CPUs on the node that execution started on\n");
+    printf("                          - numactl: use the CPU map provided by numactl\n");
     printf("                        if run without this previously, it is recommended to drop the system page cache before using this\n");
     printf("                        see https://github.com/ggerganov/llama.cpp/issues/1437\n");
     if (llama_supports_gpu_offload()) {
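
Worth noting how the help text above now derives its defaults instead of hard-coding them: the `--samplers` and `--sampling-seq` strings are built from `sparams.samplers_sequence` at runtime. A self-contained sketch of that derivation (the enum and the name function here mirror definitions introduced later in this patch; this is illustrative, not a verbatim excerpt):

```cpp
#include <cstdio>
#include <string>
#include <vector>

// mirrors the enum added in common/sampling.h: each sampler's value is its one-letter code
enum class llama_sampler_type : char {
    TOP_K = 'k', TOP_P = 'p', MIN_P = 'm', TFS_Z = 'f', TYPICAL_P = 'y', TEMPERATURE = 't'
};

static std::string sampler_type_to_name_string(llama_sampler_type t) {
    switch (t) {
        case llama_sampler_type::TOP_K:       return "top_k";
        case llama_sampler_type::TFS_Z:       return "tfs_z";
        case llama_sampler_type::TYPICAL_P:   return "typical_p";
        case llama_sampler_type::TOP_P:       return "top_p";
        case llama_sampler_type::MIN_P:       return "min_p";
        case llama_sampler_type::TEMPERATURE: return "temperature";
        default:                              return "";
    }
}

int main() {
    // default sequence from llama_sampling_params
    const std::vector<llama_sampler_type> seq = {
        llama_sampler_type::TOP_K, llama_sampler_type::TFS_Z, llama_sampler_type::TYPICAL_P,
        llama_sampler_type::TOP_P, llama_sampler_type::MIN_P, llama_sampler_type::TEMPERATURE,
    };
    std::string chars, names;
    for (auto t : seq) {
        chars += static_cast<char>(t);                  // builds "kfypmt"
        names += sampler_type_to_name_string(t) + ";";  // builds "top_k;tfs_z;..."
    }
    names.pop_back(); // drop trailing ';'
    std::printf("--sampling-seq default: %s\n", chars.c_str()); // kfypmt
    std::printf("--samplers default:     %s\n", names.c_str()); // top_k;tfs_z;typical_p;top_p;min_p;temperature
}
```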
@@ -1097,45 +1118,101 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
 }
 
 //
-// String parsing
+// String utils
 //
 
-std::string parse_samplers_input(std::string input) {
-    std::string output = "";
+std::vector<std::string> string_split(std::string input, char separator) {
+    std::vector<std::string> parts;
+    size_t separator_pos = input.find(separator);
+    while (separator_pos != std::string::npos) {
+        std::string part = input.substr(0, separator_pos);
+        parts.emplace_back(part);
+        input = input.substr(separator_pos + 1);
+        separator_pos = input.find(separator);
+    }
+    parts.emplace_back(input);
+    return parts;
+}
+
+std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
+    std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
+        {"top_k",       llama_sampler_type::TOP_K},
+        {"top_p",       llama_sampler_type::TOP_P},
+        {"typical_p",   llama_sampler_type::TYPICAL_P},
+        {"min_p",       llama_sampler_type::MIN_P},
+        {"tfs_z",       llama_sampler_type::TFS_Z},
+        {"temperature", llama_sampler_type::TEMPERATURE}
+    };
+
     // since samplers names are written multiple ways
     // make it ready for both system names and input names
-    std::unordered_map<std::string, char> samplers_symbols {
-        {"top_k",      'k'},
-        {"top-k",      'k'},
-        {"top_p",      'p'},
-        {"top-p",      'p'},
-        {"nucleus",    'p'},
-        {"typical_p",  'y'},
-        {"typical-p",  'y'},
-        {"typical",    'y'},
-        {"min_p",      'm'},
-        {"min-p",      'm'},
-        {"tfs_z",      'f'},
-        {"tfs-z",      'f'},
-        {"tfs",        'f'},
-        {"temp",       't'},
-        {"temperature",'t'}
+    std::unordered_map<std::string, llama_sampler_type> sampler_alt_name_map {
+        {"top-k",     llama_sampler_type::TOP_K},
+        {"top-p",     llama_sampler_type::TOP_P},
+        {"nucleus",   llama_sampler_type::TOP_P},
+        {"typical-p", llama_sampler_type::TYPICAL_P},
+        {"typical",   llama_sampler_type::TYPICAL_P},
+        {"min-p",     llama_sampler_type::MIN_P},
+        {"tfs-z",     llama_sampler_type::TFS_Z},
+        {"tfs",       llama_sampler_type::TFS_Z},
+        {"temp",      llama_sampler_type::TEMPERATURE}
     };
-    // expected format example: "temp;top_k;tfs_z;typical_p;top_p;min_p"
-    size_t separator = input.find(';');
-    while (separator != input.npos) {
-        std::string name = input.substr(0,separator);
-        input = input.substr(separator+1);
-        separator = input.find(';');
 
-        if (samplers_symbols.find(name) != samplers_symbols.end()) {
-            output += samplers_symbols[name];
+    std::vector<llama_sampler_type> sampler_types;
+    sampler_types.reserve(names.size());
+    for (const auto & name : names)
+    {
+        auto sampler_item = sampler_canonical_name_map.find(name);
+        if (sampler_item != sampler_canonical_name_map.end())
+        {
+            sampler_types.push_back(sampler_item->second);
+        }
+        else
+        {
+            if (allow_alt_names)
+            {
+                sampler_item = sampler_alt_name_map.find(name);
+                if (sampler_item != sampler_alt_name_map.end())
+                {
+                    sampler_types.push_back(sampler_item->second);
+                }
+            }
         }
     }
-    if (samplers_symbols.find(input) != samplers_symbols.end()) {
-        output += samplers_symbols[input];
+    return sampler_types;
+}
+
+std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string) {
+    std::unordered_map<char, llama_sampler_type> sampler_name_map {
+        {'k', llama_sampler_type::TOP_K},
+        {'p', llama_sampler_type::TOP_P},
+        {'y', llama_sampler_type::TYPICAL_P},
+        {'m', llama_sampler_type::MIN_P},
+        {'f', llama_sampler_type::TFS_Z},
+        {'t', llama_sampler_type::TEMPERATURE}
+    };
+
+    std::vector<llama_sampler_type> sampler_types;
+    sampler_types.reserve(names_string.size());
+    for (const auto & c : names_string) {
+        const auto sampler_item = sampler_name_map.find(c);
+        if (sampler_item != sampler_name_map.end()) {
+            sampler_types.push_back(sampler_item->second);
+        }
+    }
+    return sampler_types;
+}
+
+std::string sampler_type_to_name_string(llama_sampler_type sampler_type) {
+    switch (sampler_type) {
+        case llama_sampler_type::TOP_K:       return "top_k";
+        case llama_sampler_type::TFS_Z:       return "tfs_z";
+        case llama_sampler_type::TYPICAL_P:   return "typical_p";
+        case llama_sampler_type::TOP_P:       return "top_p";
+        case llama_sampler_type::MIN_P:       return "min_p";
+        case llama_sampler_type::TEMPERATURE: return "temperature";
+        default : return "";
     }
-    return output;
 }
 
 //
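
A quick illustration of how the two CLI spellings converge after this change; it assumes the three functions above are available via `common/common.h` (illustrative usage, not code from the patch):

```cpp
#include <cassert>
#include "common/common.h" // assumed include path for string_split and the sampler parsers

void check_sampler_parsing() {
    // "--samplers top_k;min_p;temp": split on ';', then resolve names
    // ("temp" resolves via the alt-name map, so allow_alt_names must be true)
    const auto names      = string_split("top_k;min_p;temp", ';');
    const auto from_names = sampler_types_from_names(names, /*allow_alt_names=*/true);

    // "--sampling-seq kmt": one sampler per character
    const auto from_chars = sampler_types_from_chars("kmt");

    // both spellings should produce the same sequence; unknown names/chars are skipped
    assert(from_names == from_chars);
}
```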
@@ -1550,6 +1627,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "cpu_has_blas: %s\n",        ggml_cpu_has_blas()        ? "true" : "false");
     fprintf(stream, "cpu_has_sse3: %s\n",        ggml_cpu_has_sse3()        ? "true" : "false");
     fprintf(stream, "cpu_has_vsx: %s\n",         ggml_cpu_has_vsx()         ? "true" : "false");
+    fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false");
 
 #ifdef NDEBUG
     fprintf(stream, "debug: false\n");
@@ -1626,6 +1704,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     }
     fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
+    fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
     fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
     fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
     fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
@@ -1639,7 +1718,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
     fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false");
     fprintf(stream, "no_penalize_nl: %s # default: false\n", !sparams.penalize_nl ? "true" : "false");
-    fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false");
     fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
     fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
     fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present);
@@ -1664,7 +1742,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
     fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
-    fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed);
+    fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed);
     fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
     fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
     fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
@@ -1673,7 +1751,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
     fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
-    fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency());
+    fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
     fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
     fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
     fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
@@ -1724,7 +1802,8 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
             if (cs_curr[j] < 0) { continue; }
             if (seqs.find(cs_curr[j]) == seqs.end()) {
                 if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
-                seqs[cs_curr[j]] = seqs.size();
+                const size_t sz = seqs.size();
+                seqs[cs_curr[j]] = sz;
             }
         }
         if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
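
The last hunk's two-step assignment is worth a note: in `seqs[cs_curr[j]] = seqs.size()`, `operator[]` may insert into the map, and before C++17 whether `size()` is read before or after that insertion is unspecified. A toy demonstration of the hazard and of the fix's explicit ordering:

```cpp
#include <cstdio>
#include <map>

int main() {
    std::map<int, size_t> m;
    // Before C++17 this is unspecified: if m[42] (which inserts) is evaluated
    // first, size() yields 1; if size() is evaluated first, it yields 0.
    m[42] = m.size();
    std::printf("m[42]  = %zu\n", m[42]); // 0 or 1 depending on compiler/standard mode

    // The patch pins the order explicitly:
    std::map<int, size_t> m2;
    const size_t sz = m2.size(); // read the size first
    m2[42] = sz;                 // then insert; always 0
    std::printf("m2[42] = %zu\n", m2[42]);
}
```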

--- a/common/common.h
+++ b/common/common.h

@@ -61,7 +61,7 @@ struct gpt_params {
     float   p_split            = 0.1f;  // speculative decoding split probability
     int32_t n_gpu_layers       = -1;    // number of layers to store in VRAM (-1 - use default)
     int32_t n_gpu_layers_draft = -1;    // number of layers to store in VRAM for the draft model (-1 - use default)
-    llama_split_mode split_mode = LLAMA_SPLIT_LAYER; // how to split the model across GPUs
+    llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
     int32_t main_gpu           = 0;     // the GPU that is used for scratch and small tensors
     float   tensor_split[128]  = {0};   // how split tensors should be distributed across GPUs
     int32_t n_beams            = 0;     // if non-zero then use beam search of given width.
@@ -75,7 +75,8 @@ struct gpt_params {
     float   yarn_beta_fast     = 32.0f; // YaRN low correction dim
     float   yarn_beta_slow     = 1.0f;  // YaRN high correction dim
     int32_t yarn_orig_ctx      = 0;     // YaRN original context length
-    int32_t rope_scaling_type  = LLAMA_ROPE_SCALING_UNSPECIFIED;
+    int32_t rope_scaling_type  = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
+    ggml_numa_strategy numa    = GGML_NUMA_STRATEGY_DISABLED;
 
     // // sampling parameters
     struct llama_sampling_params sparams;
@@ -134,7 +135,6 @@ struct gpt_params {
     bool logits_all        = false; // return logits for all tokens in the batch
     bool use_mmap          = true;  // use mmap for faster loads
     bool use_mlock         = false; // use mlock to keep model in memory
-    bool numa              = false; // attempt optimizations that help on some NUMA systems
     bool verbose_prompt    = false; // print prompt tokens before generation
     bool display_prompt    = true;  // print prompt before generation
     bool infill            = false; // use infill mode
@@ -162,10 +162,13 @@ std::string gpt_random_prompt(std::mt19937 & rng);
 void process_escapes(std::string& input);
 
 //
-// String parsing
+// String utils
 //
 
-std::string parse_samplers_input(std::string input);
+std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
+std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
+std::vector<std::string> string_split(std::string input, char separator);
+std::string sampler_type_to_name_string(llama_sampler_type sampler_type);
 
 //
 // Model utils

--- a/common/sampling.cpp
+++ b/common/sampling.cpp

@@ -103,15 +103,10 @@ std::string llama_sampling_print(const llama_sampling_params & params) {
 std::string llama_sampling_order_print(const llama_sampling_params & params) {
     std::string result = "CFG -> Penalties ";
     if (params.mirostat == 0) {
-        for (auto s : params.samplers_sequence) {
-            switch (s) {
-                case 'k': result += "-> top_k "; break;
-                case 'f': result += "-> tfs_z "; break;
-                case 'y': result += "-> typical_p "; break;
-                case 'p': result += "-> top_p "; break;
-                case 'm': result += "-> min_p "; break;
-                case 't': result += "-> temp "; break;
-                default : break;
+        for (auto sampler_type : params.samplers_sequence) {
+            const auto sampler_type_name = sampler_type_to_name_string(sampler_type);
+            if (!sampler_type_name.empty()) {
+                result += "-> " + sampler_type_name + " ";
             }
         }
     } else {
@@ -126,9 +121,7 @@ static void sampler_queue(
                    struct llama_context * ctx_main,
             const llama_sampling_params & params,
                  llama_token_data_array & cur_p,
-                                 size_t & min_keep) {
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
-
+                                   size_t min_keep) {
     const float         temp              = params.temp;
     const float         dynatemp_range    = params.dynatemp_range;
     const float         dynatemp_exponent = params.dynatemp_exponent;
@@ -137,16 +130,16 @@ static void sampler_queue(
     const float         min_p             = params.min_p;
     const float         tfs_z             = params.tfs_z;
     const float         typical_p         = params.typical_p;
-    const std::string & samplers_sequence = params.samplers_sequence;
+    const std::vector<llama_sampler_type> & samplers_sequence = params.samplers_sequence;
 
-    for (auto s : samplers_sequence) {
-        switch (s){
-            case 'k': llama_sample_top_k    (ctx_main, &cur_p, top_k,     min_keep); break;
-            case 'f': llama_sample_tail_free(ctx_main, &cur_p, tfs_z,     min_keep); break;
-            case 'y': llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep); break;
-            case 'p': llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep); break;
-            case 'm': llama_sample_min_p    (ctx_main, &cur_p, min_p,     min_keep); break;
-            case 't':
+    for (auto sampler_type : samplers_sequence) {
+        switch (sampler_type) {
+            case llama_sampler_type::TOP_K    : llama_sample_top_k    (ctx_main, &cur_p, top_k,     min_keep); break;
+            case llama_sampler_type::TFS_Z    : llama_sample_tail_free(ctx_main, &cur_p, tfs_z,     min_keep); break;
+            case llama_sampler_type::TYPICAL_P: llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep); break;
+            case llama_sampler_type::TOP_P    : llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep); break;
+            case llama_sampler_type::MIN_P    : llama_sample_min_p    (ctx_main, &cur_p, min_p,     min_keep); break;
+            case llama_sampler_type::TEMPERATURE:
                 if (dynatemp_range > 0) {
                     float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
                     float dynatemp_max = std::max(0.0f, temp + dynatemp_range);
@@ -256,7 +249,7 @@ static llama_token llama_sampling_sample_impl(
         id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu);
     } else {
         // temperature sampling
-        size_t min_keep = std::max(1, params.n_probs);
+        size_t min_keep = std::max(1, params.min_keep);
         sampler_queue(ctx_main, params, cur_p, min_keep);
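
Two related changes here: `min_keep` now comes from `params.min_keep` (clamped to at least 1) rather than from `n_probs`, and it is passed by value since `sampler_queue` no longer writes back through it. The queue itself simply dispatches the configured samplers in sequence order; the following is a self-contained toy of that control flow, not the llama.cpp API:

```cpp
#include <algorithm>
#include <cstdio>
#include <functional>
#include <vector>

enum class sampler { TOP_K, TEMPERATURE }; // toy stand-in for llama_sampler_type

// keep the k highest-probability candidates, but never fewer than min_keep
static void top_k(std::vector<float> & p, size_t k, size_t min_keep) {
    const size_t keep = std::max(k, min_keep);
    if (p.size() <= keep) return;
    std::partial_sort(p.begin(), p.begin() + keep, p.end(), std::greater<float>());
    p.resize(keep);
}

int main() {
    std::vector<float> cand = {0.5f, 0.2f, 0.15f, 0.1f, 0.05f};
    const std::vector<sampler> sequence = {sampler::TOP_K, sampler::TEMPERATURE};
    const size_t min_keep = 1; // mirrors std::max(1, params.min_keep)

    for (auto s : sequence) { // dispatch in configured order, like sampler_queue
        switch (s) {
            case sampler::TOP_K:       top_k(cand, 3, min_keep); break;
            case sampler::TEMPERATURE: for (auto & x : cand) x /= 0.8f; break; // scale by temp
        }
    }
    std::printf("%zu candidates remain\n", cand.size()); // 3
}
```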

--- a/common/sampling.h
+++ b/common/sampling.h

@@ -8,10 +8,21 @@
 #include <vector>
 #include <unordered_map>
 
+// sampler types
+enum class llama_sampler_type : char {
+    TOP_K       = 'k',
+    TOP_P       = 'p',
+    MIN_P       = 'm',
+    TFS_Z       = 'f',
+    TYPICAL_P   = 'y',
+    TEMPERATURE = 't'
+};
+
 // sampling parameters
 typedef struct llama_sampling_params {
     int32_t     n_prev                = 64;    // number of previous tokens to remember
     int32_t     n_probs               = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t     min_keep              = 0;     // 0 = disabled, otherwise samplers should return at least min_keep tokens
     int32_t     top_k                 = 40;    // <= 0 to use vocab size
     float       top_p                 = 0.95f; // 1.0 = disabled
     float       min_p                 = 0.05f; // 0.0 = disabled
@@ -28,7 +39,15 @@ typedef struct llama_sampling_params {
     float       mirostat_tau          = 5.00f; // target entropy
     float       mirostat_eta          = 0.10f; // learning rate
     bool        penalize_nl           = true;  // consider newlines as a repeatable token
-    std::string samplers_sequence     = "kfypmt"; // top_k, tail_free, typical_p, top_p, min_p, temp
+
+    std::vector<llama_sampler_type> samplers_sequence = {
+        llama_sampler_type::TOP_K,
+        llama_sampler_type::TFS_Z,
+        llama_sampler_type::TYPICAL_P,
+        llama_sampler_type::TOP_P,
+        llama_sampler_type::MIN_P,
+        llama_sampler_type::TEMPERATURE
+    };
 
     std::string grammar;  // optional BNF-like grammar to constrain sampling
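
Since each enum value is the sampler's one-letter code, the typed default above is exactly what the old `"kfypmt"` string used to parse to. A small check, assuming the common headers are on the include path (illustrative only):

```cpp
#include <cassert>
#include "common/common.h"   // assumed path for sampler_types_from_chars
#include "common/sampling.h" // assumed path for llama_sampling_params

int main() {
    llama_sampling_params p;
    // the new vector default matches the parse of the old string default
    assert(p.samplers_sequence == sampler_types_from_chars("kfypmt"));
}
```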

--- a/common/train.cpp
+++ b/common/train.cpp

@@ -31,7 +31,7 @@ struct train_state * init_train_state() {
     state->opt = new struct ggml_opt_context;
     state->opt->ctx = NULL;
-    state->opt->params = ggml_opt_default_params(GGML_OPT_ADAM);
+    state->opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
     state->opt->params.graph_size = LLAMA_TRAIN_MAX_NODES;
     state->opt->loss_after = 0.0f;
@@ -556,7 +556,7 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g
     std::string opt_type;
     GGUF_GET_KEY(fctx, opt_type, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_OPTIMIZER_TYPE);
     if (opt_type == LLM_KV_OPTIMIZER_TYPE_ADAM) {
-        opt->params.type = GGML_OPT_ADAM;
+        opt->params.type = GGML_OPT_TYPE_ADAM;
 
         GGUF_GET_KEY(fctx, opt->adam.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS);
         GGUF_GET_KEY(fctx, opt->adam.fx_prev, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS);
@@ -568,7 +568,7 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g
         copy_tensor_by_name(opt->adam.v,  f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS);
         copy_tensor_by_name(opt->adam.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES);
     } else if (opt_type == LLM_KV_OPTIMIZER_TYPE_LBFGS) {
-        opt->params.type = GGML_OPT_LBFGS;
+        opt->params.type = GGML_OPT_TYPE_LBFGS;
 
         GGUF_GET_KEY(fctx, opt->params.lbfgs.m, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT);
         GGUF_GET_KEY(fctx, opt->lbfgs.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS);
@@ -603,7 +603,7 @@ void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context *
     gguf_set_val_bool(fctx, LLM_KV_OPTIMIZER_JUST_INITIALIZED, opt->just_initialized);
 
     switch (opt->params.type) {
-        case GGML_OPT_ADAM:
+        case GGML_OPT_TYPE_ADAM:
             {
                 gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM);
                 gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, opt->adam.fx_best);
@@ -622,7 +622,7 @@ void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context *
                     gguf_add_tensor(fctx, opt->adam.pf);
                 }
             } break;
-        case GGML_OPT_LBFGS:
+        case GGML_OPT_TYPE_LBFGS:
             {
                 gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS);
                 gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, opt->params.lbfgs.m);

--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py

@ -10,7 +10,7 @@ import re
import sys import sys
from enum import IntEnum from enum import IntEnum
from pathlib import Path from pathlib import Path
from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast from typing import TYPE_CHECKING, Any, ContextManager, Iterator, Sequence, cast
import numpy as np import numpy as np
import torch import torch
@ -25,15 +25,6 @@ import gguf
from convert import HfVocab from convert import HfVocab
# check for any of the given keys in the dictionary and return the value of the first key found
def get_key_opts(d, keys):
for k in keys:
if k in d:
return d[k]
print(f"Could not find any of {keys}")
sys.exit()
###### MODEL DEFINITIONS ###### ###### MODEL DEFINITIONS ######
class SentencePieceTokenTypes(IntEnum): class SentencePieceTokenTypes(IntEnum):
@ -58,6 +49,15 @@ class Model:
self.hparams = Model.load_hparams(self.dir_model) self.hparams = Model.load_hparams(self.dir_model)
self.model_arch = self._get_model_architecture() self.model_arch = self._get_model_architecture()
self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False) self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False)
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
def find_hparam(self, keys: Sequence[str], optional: bool = False) -> Any:
key = next((k for k in keys if k in self.hparams), None)
if key is not None:
return self.hparams[key]
if optional:
return None
raise KeyError(f"could not find any of: {keys}")
def set_vocab(self): def set_vocab(self):
self._set_vocab_gpt2() self._set_vocab_gpt2()
@ -79,28 +79,33 @@ class Model:
def set_gguf_parameters(self): def set_gguf_parameters(self):
self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_block_count(self.hparams.get( self.gguf_writer.add_block_count(self.block_count)
"n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")),
)) if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
if (n_ctx := self.hparams.get("max_position_embeddings")) is not None:
self.gguf_writer.add_context_length(n_ctx) self.gguf_writer.add_context_length(n_ctx)
if (n_embd := self.hparams.get("hidden_size")) is not None:
self.gguf_writer.add_embedding_length(n_embd) n_embd = self.find_hparam(["hidden_size", "n_embd"])
if (n_ff := self.hparams.get("intermediate_size")) is not None: self.gguf_writer.add_embedding_length(n_embd)
if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None:
self.gguf_writer.add_feed_forward_length(n_ff) self.gguf_writer.add_feed_forward_length(n_ff)
if (n_head := self.hparams.get("num_attention_heads")) is not None:
self.gguf_writer.add_head_count(n_head) n_head = self.find_hparam(["num_attention_heads", "n_head"])
self.gguf_writer.add_head_count(n_head)
if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None: if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
self.gguf_writer.add_head_count_kv(n_head_kv) self.gguf_writer.add_head_count_kv(n_head_kv)
if (n_rms_eps := self.hparams.get("rms_norm_eps")) is not None: if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
self.gguf_writer.add_layer_norm_rms_eps(n_rms_eps) self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon"], optional=True)) is not None:
self.gguf_writer.add_layer_norm_eps(f_norm_eps)
if (n_experts := self.hparams.get("num_local_experts")) is not None: if (n_experts := self.hparams.get("num_local_experts")) is not None:
self.gguf_writer.add_expert_count(n_experts) self.gguf_writer.add_expert_count(n_experts)
if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None: if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
self.gguf_writer.add_expert_used_count(n_experts_used) self.gguf_writer.add_expert_used_count(n_experts_used)
self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True)) self.gguf_writer.add_file_type(self.ftype)
def write_tensors(self): def write_tensors(self):
block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
@ -187,7 +192,7 @@ class Model:
return RefactModel return RefactModel
if model_architecture == "PersimmonForCausalLM": if model_architecture == "PersimmonForCausalLM":
return PersimmonModel return PersimmonModel
if model_architecture in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"): if model_architecture in ("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
return StableLMModel return StableLMModel
if model_architecture == "QWenLMHeadModel": if model_architecture == "QWenLMHeadModel":
return QwenModel return QwenModel
@ -209,6 +214,12 @@ class Model:
return InternLM2Model return InternLM2Model
if model_architecture == "MiniCPMForCausalLM": if model_architecture == "MiniCPMForCausalLM":
return MiniCPMModel return MiniCPMModel
if model_architecture == "BertModel":
return BertModel
if model_architecture == "NomicBertModel":
return NomicBertModel
if model_architecture == "GemmaForCausalLM":
return GemmaModel
return Model return Model
def _is_model_safetensors(self) -> bool: def _is_model_safetensors(self) -> bool:
@ -242,7 +253,7 @@ class Model:
return gguf.MODEL_ARCH.REFACT return gguf.MODEL_ARCH.REFACT
if arch == "PersimmonForCausalLM": if arch == "PersimmonForCausalLM":
return gguf.MODEL_ARCH.PERSIMMON return gguf.MODEL_ARCH.PERSIMMON
if arch in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"): if arch in ("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
return gguf.MODEL_ARCH.STABLELM return gguf.MODEL_ARCH.STABLELM
if arch == "QWenLMHeadModel": if arch == "QWenLMHeadModel":
return gguf.MODEL_ARCH.QWEN return gguf.MODEL_ARCH.QWEN
@ -264,6 +275,12 @@ class Model:
return gguf.MODEL_ARCH.INTERNLM2 return gguf.MODEL_ARCH.INTERNLM2
if arch == "MiniCPMForCausalLM": if arch == "MiniCPMForCausalLM":
return gguf.MODEL_ARCH.MINICPM return gguf.MODEL_ARCH.MINICPM
if arch == "BertModel":
return gguf.MODEL_ARCH.BERT
if arch == "NomicBertModel":
return gguf.MODEL_ARCH.NOMIC_BERT
if arch == "GemmaForCausalLM":
return gguf.MODEL_ARCH.GEMMA
raise NotImplementedError(f'Architecture "{arch}" not supported!') raise NotImplementedError(f'Architecture "{arch}" not supported!')
@ -605,11 +622,6 @@ class MPTModel(Model):
self.gguf_writer.add_tensor(new_name, data) self.gguf_writer.add_tensor(new_name, data)
# note: MPT output is tied to (same as) wte in original model;
# for easier implementation in llama.cpp it's duplicated in GGUF, though :/
if new_name == "token_embd.weight":
self.gguf_writer.add_tensor("output.weight", data)
class OrionModel(Model): class OrionModel(Model):
def set_vocab(self): def set_vocab(self):
@@ -642,6 +654,8 @@ class OrionModel(Model):
         self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
         self.gguf_writer.add_head_count(head_count)
         self.gguf_writer.add_head_count_kv(head_count_kv)
+        # note: config provides rms norm but it is actually layer norm
+        # ref: https://huggingface.co/OrionStarAI/Orion-14B-Chat/blob/276a17221ce42beb45f66fac657a41540e71f4f5/modeling_orion.py#L570-L571
         self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"])

     def write_tensors(self):
@@ -1018,7 +1032,6 @@ class PersimmonModel(Model):
         self.gguf_writer.add_head_count_kv(head_count_kv)
         self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
         self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
-        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])

     def set_vocab(self):
         self._set_vocab_sentencepiece()
@@ -1061,10 +1074,11 @@ class StableLMModel(Model):
         self.gguf_writer.add_embedding_length(hparams["hidden_size"])
         self.gguf_writer.add_block_count(block_count)
         self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
-        self.gguf_writer.add_rope_dimension_count(int(hparams["rope_pct"] * (hparams["hidden_size"] // hparams["num_attention_heads"])))
+        rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"])
+        self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
         self.gguf_writer.add_head_count(hparams["num_attention_heads"])
         self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
-        self.gguf_writer.add_layer_norm_eps(1e-5)
+        self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))

 class MixtralModel(Model):
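Note on the rotary dimension arithmetic in the StableLM hunk above: the count written to GGUF is the fraction of each attention head's width that RoPE rotates. A quick worked example in Python, with illustrative numbers rather than any specific checkpoint's config:

```python
# Illustrative hyperparameters (placeholders, not from a real config).
hidden_size = 2560
num_attention_heads = 32
rotary_factor = 0.25  # from "partial_rotary_factor", or the older "rope_pct"

head_dim = hidden_size // num_attention_heads  # 80 dimensions per head
rope_dims = int(rotary_factor * head_dim)      # 20 of those are rotated by RoPE
print(rope_dims)
```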
@@ -1293,21 +1307,21 @@ class GPT2Model(Model):
 class Phi2Model(Model):
     def set_gguf_parameters(self):
-        block_count = get_key_opts(self.hparams, ["num_hidden_layers", "n_layer"])
-        rot_pct = get_key_opts(self.hparams, ["partial_rotary_factor"])
-        n_embd = get_key_opts(self.hparams, ["hidden_size", "n_embd"])
-        n_head = get_key_opts(self.hparams, ["num_attention_heads", "n_head"])
+        block_count = self.find_hparam(["num_hidden_layers", "n_layer"])
+        rot_pct = self.find_hparam(["partial_rotary_factor"])
+        n_embd = self.find_hparam(["hidden_size", "n_embd"])
+        n_head = self.find_hparam(["num_attention_heads", "n_head"])
         self.gguf_writer.add_name("Phi2")
-        self.gguf_writer.add_context_length(get_key_opts(self.hparams, ["n_positions", "max_position_embeddings"]))
+        self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"]))
         self.gguf_writer.add_embedding_length(n_embd)
         self.gguf_writer.add_feed_forward_length(4 * n_embd)
         self.gguf_writer.add_block_count(block_count)
         self.gguf_writer.add_head_count(n_head)
         self.gguf_writer.add_head_count_kv(n_head)
-        self.gguf_writer.add_layer_norm_eps(get_key_opts(self.hparams, ["layer_norm_epsilon", "layer_norm_eps"]))
+        self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_epsilon", "layer_norm_eps"]))
         self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
         self.gguf_writer.add_file_type(self.ftype)
         self.gguf_writer.add_add_bos_token(False)
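`find_hparam` replaces the old `get_key_opts` helper here, but its definition is outside this excerpt. A plausible minimal sketch, assuming it simply returns the value of the first config key that exists (an assumption, not the actual implementation):

```python
from typing import Any

def find_hparam(hparams: dict[str, Any], keys: list[str], optional: bool = False) -> Any:
    # return the value for the first key present in the config dict
    for key in keys:
        if key in hparams:
            return hparams[key]
    if optional:
        return None
    raise KeyError(f"could not find any of: {keys}")
```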
@@ -1629,6 +1643,206 @@ in chat mode so that the conversation can end normally.")
         self.post_write_tensors(tensor_map, name, data_torch)

+
+class BertModel(Model):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.vocab_size = None
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_causal_attention(False)
+
+        # get pooling path
+        with open(self.dir_model / "modules.json", encoding="utf-8") as f:
+            modules = json.load(f)
+        pooling_path = None
+        for mod in modules:
+            if mod["type"] == "sentence_transformers.models.Pooling":
+                pooling_path = mod["path"]
+                break
+
+        # get pooling type
+        pooling_type = gguf.PoolingType.NONE
+        if pooling_path is not None:
+            with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f:
+                pooling = json.load(f)
+            if pooling["pooling_mode_mean_tokens"]:
+                pooling_type = gguf.PoolingType.MEAN
+            elif pooling["pooling_mode_cls_token"]:
+                pooling_type = gguf.PoolingType.CLS
+            else:
+                raise NotImplementedError("Only MEAN and CLS pooling types supported")
+        self.gguf_writer.add_pooling_type(pooling_type.value)
+
+    def set_vocab(self):
+        path = self.dir_model
+        added_tokens_path = self.dir_model if self.dir_model.exists() else None
+
+        # use huggingface vocab to get all tokens
+        vocab = HfVocab(path, added_tokens_path)
+        tokens, scores, toktypes = zip(*vocab.all_tokens())
+        assert len(tokens) == vocab.vocab_size
+        self.vocab_size = vocab.vocab_size
+
+        # we need this to validate the size of the token_type embeddings
+        # though currently we are passing all zeros to the token_type embeddings
+        n_token_types = len(set(toktypes))
+        self.gguf_writer.add_token_type_count(n_token_types)
+
+        # convert to phantom space vocab
+        def phantom(tok, typ):
+            if tok.startswith(b"[") and tok.endswith(b"]"):
+                return tok
+            if tok.startswith(b"##"):
+                return tok[2:]
+            return b"\xe2\x96\x81" + tok
+        tokens = tuple(phantom(t, y) for t, y in zip(tokens, toktypes))
+
+        # set up bos and eos tokens (cls and sep)
+        self.gguf_writer.add_bos_token_id(vocab.tokenizer.cls_token_id)
+        self.gguf_writer.add_eos_token_id(vocab.tokenizer.sep_token_id)
+
+        # add vocab to gguf
+        self.gguf_writer.add_tokenizer_model("bert")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        # handle special tokens
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def write_tensors(self):
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
+        tensors = dict(self.get_tensors())
+        for name, data_torch in tensors.items():
+            # we are only using BERT for embeddings so we don't need the pooling layer
+            if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"):
+                continue  # we don't need these
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+
+            data = data_torch.squeeze().numpy()
+            n_dims = len(data.shape)
+            new_dtype: type[np.floating[Any]]
+
+            if (
+                self.ftype == 1 and name.endswith(".weight") and n_dims == 2
+                and name != "embeddings.token_type_embeddings.weight"  # not used with get_rows, must be F32
+            ):
+                # if f16 desired, convert any float32 2-dim weight tensors to float16
+                new_dtype = np.float16
+            else:
+                # if f32 desired, convert any float16 to float32
+                new_dtype = np.float32
+
+            print(f"{new_name}, n_dims = {n_dims}, {data_torch.dtype} --> {new_dtype}")
+
+            if data.dtype != new_dtype:
+                data = data.astype(new_dtype)
+
+            self.gguf_writer.add_tensor(new_name, data)
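The pooling type recorded by `set_gguf_parameters` above tells the runtime how to reduce per-token embeddings to a single vector per sequence. A small numpy sketch of the two supported modes (illustrative only, not the llama.cpp implementation):

```python
import numpy as np

def pool(token_embeddings: np.ndarray, pooling: str) -> np.ndarray:
    # token_embeddings has shape (n_tokens, n_embd)
    if pooling == "mean":
        return token_embeddings.mean(axis=0)  # average over all token positions
    if pooling == "cls":
        return token_embeddings[0]            # take the first ([CLS]) token
    raise NotImplementedError("Only MEAN and CLS pooling types supported")

sentence_vec = pool(np.random.rand(7, 384), "mean")  # one 384-dim vector per sequence
```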
+class NomicBertModel(BertModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # the HF config claims n_ctx=8192, but it uses RoPE scaling
+        self.hparams["n_ctx"] = 2048
+
+        # SwigLU activation
+        assert self.hparams["activation_function"] == "swiglu"
+        # this doesn't do anything in the HF version
+        assert self.hparams["causal"] is False
+        # no bias tensors
+        assert self.hparams["qkv_proj_bias"] is False
+        assert self.hparams["mlp_fc1_bias"] is False
+        assert self.hparams["mlp_fc2_bias"] is False
+        # norm at end of layer
+        assert self.hparams["prenorm"] is False
+        # standard RoPE
+        assert self.hparams["rotary_emb_fraction"] == 1.0
+        assert self.hparams["rotary_emb_interleaved"] is False
+        assert self.hparams["rotary_emb_scale_base"] is None
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
+
+    def get_tensors(self):
+        assert self.vocab_size is not None
+        for name, data in super().get_tensors():
+            # Nomic Embed's token embeddings tensor is padded, but llama.cpp wants tensor sizes to match exactly.
+            if name == 'embeddings.word_embeddings.weight' and data.shape[1] != self.vocab_size:
+                rounded_vocab_size = (self.vocab_size + 63) // 64 * 64
+                assert data.shape == (rounded_vocab_size, self.hparams["n_embd"])
+                data = data[:self.vocab_size, :]
+            yield name, data
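The rounding in `get_tensors` un-pads an embedding matrix that was grown to a multiple of 64 rows for hardware efficiency. The same arithmetic in isolation:

```python
def rounded_vocab_size(vocab_size: int, pad_to: int = 64) -> int:
    # round up to the next multiple of pad_to, as in (vocab_size + 63) // 64 * 64
    return (vocab_size + pad_to - 1) // pad_to * pad_to

assert rounded_vocab_size(30522) == 30528  # e.g. a BERT-sized vocab padded to 64 rows
```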
+class GemmaModel(Model):
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+    def set_gguf_parameters(self):
+        hparams = self.hparams
+        block_count = hparams["num_hidden_layers"]
+
+        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
+        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
+        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+        self.gguf_writer.add_key_length(hparams["head_dim"])
+        self.gguf_writer.add_value_length(hparams["head_dim"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def write_tensors(self):
+        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+
+        for name, data_torch in self.get_tensors():
+            # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
+            if name.endswith("norm.weight"):
+                data_torch = data_torch + 1
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            data = data_torch.squeeze().numpy()
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+
+            self.gguf_writer.add_tensor(new_name, data)
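The `data_torch + 1` above follows the referenced Gemma modeling code, which stores norm weights zero-centered and multiplies by `(1 + weight)`; adding one at conversion time lets a plain RMSNorm-times-weight kernel reproduce it. A toy check of that equivalence (a sketch with a hand-rolled RMSNorm, not the actual kernels):

```python
import torch

def rms_norm(x: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)

w = torch.tensor([-0.1, 0.0, 0.2])  # zero-centered weight, as Gemma stores it
x = torch.randn(3)

reference = rms_norm(x) * (1 + w)   # what the reference implementation computes
converted = rms_norm(x) * (w + 1)   # plain RMSNorm with the weight this converter writes
assert torch.allclose(reference, converted)
```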
 ###### CONVERSION LOGIC ######
@@ -88,7 +88,8 @@ def main():
     gguf_writer.add_embedding_length(hidden_size)
     gguf_writer.add_block_count(block_count)
     gguf_writer.add_feed_forward_length(hparams.ffn_hidden_size)
-    gguf_writer.add_rope_dimension_count(hidden_size // head_count)
+    # ref: https://github.com/ggerganov/llama.cpp/pull/4889/commits/eea19039fc52ea2dbd1aab45b59ab4e3e29a3443
+    gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2)
     gguf_writer.add_head_count(head_count)
     gguf_writer.add_head_count_kv(head_count_kv)
     gguf_writer.add_rope_freq_base(hparams.rotary_emb_base)
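The extra `// 2` encodes that, per the referenced fix, rotary embeddings cover only half of each Persimmon attention head. Worked numbers (illustrative values, not read from a config file):

```python
hidden_size = 4096  # placeholder values for illustration
head_count = 64

head_dim = hidden_size // head_count        # 64 dimensions per head
rope_dims = hidden_size // head_count // 2  # 32: RoPE applies to half of each head
```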
@@ -1173,7 +1173,7 @@ def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
             for (name, tensor) in model.items()}

-def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
+def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) -> LazyModel:
     tmap = gguf.TensorNameMap(ARCH, params.n_layer)
     should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
@@ -1199,7 +1199,11 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
     for name, lazy_tensor in model.items():
         tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None)
         if name_new is None:
-            raise Exception(f"Unexpected tensor name: {name}")
+            if skip_unknown:
+                print(f"Unexpected tensor name: {name} - skipping")
+                continue
+            else:
+                raise Exception(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)")

         if tensor_type in should_skip:
             print(f"skipping tensor {name_new}")
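With the flag threaded through (see the argparse change in the next hunk), a conversion that meets extra tensors, such as the vision-tower weights in a LLaVA checkpoint, can skip them instead of aborting. A hypothetical invocation through the script's `main(args_in=...)` entry point, with a placeholder model path:

```python
# Hypothetical call; "models/llava-v1.5-7b" is a placeholder path.
main(["models/llava-v1.5-7b", "--outtype", "f16", "--skip-unknown"])
```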
@@ -1377,19 +1381,20 @@ def main(args_in: list[str] | None = None) -> None:
         output_choices.append("q8_0")
     vocab_types = ["spm", "bpe", "hfft"]
     parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
     parser.add_argument("--awq-path", type=Path, help="Path to scale awq cache file", default=None)
     parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
     parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
     parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
     parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
     parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
     parser.add_argument("--vocab-type", choices=vocab_types, help="The vocabulary format used to define the tokenizer model (default: spm)", default="spm")
     parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
     parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
     parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
     parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default=DEFAULT_CONCURRENCY)
     parser.add_argument("--big-endian", action="store_true", help="model is executed on big endian machine")
     parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
+    parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")

     args = parser.parse_args(args_in)
     if args.awq_path:
@@ -1461,7 +1466,7 @@ def main(args_in: list[str] | None = None) -> None:
         print(f"Special vocab info: {special_vocab}")

     model = model_plus.model
-    model = convert_model_names(model, params)
+    model = convert_model_names(model, params, args.skip_unknown)
     ftype = pick_output_type(model, args.outtype)
     model = convert_to_output_type(model, ftype)
     outfile = args.outfile or default_outfile(model_plus.paths, ftype)
@@ -38,6 +38,7 @@ else()
     add_subdirectory(speculative)
     add_subdirectory(lookahead)
     add_subdirectory(lookup)
+    add_subdirectory(gguf)
     add_subdirectory(train-text-from-scratch)
     add_subdirectory(imatrix)
     if (LLAMA_BUILD_SERVER)
@@ -1533,27 +1533,28 @@ int main(int argc, char ** argv) {
         int n_past = 0;

-        ggml_cgraph gf = {};
+        struct ggml_cgraph * gf = NULL;
+        gf = ggml_new_graph_custom(ctx0, LLAMA_TRAIN_MAX_NODES, true);

         get_example_targets_batch(ctx0, 64*ex+0, tokens_input, targets);

-        struct ggml_tensor * logits = forward_batch(&model, &kv_self, ctx0, &gf, tokens_input, n_tokens, n_past, n_batch);
+        struct ggml_tensor * logits = forward_batch(&model, &kv_self, ctx0, gf, tokens_input, n_tokens, n_past, n_batch);
         // struct ggml_tensor * e = cross_entropy_loss(ctx0, targets, logits);
         struct ggml_tensor * e = square_error_loss(ctx0, targets, logits);

-        ggml_build_forward_expand(&gf, e);
-        ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);
+        ggml_build_forward_expand(gf, e);
+        ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);

         float error_before_opt = ggml_get_f32_1d(e, 0);

-        struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS);
+        struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_TYPE_LBFGS);
         opt_params_lbfgs.print_forward_graph = false;
         opt_params_lbfgs.print_backward_graph = false;
         opt_params_lbfgs.lbfgs.n_iter = 16;
         ggml_opt(ctx0, opt_params_lbfgs, e);
         //
-        ggml_build_forward_expand(&gf, e);
-        ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);
+        ggml_build_forward_expand(gf, e);
+        ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);

         float error_after_opt = ggml_get_f32_1d(e, 0);

@@ -1600,13 +1601,14 @@ int main(int argc, char ** argv) {
         };
         struct ggml_context * ctx0 = ggml_init(params);

-        ggml_cgraph gf = {};
+        struct ggml_cgraph * gf = NULL;
+        gf = ggml_new_graph_custom(ctx0, LLAMA_TRAIN_MAX_NODES, true);

         int n_past = 0;

-        struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past);
+        struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, gf, tokens_input, sample_ctx, n_past);

-        ggml_build_forward_expand(&gf, logits);
-        ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);
+        ggml_build_forward_expand(gf, logits);
+        ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);

         struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
         struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);
@@ -82,7 +82,8 @@ int main(int argc, char ** argv) {

     // init LLM

-    llama_backend_init(params.numa);
+    llama_backend_init();
+    llama_numa_init(params.numa);

     // initialize the model

@@ -158,7 +159,7 @@ int main(int argc, char ** argv) {
     }

     LOG_TEE("\n");
-    LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d, n_threads = %d, n_threads_batch = %d\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq, ctx_params.n_threads, ctx_params.n_threads_batch);
+    LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq, ctx_params.n_threads, ctx_params.n_threads_batch);
     LOG_TEE("\n");

     LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
@@ -17,7 +17,7 @@ let n_parallel: Int = arguments.count > 3 && Int(arguments[3]) != nil ? Int(argu
 let n_len: Int = 32

 // init LLM
-llama_backend_init(false)
+llama_backend_init()
 defer {
     llama_backend_free()
 }
@@ -50,7 +50,8 @@ int main(int argc, char ** argv) {

     // init LLM

-    llama_backend_init(params.numa);
+    llama_backend_init();
+    llama_numa_init(params.numa);

     // initialize the model

@@ -91,7 +92,7 @@ int main(int argc, char ** argv) {
     const int n_ctx = llama_n_ctx(ctx);

-    LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %d, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
+    LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);

     // make sure the KV cache is big enough to hold all the prompt and generated tokens
     if (n_kv_req > n_ctx) {
@@ -119,7 +119,8 @@ int main(int argc, char ** argv)
     // Init LLM :
     //---------------------------------

-    llama_backend_init(params.numa);
+    llama_backend_init();
+    llama_numa_init(params.numa);

     llama_model * model;
     llama_context * ctx;
@@ -325,14 +325,14 @@ struct train_params {
 };

 static void print_params(struct my_llama_hparams * params) {
-    printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
-    printf("%s: n_ctx: %d\n", __func__, params->n_ctx);
-    printf("%s: n_embd: %d\n", __func__, params->n_embd);
-    printf("%s: n_mult: %d\n", __func__, params->n_mult);
-    printf("%s: n_head: %d\n", __func__, params->n_head);
-    printf("%s: n_ff: %d\n", __func__, params->n_ff);
-    printf("%s: n_layer: %d\n", __func__, params->n_layer);
-    printf("%s: n_rot: %d\n", __func__, params->n_rot);
+    printf("%s: n_vocab: %u\n", __func__, params->n_vocab);
+    printf("%s: n_ctx: %u\n", __func__, params->n_ctx);
+    printf("%s: n_embd: %u\n", __func__, params->n_embd);
+    printf("%s: n_mult: %u\n", __func__, params->n_mult);
+    printf("%s: n_head: %u\n", __func__, params->n_head);
+    printf("%s: n_ff: %u\n", __func__, params->n_ff);
+    printf("%s: n_layer: %u\n", __func__, params->n_layer);
+    printf("%s: n_rot: %u\n", __func__, params->n_rot);
 }

 static void init_model(struct my_llama_model * model) {
@@ -350,25 +350,25 @@ static void init_model(struct my_llama_model * model) {
     model->train_tokens = 0;

     model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
-    printf("[%s:GG] Allocating [%d] x [%d] = [%d] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab);
+    printf("[%s:GG] Allocating [%u] x [%u] = [%u] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab);

     model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-    printf("[%s:GG] Allocating [%d] float space for model->norm\n",__func__,n_embd);
+    printf("[%s:GG] Allocating [%u] float space for model->norm\n",__func__,n_embd);

     model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab);

     // printing the per-layer allocations here so we dont print in the for loop.
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wq for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wk for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wv for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wo for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%d] float space for layer.ffn_norm for [%d] layers\n",__func__,n_embd, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w1 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w2 for [%d] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w3 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wq for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wk for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wv for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wo for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%u] float space for layer.ffn_norm for [%u] layers\n",__func__,n_embd, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w1 for [%u] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w2 for [%u] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w3 for [%u] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);

     ggml_set_name(model->tok_embeddings, "tok_embeddings.weight");
     ggml_set_name(model->norm, "norm.weight");
@@ -7,6 +7,51 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

+static std::vector<std::string> split_lines(const std::string & s) {
+    std::string line;
+    std::vector<std::string> lines;
+    std::stringstream ss(s);
+    while (std::getline(ss, line)) {
+        lines.push_back(line);
+    }
+    return lines;
+}
+
+static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
+    for (size_t i = 0; i < tokens.size(); i++) {
+        llama_batch_add(batch, tokens[i], i, { seq_id }, false);
+    }
+}
+
+static void normalize(float * vec, float * out, int n) {
+    float norm = 0;
+    for (int i = 0; i < n; i++) {
+        norm += vec[i] * vec[i];
+    }
+    norm = sqrt(norm);
+    for (int i = 0; i < n; i++) {
+        out[i] = vec[i] / norm;
+    }
+}
+
+static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
+    // clear previous kv_cache values (irrelevant for embeddings)
+    llama_kv_cache_clear(ctx);
+
+    // run model
+    fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
+    if (llama_decode(ctx, batch) < 0) {
+        fprintf(stderr, "%s : failed to decode\n", __func__);
+    }
+
+    // normalize on copy
+    for (int k = 0; k < n_seq; k++) {
+        float * emb = llama_get_embeddings_ith(ctx, k);
+        float * out = output + k * n_embd;
+        normalize(emb, out, n_embd);
+    }
+}
+
 int main(int argc, char ** argv) {
     gpt_params params;
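`normalize` above L2-normalizes each embedding as it is copied out, so cosine similarity between two results reduces to a dot product. The same math as a Python sketch:

```python
import numpy as np

def normalize(vec: np.ndarray) -> np.ndarray:
    return vec / np.sqrt((vec * vec).sum())  # divide by the L2 norm

a = normalize(np.random.rand(384))
b = normalize(np.random.rand(384))
cosine = float(a @ b)  # dot product of unit vectors equals cosine similarity
```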
@@ -29,7 +74,8 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }

-    llama_backend_init(params.numa);
+    llama_backend_init();
+    llama_numa_init(params.numa);

     llama_model * model;
     llama_context * ctx;
@@ -55,49 +101,84 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "%s\n", get_system_info(params).c_str());
     }

-    int n_past = 0;
-
-    // tokenize the prompt
-    auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
+    // split the prompt into lines
+    std::vector<std::string> prompts = split_lines(params.prompt);
+
+    // max batch size
+    const uint64_t n_batch = params.n_batch;
+    GGML_ASSERT(params.n_batch == params.n_ctx);
+
+    // tokenize the prompts and trim
+    std::vector<std::vector<int32_t>> inputs;
+    for (const auto & prompt : prompts) {
+        auto inp = ::llama_tokenize(ctx, prompt, true);
+        if (inp.size() > n_batch) {
+            inp.resize(n_batch);
+        }
+        inputs.push_back(inp);
+    }

+    // tokenization stats
     if (params.verbose_prompt) {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
-        fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
-        for (int i = 0; i < (int) embd_inp.size(); i++) {
-            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
-        }
-        fprintf(stderr, "\n\n");
+        for (int i = 0; i < (int) inputs.size(); i++) {
+            fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
+            fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
+            for (int j = 0; j < (int) inputs[i].size(); j++) {
+                fprintf(stderr, "%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
+            }
+            fprintf(stderr, "\n");
+        }
     }

-    if (embd_inp.size() > (size_t)n_ctx) {
-        fprintf(stderr, "%s: error: prompt is longer than the context window (%zu tokens, n_ctx = %d)\n",
-                __func__, embd_inp.size(), n_ctx);
-        return 1;
-    }
-
-    while (!embd_inp.empty()) {
-        int n_tokens = std::min(params.n_batch, (int) embd_inp.size());
-        if (llama_decode(ctx, llama_batch_get_one(embd_inp.data(), n_tokens, n_past, 0))) {
-            fprintf(stderr, "%s : failed to eval\n", __func__);
-            return 1;
-        }
-        n_past += n_tokens;
-        embd_inp.erase(embd_inp.begin(), embd_inp.begin() + n_tokens);
-    }
+    // initialize batch
+    const int n_prompts = prompts.size();
+    struct llama_batch batch = llama_batch_init(n_batch, 0, n_prompts);

+    // allocate output
     const int n_embd = llama_n_embd(model);
-    const auto * embeddings = llama_get_embeddings(ctx);
-
-    for (int i = 0; i < n_embd; i++) {
-        printf("%f ", embeddings[i]);
-    }
-    printf("\n");
+    std::vector<float> embeddings(n_prompts * n_embd, 0);
+    float * emb = embeddings.data();
+
+    // break into batches
+    int p = 0; // number of prompts processed already
+    int s = 0; // number of prompts in current batch
+    for (int k = 0; k < n_prompts; k++) {
+        // clamp to n_batch tokens
+        auto & inp = inputs[k];
+        const uint64_t n_toks = inp.size();
+
+        // encode if at capacity
+        if (batch.n_tokens + n_toks > n_batch) {
+            float * out = emb + p * n_embd;
+            batch_decode(ctx, batch, out, s, n_embd);
+            llama_batch_clear(batch);
+            p += s;
+            s = 0;
+        }
+
+        // add to batch
+        batch_add_seq(batch, inp, s);
+        s += 1;
+    }
+
+    // final batch
+    float * out = emb + p * n_embd;
+    batch_decode(ctx, batch, out, s, n_embd);
+
+    // print first 3 embeddings
+    for (int j = 0; j < std::min(3, n_prompts); j++) {
+        fprintf(stderr, "embedding %d: ", j);
+        for (int i = 0; i < n_embd; i++) {
+            fprintf(stderr, "%f ", emb[j * n_embd + i]);
+        }
+        fprintf(stderr, "\n\n");
+    }
+    fprintf(stderr, "\n");

+    // clean up
     llama_print_timings(ctx);
     llama_free(ctx);
     llama_free_model(model);
     llama_backend_free();

     return 0;
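The loop above packs prompts greedily: sequences are appended until the next one would push the batch past `n_batch` tokens, then the batch is decoded and the output pointer advances by the number of sequences flushed. The same scheduling logic as a compact Python sketch (conceptual, not the C++ API):

```python
def pack_batches(token_counts: list[int], n_batch: int) -> list[list[int]]:
    # group prompt indices so each group's total token count stays <= n_batch
    batches: list[list[int]] = [[]]
    used = 0
    for k, n_toks in enumerate(token_counts):
        if used + n_toks > n_batch and batches[-1]:
            batches.append([])
            used = 0
        batches[-1].append(k)
        used += n_toks
    return batches

assert pack_batches([100, 200, 300, 250], n_batch=512) == [[0, 1], [2], [3]]
```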
@@ -7,8 +7,6 @@
 #include <string>
 #include <thread>

-static const size_t tensor_alignment = 32;
-
 struct lora_info {
     std::string filename;
     float scale;
@@ -337,24 +335,14 @@ static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int
     params.mem_buffer = NULL;
     params.no_alloc = true;
     struct ggml_context * ctx = NULL;
-    struct ggml_allocr * alloc = NULL;
+    struct ggml_gallocr * alloc = NULL;
     struct ggml_cgraph * gf = NULL;

     ctx = ggml_init(params);
-    alloc = ggml_allocr_new_measure(tensor_alignment);
+    alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type())
     gf = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling);
-    size_t alloc_size = ggml_allocr_alloc_graph(alloc, gf);
-    ggml_allocr_free(alloc);
-    ggml_free(ctx);
-
-    static std::vector<uint8_t> data_compute;
-    data_compute.resize(alloc_size + tensor_alignment);
-
-    ctx = ggml_init(params);
-    alloc = ggml_allocr_new(data_compute.data(), data_compute.size(), tensor_alignment);
-    gf = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling);
-    ggml_allocr_alloc_graph(alloc, gf);
-    ggml_allocr_free(alloc);
+
+    ggml_gallocr_alloc_graph(alloc, gf);

     struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads);

     static std::vector<uint8_t> data_work;
@@ -363,6 +351,7 @@ static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int

     ggml_graph_compute(gf, &cplan);

+    ggml_gallocr_free(alloc);
     ggml_free(ctx);

     return true;
 }
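`build_graph_lora` (defined outside this excerpt) builds the standard low-rank update: the delta added to a base weight is the product of the two small factor matrices, scaled. As a numpy sketch with illustrative shapes (the conventional scaling is alpha / rank, though this tool takes `scaling` as an input):

```python
import numpy as np

n_out, n_in, r = 64, 128, 4        # illustrative dimensions
W = np.random.randn(n_out, n_in)   # base weight
lora_a = np.random.randn(r, n_in)  # low-rank factors
lora_b = np.random.randn(n_out, r)
scaling = 1.0                      # often alpha / r

W_merged = W + scaling * (lora_b @ lora_a)  # the merged tensor an apply step produces
assert W_merged.shape == W.shape
```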
@@ -80,9 +80,9 @@ The LORA rank can be configured for each model tensor type separately with these
 --rank-wk N         LORA rank for wk tensor (default 4)
 --rank-wv N         LORA rank for wv tensor (default 4)
 --rank-wo N         LORA rank for wo tensor (default 4)
---rank-w1 N         LORA rank for w1 tensor (default 4)
---rank-w2 N         LORA rank for w2 tensor (default 4)
---rank-w3 N         LORA rank for w3 tensor (default 4)
+--rank-ffn_gate N   LORA rank for ffn_gate tensor (default 4)
+--rank-ffn_down N   LORA rank for ffn_down tensor (default 4)
+--rank-ffn_up N     LORA rank for ffn_up tensor (default 4)
 ```

 The LORA rank of 'norm' tensors should always be 1.
@@ -1,5 +1,6 @@
 #include "ggml.h"
 #include "ggml-alloc.h"
+#include "ggml-backend.h"
 #include "llama.h"
 #include "common.h"
 #include "train.h"
@@ -13,8 +14,6 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

-static const size_t tensor_alignment = 32;
-
 struct my_llama_hparams {
     uint32_t n_vocab = 32000;
     uint32_t n_ctx = 512;
@@ -61,9 +60,9 @@ struct my_llama_layer {
     struct ggml_tensor * ffn_norm;

     // ff
-    struct ggml_tensor * w1;
-    struct ggml_tensor * w2;
-    struct ggml_tensor * w3;
+    struct ggml_tensor * ffn_gate; // w1
+    struct ggml_tensor * ffn_down; // w2
+    struct ggml_tensor * ffn_up;   // w3
 };

 struct my_llama_model {
@@ -86,9 +85,9 @@ struct my_llama_lora_hparams {
     uint32_t n_rank_wv = 4;
     uint32_t n_rank_wo = 4;
     uint32_t n_rank_ffn_norm = 1;
-    uint32_t n_rank_w1 = 4;
-    uint32_t n_rank_w2 = 4;
-    uint32_t n_rank_w3 = 4;
+    uint32_t n_rank_ffn_gate = 4;
+    uint32_t n_rank_ffn_down = 4;
+    uint32_t n_rank_ffn_up = 4;
     uint32_t n_rank_tok_embeddings = 4;
     uint32_t n_rank_norm = 1;
     uint32_t n_rank_output = 4;
@@ -118,17 +117,17 @@ struct my_llama_lora_layer {
     struct ggml_tensor * ffn_norm_b;

     // ff
-    struct ggml_tensor * w1_a;
-    struct ggml_tensor * w1_b;
-    struct ggml_tensor * w2_a;
-    struct ggml_tensor * w2_b;
-    struct ggml_tensor * w3_a;
-    struct ggml_tensor * w3_b;
+    struct ggml_tensor * ffn_gate_a;
+    struct ggml_tensor * ffn_gate_b;
+    struct ggml_tensor * ffn_down_a;
+    struct ggml_tensor * ffn_down_b;
+    struct ggml_tensor * ffn_up_a;
+    struct ggml_tensor * ffn_up_b;
 };

 struct my_llama_lora {
     struct ggml_context * ctx = NULL;
-    std::vector<uint8_t> data;
+    ggml_backend_buffer_t data;

     my_llama_lora_hparams hparams;
@@ -209,9 +208,9 @@ static void print_lora_params(struct my_llama_lora_hparams * params) {
     printf("%s: n_rank_wv : %u\n", __func__, params->n_rank_wv);
     printf("%s: n_rank_wo : %u\n", __func__, params->n_rank_wo);
     printf("%s: n_rank_ffn_norm : %u\n", __func__, params->n_rank_ffn_norm);
-    printf("%s: n_rank_w1 : %u\n", __func__, params->n_rank_w1);
-    printf("%s: n_rank_w2 : %u\n", __func__, params->n_rank_w2);
-    printf("%s: n_rank_w3 : %u\n", __func__, params->n_rank_w3);
+    printf("%s: n_rank_ffn_gate : %u\n", __func__, params->n_rank_ffn_gate);
+    printf("%s: n_rank_ffn_down : %u\n", __func__, params->n_rank_ffn_down);
+    printf("%s: n_rank_ffn_up : %u\n", __func__, params->n_rank_ffn_up);
     printf("%s: n_rank_tok_embeddings : %u\n", __func__, params->n_rank_tok_embeddings);
     printf("%s: n_rank_norm : %u\n", __func__, params->n_rank_norm);
     printf("%s: n_rank_output : %u\n", __func__, params->n_rank_output);
@@ -320,9 +319,9 @@ static void init_model(struct llama_model * input, struct my_llama_model * model
         layer.wv = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_V, i));
         layer.wo = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_OUT, i));
         layer.ffn_norm = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_NORM, i));
-        layer.w1 = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_GATE, i));
-        layer.w2 = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_DOWN, i));
-        layer.w3 = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_UP, i));
+        layer.ffn_gate = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_GATE, i));
+        layer.ffn_down = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_DOWN, i));
+        layer.ffn_up = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_UP, i));

         assert_shape_1d(layer.attention_norm, hparams.n_embd);
         assert_shape_2d(layer.wq, hparams.n_embd, hparams.n_embd);
@@ -330,9 +329,9 @@ static void init_model(struct llama_model * input, struct my_llama_model * model
         assert_shape_2d(layer.wv, hparams.n_embd, hparams.n_embd_gqa());
         assert_shape_2d(layer.wo, hparams.n_embd, hparams.n_embd);
         assert_shape_1d(layer.ffn_norm, hparams.n_embd);
-        assert_shape_2d(layer.w1, hparams.n_embd, hparams.n_ff);
-        assert_shape_2d(layer.w2, hparams.n_ff, hparams.n_embd);
-        assert_shape_2d(layer.w3, hparams.n_embd, hparams.n_ff);
+        assert_shape_2d(layer.ffn_gate, hparams.n_embd, hparams.n_ff);
+        assert_shape_2d(layer.ffn_down, hparams.n_ff, hparams.n_embd);
+        assert_shape_2d(layer.ffn_up, hparams.n_embd, hparams.n_ff);
     }
 }
@@ -363,69 +362,12 @@ static void set_param_lora(struct my_llama_lora * lora) {
         ggml_set_param(ctx, layer.wo_b);
         ggml_set_param(ctx, layer.ffn_norm_a);
         ggml_set_param(ctx, layer.ffn_norm_b);
-        ggml_set_param(ctx, layer.w1_a);
-        ggml_set_param(ctx, layer.w1_b);
-        ggml_set_param(ctx, layer.w2_a);
-        ggml_set_param(ctx, layer.w2_b);
-        ggml_set_param(ctx, layer.w3_a);
-        ggml_set_param(ctx, layer.w3_b);
-    }
-}
-
-static void alloc_lora(struct ggml_allocr * alloc, struct my_llama_lora * lora) {
-    ggml_allocr_alloc(alloc, lora->tok_embeddings_a);
-    ggml_allocr_alloc(alloc, lora->tok_embeddings_b);
-    ggml_allocr_alloc(alloc, lora->norm_a);
-    ggml_allocr_alloc(alloc, lora->norm_b);
-    ggml_allocr_alloc(alloc, lora->output_a);
-    ggml_allocr_alloc(alloc, lora->output_b);
-    for (uint32_t i = 0; i < lora->layers.size(); ++i) {
-        auto & layer = lora->layers[i];
-        ggml_allocr_alloc(alloc, layer.attention_norm_a);
-        ggml_allocr_alloc(alloc, layer.attention_norm_b);
-        ggml_allocr_alloc(alloc, layer.wq_a);
-        ggml_allocr_alloc(alloc, layer.wq_b);
-        ggml_allocr_alloc(alloc, layer.wk_a);
-        ggml_allocr_alloc(alloc, layer.wk_b);
-        ggml_allocr_alloc(alloc, layer.wv_a);
-        ggml_allocr_alloc(alloc, layer.wv_b);
-        ggml_allocr_alloc(alloc, layer.wo_a);
-        ggml_allocr_alloc(alloc, layer.wo_b);
-        ggml_allocr_alloc(alloc, layer.ffn_norm_a);
-        ggml_allocr_alloc(alloc, layer.ffn_norm_b);
-        ggml_allocr_alloc(alloc, layer.w1_a);
-        ggml_allocr_alloc(alloc, layer.w1_b);
-        ggml_allocr_alloc(alloc, layer.w2_a);
-        ggml_allocr_alloc(alloc, layer.w2_b);
-        ggml_allocr_alloc(alloc, layer.w3_a);
-        ggml_allocr_alloc(alloc, layer.w3_b);
-    }
-    ggml_allocr_alloc(alloc, lora->tok_embeddings_a->grad);
-    ggml_allocr_alloc(alloc, lora->tok_embeddings_b->grad);
-    ggml_allocr_alloc(alloc, lora->norm_a->grad);
-    ggml_allocr_alloc(alloc, lora->norm_b->grad);
-    ggml_allocr_alloc(alloc, lora->output_a->grad);
-    ggml_allocr_alloc(alloc, lora->output_b->grad);
-    for (uint32_t i = 0; i < lora->layers.size(); ++i) {
-        auto & layer = lora->layers[i];
-        ggml_allocr_alloc(alloc, layer.attention_norm_a->grad);
-        ggml_allocr_alloc(alloc, layer.attention_norm_b->grad);
-        ggml_allocr_alloc(alloc, layer.wq_a->grad);
-        ggml_allocr_alloc(alloc, layer.wq_b->grad);
-        ggml_allocr_alloc(alloc, layer.wk_a->grad);
-        ggml_allocr_alloc(alloc, layer.wk_b->grad);
-        ggml_allocr_alloc(alloc, layer.wv_a->grad);
-        ggml_allocr_alloc(alloc, layer.wv_b->grad);
-        ggml_allocr_alloc(alloc, layer.wo_a->grad);
-        ggml_allocr_alloc(alloc, layer.wo_b->grad);
-        ggml_allocr_alloc(alloc, layer.ffn_norm_a->grad);
-        ggml_allocr_alloc(alloc, layer.ffn_norm_b->grad);
-        ggml_allocr_alloc(alloc, layer.w1_a->grad);
-        ggml_allocr_alloc(alloc, layer.w1_b->grad);
-        ggml_allocr_alloc(alloc, layer.w2_a->grad);
-        ggml_allocr_alloc(alloc, layer.w2_b->grad);
-        ggml_allocr_alloc(alloc, layer.w3_a->grad);
-        ggml_allocr_alloc(alloc, layer.w3_b->grad);
+        ggml_set_param(ctx, layer.ffn_gate_a);
+        ggml_set_param(ctx, layer.ffn_gate_b);
+        ggml_set_param(ctx, layer.ffn_down_a);
+        ggml_set_param(ctx, layer.ffn_down_b);
+        ggml_set_param(ctx, layer.ffn_up_a);
+        ggml_set_param(ctx, layer.ffn_up_b);
     }
 }
@@ -493,12 +435,12 @@ static void init_lora(const struct my_llama_model * model, struct my_llama_lora
         layer.ffn_norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, n_embd);
         layer.ffn_norm_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, 1);
-        layer.w1_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w1, n_embd);
-        layer.w1_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w1, n_ff);
-        layer.w2_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w2, n_ff);
-        layer.w2_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w2, n_embd);
-        layer.w3_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w3, n_embd);
-        layer.w3_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w3, n_ff);
+        layer.ffn_gate_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_gate, n_embd);
+        layer.ffn_gate_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_gate, n_ff);
+        layer.ffn_down_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_down, n_ff);
+        layer.ffn_down_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_down, n_embd);
+        layer.ffn_up_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_up, n_embd);
+        layer.ffn_up_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_up, n_ff);

         ggml_set_name(layer.attention_norm_a, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_a", i));
         ggml_set_name(layer.attention_norm_b, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_b", i));
@@ -512,28 +454,18 @@ static void init_lora(const struct my_llama_model * model, struct my_llama_lora
         ggml_set_name(layer.wo_b, tni(LLM_TENSOR_ATTN_OUT, ".weight.lora_b", i));
         ggml_set_name(layer.ffn_norm_a, tni(LLM_TENSOR_FFN_NORM, ".weight.lora_a", i));
         ggml_set_name(layer.ffn_norm_b, tni(LLM_TENSOR_FFN_NORM, ".weight.lora_b", i));
-        ggml_set_name(layer.w1_a, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_a", i));
-        ggml_set_name(layer.w1_b, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_b", i));
-        ggml_set_name(layer.w2_a, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_a", i));
-        ggml_set_name(layer.w2_b, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_b", i));
-        ggml_set_name(layer.w3_a, tni(LLM_TENSOR_FFN_UP, ".weight.lora_a", i));
-        ggml_set_name(layer.w3_b, tni(LLM_TENSOR_FFN_UP, ".weight.lora_b", i));
+        ggml_set_name(layer.ffn_gate_a, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_a", i));
+        ggml_set_name(layer.ffn_gate_b, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_b", i));
+        ggml_set_name(layer.ffn_down_a, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_a", i));
+        ggml_set_name(layer.ffn_down_b, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_b", i));
+        ggml_set_name(layer.ffn_up_a, tni(LLM_TENSOR_FFN_UP, ".weight.lora_a", i));
+        ggml_set_name(layer.ffn_up_b, tni(LLM_TENSOR_FFN_UP, ".weight.lora_b", i));
     }

     set_param_lora(lora);

-    // measure data size
-    size_t size = 0;
-    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-        size += GGML_PAD(ggml_nbytes(t), tensor_alignment);
-    }
-
-    // allocate data
-    struct ggml_allocr * alloc = NULL;
-    lora->data.resize(size + tensor_alignment);
-    alloc = ggml_allocr_new(lora->data.data(), lora->data.size(), tensor_alignment);
-    alloc_lora(alloc, lora);
-    ggml_allocr_free(alloc);
+    // allocate data for lora tensors
+    lora->data = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());
 }

 static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, float std, float min, float max) {
@@ -565,12 +497,12 @@ static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, fl
         randomize_tensor_normal(layer.ffn_norm_a, rnd);
         ggml_set_zero(layer.ffn_norm_b);
-        randomize_tensor_normal(layer.w1_a, rnd);
-        ggml_set_zero(layer.w1_b);
-        randomize_tensor_normal(layer.w2_a, rnd);
-        ggml_set_zero(layer.w2_b);
-        randomize_tensor_normal(layer.w3_a, rnd);
-        ggml_set_zero(layer.w3_b);
+        randomize_tensor_normal(layer.ffn_gate_a, rnd);
+        ggml_set_zero(layer.ffn_gate_b);
+        randomize_tensor_normal(layer.ffn_down_a, rnd);
+        ggml_set_zero(layer.ffn_down_b);
+        randomize_tensor_normal(layer.ffn_up_a, rnd);
+        ggml_set_zero(layer.ffn_up_b);
     }
     free_random_normal_distribution(rnd);
@@ -579,7 +511,7 @@ static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, fl
 static struct ggml_tensor * llama_build_lora_finetune_graphs(
         struct my_llama_model * model,
         struct my_llama_lora * lora,
-        struct ggml_allocr * alloc,
+        ggml_gallocr_t alloc,
         struct ggml_context * ctx,
         struct ggml_cgraph * gf,
         struct ggml_cgraph * gb,
@@ -590,7 +522,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
         const int n_tokens,
         const int n_batch,
        const bool enable_flash_attn,
-        const bool enable_checkpointing) {
+        const bool enable_checkpointing,
+        const bool measure_only) {

     ggml_set_scratch(ctx, { 0, 0, nullptr, });
     const int n_past = 0;
@@ -622,13 +555,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
     // KQ_pos - contains the positions
     struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
-    ggml_allocr_alloc(alloc, KQ_pos);
-    if (!ggml_allocr_is_measure(alloc)) {
-        int * data = (int *) KQ_pos->data;
-        for (int i = 0; i < N; ++i) {
-            data[i] = n_past + i;
-        }
-    }
+    ggml_set_input(KQ_pos);

     // rope has so much parameters that we make a custom function for it
     auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale]
@@ -683,13 +610,13 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
         struct ggml_tensor * attention_norm = add_to_f32(ctx, layer.attention_norm, ggml_mul_mat(ctx, llayer.attention_norm_a, llayer.attention_norm_b));
         struct ggml_tensor * ffn_norm = add_to_f32(ctx, layer.ffn_norm, ggml_mul_mat(ctx, llayer.ffn_norm_a, llayer.ffn_norm_b));
         struct ggml_tensor * wq = add_to_f32(ctx, layer.wq, ggml_mul_mat(ctx, llayer.wq_a, llayer.wq_b));
         struct ggml_tensor * wk = add_to_f32(ctx, layer.wk, ggml_mul_mat(ctx, llayer.wk_a, llayer.wk_b));
         struct ggml_tensor * wv = add_to_f32(ctx, layer.wv, ggml_mul_mat(ctx, llayer.wv_a, llayer.wv_b));
         struct ggml_tensor * wo = add_to_f32(ctx, layer.wo, ggml_mul_mat(ctx, llayer.wo_a, llayer.wo_b));
-        struct ggml_tensor * w1 = add_to_f32(ctx, layer.w1, ggml_mul_mat(ctx, llayer.w1_a, llayer.w1_b));
-        struct ggml_tensor * w2 = add_to_f32(ctx, layer.w2, ggml_mul_mat(ctx, llayer.w2_a, llayer.w2_b));
-        struct ggml_tensor * w3 = add_to_f32(ctx, layer.w3, ggml_mul_mat(ctx, llayer.w3_a, llayer.w3_b));
+        struct ggml_tensor * ffn_gate = add_to_f32(ctx, layer.ffn_gate, ggml_mul_mat(ctx, llayer.ffn_gate_a, llayer.ffn_gate_b));
+        struct ggml_tensor * ffn_down = add_to_f32(ctx, layer.ffn_down, ggml_mul_mat(ctx, llayer.ffn_down_a, llayer.ffn_down_b));
+        struct ggml_tensor * ffn_up = add_to_f32(ctx, layer.ffn_up, ggml_mul_mat(ctx, llayer.ffn_up_a, llayer.ffn_up_b));

         struct ggml_tensor * t02 = ggml_rms_norm (ctx, cur, rms_norm_eps); set_name(t02, "t02"); assert_shape_2d(t02, n_embd, N*n_batch);
         struct ggml_tensor * t03 = ggml_repeat (ctx, attention_norm, t02); set_name(t03, "t03"); assert_shape_2d(t03, n_embd, N*n_batch);
@@ -732,11 +659,11 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
         struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, rms_norm_eps); set_name(t22, "t22"); assert_shape_2d(t22, n_embd, N*n_batch);
         struct ggml_tensor * t23 = ggml_repeat (ctx, ffn_norm, t22); set_name(t23, "t23"); assert_shape_2d(t23, n_embd, N*n_batch);
         struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); set_name(t24, "t24"); assert_shape_2d(t24, n_embd, N*n_batch);
-        struct ggml_tensor * t25 = ggml_mul_mat (ctx, w3, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch);
-        struct ggml_tensor * t26 = ggml_mul_mat (ctx, w1, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch);
+        struct ggml_tensor * t25 = ggml_mul_mat (ctx, ffn_up, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch);
+        struct ggml_tensor * t26 = ggml_mul_mat (ctx, ffn_gate, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch);
         struct ggml_tensor * t27 = ggml_silu (ctx, t26); set_name(t27, "t27"); assert_shape_2d(t27, n_ff, N*n_batch);
         struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); set_name(t28, "t28"); assert_shape_2d(t28, n_ff, N*n_batch);
-        struct ggml_tensor * t29 = ggml_mul_mat (ctx, w2, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch);
+        struct ggml_tensor * t29 = ggml_mul_mat (ctx, ffn_down, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch);
         struct ggml_tensor * t30 = ggml_add (ctx, t29, t21); set_name(t30, "t30"); assert_shape_2d(t30, n_embd, N*n_batch);
         cur = t30;
         if (enable_checkpointing) {
@@ -780,7 +707,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
         // input gradient
         ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, 1.0f));
         GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL);
-        ggml_allocr_alloc(alloc, t36->grad);
+        ggml_set_input(t36->grad);
         // KQ_pos
         ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, 1.0f));
@ -796,20 +723,32 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wk, 1.0f)); ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wk, 1.0f));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, 1.0f)); ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, 1.0f));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, 1.0f)); ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, 1.0f));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w1, 1.0f)); ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_gate, 1.0f));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w2, 1.0f)); ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_down, 1.0f));
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w3, 1.0f)); ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_up, 1.0f));
} }
// allocating checkpoints in one block to reduce memory fragmentation // allocating checkpoints in one block to reduce memory fragmentation
// note: they will be freed in reverse order // note: they will be freed in reverse order
for (unsigned int i = 0; i < checkpoints.size(); ++i) { for (unsigned int i = 0; i < checkpoints.size(); ++i) {
if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) { if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) {
ggml_allocr_alloc(alloc, checkpoints[i]); ggml_set_input(checkpoints[i]);
} }
} }
ggml_allocr_alloc_graph(alloc, gb); if (measure_only) {
ggml_gallocr_reserve(alloc, gb);
} else {
ggml_gallocr_alloc_graph(alloc, gb);
// set KQ_pos
{
int * data = (int *) KQ_pos->data;
for (int i = 0; i < N; ++i) {
data[i] = n_past + i;
}
}
}
// remove the additional nodes and leafs // remove the additional nodes and leafs
for (int i = n_leafs_before; i < gb->n_leafs; ++i) { for (int i = n_leafs_before; i < gb->n_leafs; ++i) {
@ -859,9 +798,9 @@ static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context
GGUF_GET_KEY(fctx, lora->hparams.n_rank_wv, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_V); GGUF_GET_KEY(fctx, lora->hparams.n_rank_wv, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_V);
GGUF_GET_KEY(fctx, lora->hparams.n_rank_wo, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT); GGUF_GET_KEY(fctx, lora->hparams.n_rank_wo, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT);
GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_norm, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_NORM); GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_norm, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_NORM);
GGUF_GET_KEY(fctx, lora->hparams.n_rank_w1, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_GATE); GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_gate, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_GATE);
GGUF_GET_KEY(fctx, lora->hparams.n_rank_w2, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN); GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_down, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN);
GGUF_GET_KEY(fctx, lora->hparams.n_rank_w3, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_UP); GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_up, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_UP);
init_lora(model, lora); init_lora(model, lora);
@ -886,12 +825,12 @@ static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context
copy_tensor_by_name(layer.wo_b, f_ggml_ctx, ggml_get_name(layer.wo_b)); copy_tensor_by_name(layer.wo_b, f_ggml_ctx, ggml_get_name(layer.wo_b));
copy_tensor_by_name(layer.ffn_norm_a, f_ggml_ctx, ggml_get_name(layer.ffn_norm_a)); copy_tensor_by_name(layer.ffn_norm_a, f_ggml_ctx, ggml_get_name(layer.ffn_norm_a));
copy_tensor_by_name(layer.ffn_norm_b, f_ggml_ctx, ggml_get_name(layer.ffn_norm_b)); copy_tensor_by_name(layer.ffn_norm_b, f_ggml_ctx, ggml_get_name(layer.ffn_norm_b));
copy_tensor_by_name(layer.w1_a, f_ggml_ctx, ggml_get_name(layer.w1_a)); copy_tensor_by_name(layer.ffn_gate_a, f_ggml_ctx, ggml_get_name(layer.ffn_gate_a));
copy_tensor_by_name(layer.w1_b, f_ggml_ctx, ggml_get_name(layer.w1_b)); copy_tensor_by_name(layer.ffn_gate_b, f_ggml_ctx, ggml_get_name(layer.ffn_gate_b));
copy_tensor_by_name(layer.w2_a, f_ggml_ctx, ggml_get_name(layer.w2_a)); copy_tensor_by_name(layer.ffn_down_a, f_ggml_ctx, ggml_get_name(layer.ffn_down_a));
copy_tensor_by_name(layer.w2_b, f_ggml_ctx, ggml_get_name(layer.w2_b)); copy_tensor_by_name(layer.ffn_down_b, f_ggml_ctx, ggml_get_name(layer.ffn_down_b));
copy_tensor_by_name(layer.w3_a, f_ggml_ctx, ggml_get_name(layer.w3_a)); copy_tensor_by_name(layer.ffn_up_a, f_ggml_ctx, ggml_get_name(layer.ffn_up_a));
copy_tensor_by_name(layer.w3_b, f_ggml_ctx, ggml_get_name(layer.w3_b)); copy_tensor_by_name(layer.ffn_up_b, f_ggml_ctx, ggml_get_name(layer.ffn_up_b));
} }
} }
@ -929,9 +868,9 @@ static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_mod
gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_V, lora->hparams.n_rank_wv); gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_V, lora->hparams.n_rank_wv);
gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT, lora->hparams.n_rank_wo); gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT, lora->hparams.n_rank_wo);
gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_NORM, lora->hparams.n_rank_ffn_norm); gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_NORM, lora->hparams.n_rank_ffn_norm);
gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_GATE, lora->hparams.n_rank_w1); gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_GATE, lora->hparams.n_rank_ffn_gate);
gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN, lora->hparams.n_rank_w2); gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN, lora->hparams.n_rank_ffn_down);
gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_UP, lora->hparams.n_rank_w3); gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_UP, lora->hparams.n_rank_ffn_up);
gguf_add_tensor(fctx, lora->tok_embeddings_a); gguf_add_tensor(fctx, lora->tok_embeddings_a);
gguf_add_tensor(fctx, lora->tok_embeddings_b); gguf_add_tensor(fctx, lora->tok_embeddings_b);
@ -955,12 +894,12 @@ static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_mod
gguf_add_tensor(fctx, layer.wo_b); gguf_add_tensor(fctx, layer.wo_b);
gguf_add_tensor(fctx, layer.ffn_norm_a); gguf_add_tensor(fctx, layer.ffn_norm_a);
gguf_add_tensor(fctx, layer.ffn_norm_b); gguf_add_tensor(fctx, layer.ffn_norm_b);
gguf_add_tensor(fctx, layer.w1_a); gguf_add_tensor(fctx, layer.ffn_gate_a);
gguf_add_tensor(fctx, layer.w1_b); gguf_add_tensor(fctx, layer.ffn_gate_b);
gguf_add_tensor(fctx, layer.w2_a); gguf_add_tensor(fctx, layer.ffn_down_a);
gguf_add_tensor(fctx, layer.w2_b); gguf_add_tensor(fctx, layer.ffn_down_b);
gguf_add_tensor(fctx, layer.w3_a); gguf_add_tensor(fctx, layer.ffn_up_a);
gguf_add_tensor(fctx, layer.w3_b); gguf_add_tensor(fctx, layer.ffn_up_b);
} }
} }
@ -1165,12 +1104,12 @@ static void save_as_llama_lora(const char * filename, struct my_llama_lora * lor
write_tensor(&file, layer.wo_b, tni(LLM_TENSOR_ATTN_OUT, i, ".weight.loraB")); write_tensor(&file, layer.wo_b, tni(LLM_TENSOR_ATTN_OUT, i, ".weight.loraB"));
write_tensor(&file, layer.ffn_norm_a, tni(LLM_TENSOR_FFN_NORM, i, ".weight.loraA")); write_tensor(&file, layer.ffn_norm_a, tni(LLM_TENSOR_FFN_NORM, i, ".weight.loraA"));
write_tensor(&file, layer.ffn_norm_b, tni(LLM_TENSOR_FFN_NORM, i, ".weight.loraB")); write_tensor(&file, layer.ffn_norm_b, tni(LLM_TENSOR_FFN_NORM, i, ".weight.loraB"));
write_tensor(&file, layer.w1_a, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraA")); write_tensor(&file, layer.ffn_gate_a, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraA"));
write_tensor(&file, layer.w1_b, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraB")); write_tensor(&file, layer.ffn_gate_b, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraB"));
write_tensor(&file, layer.w2_a, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraA")); write_tensor(&file, layer.ffn_down_a, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraA"));
write_tensor(&file, layer.w2_b, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraB")); write_tensor(&file, layer.ffn_down_b, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraB"));
write_tensor(&file, layer.w3_a, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraA")); write_tensor(&file, layer.ffn_up_a, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraA"));
write_tensor(&file, layer.w3_b, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraB")); write_tensor(&file, layer.ffn_up_b, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraB"));
} }
} }
@ -1200,9 +1139,9 @@ struct train_params {
uint32_t n_rank_wv; uint32_t n_rank_wv;
uint32_t n_rank_wo; uint32_t n_rank_wo;
uint32_t n_rank_ffn_norm; uint32_t n_rank_ffn_norm;
uint32_t n_rank_w1; uint32_t n_rank_ffn_gate;
uint32_t n_rank_w2; uint32_t n_rank_ffn_down;
uint32_t n_rank_w3; uint32_t n_rank_ffn_up;
uint32_t n_rank_tok_embeddings; uint32_t n_rank_tok_embeddings;
uint32_t n_rank_norm; uint32_t n_rank_norm;
uint32_t n_rank_output; uint32_t n_rank_output;
@ -1213,9 +1152,9 @@ struct train_params {
bool custom_n_rank_wv; bool custom_n_rank_wv;
bool custom_n_rank_wo; bool custom_n_rank_wo;
bool custom_n_rank_ffn_norm; bool custom_n_rank_ffn_norm;
bool custom_n_rank_w1; bool custom_n_rank_ffn_gate;
bool custom_n_rank_w2; bool custom_n_rank_ffn_down;
bool custom_n_rank_w3; bool custom_n_rank_ffn_up;
bool custom_n_rank_tok_embeddings; bool custom_n_rank_tok_embeddings;
bool custom_n_rank_norm; bool custom_n_rank_norm;
bool custom_n_rank_output; bool custom_n_rank_output;
@ -1247,9 +1186,9 @@ static struct train_params get_default_train_params() {
params.n_rank_wv = 4; params.n_rank_wv = 4;
params.n_rank_wo = 4; params.n_rank_wo = 4;
params.n_rank_ffn_norm = 1; params.n_rank_ffn_norm = 1;
params.n_rank_w1 = 4; params.n_rank_ffn_gate = 4;
params.n_rank_w2 = 4; params.n_rank_ffn_down = 4;
params.n_rank_w3 = 4; params.n_rank_ffn_up = 4;
params.n_rank_tok_embeddings = 4; params.n_rank_tok_embeddings = 4;
params.n_rank_norm = 1; params.n_rank_norm = 1;
params.n_rank_output = 4; params.n_rank_output = 4;
@ -1260,9 +1199,9 @@ static struct train_params get_default_train_params() {
params.custom_n_rank_wv = false; params.custom_n_rank_wv = false;
params.custom_n_rank_wo = false; params.custom_n_rank_wo = false;
params.custom_n_rank_ffn_norm = false; params.custom_n_rank_ffn_norm = false;
params.custom_n_rank_w1 = false; params.custom_n_rank_ffn_gate = false;
params.custom_n_rank_w2 = false; params.custom_n_rank_ffn_down = false;
params.custom_n_rank_w3 = false; params.custom_n_rank_ffn_up = false;
params.custom_n_rank_tok_embeddings = false; params.custom_n_rank_tok_embeddings = false;
params.custom_n_rank_norm = false; params.custom_n_rank_norm = false;
params.custom_n_rank_output = false; params.custom_n_rank_output = false;
@ -1293,9 +1232,9 @@ static void train_print_usage(int argc, char ** argv, const struct train_params
fprintf(stderr, " --rank-wk N LORA rank for wk tensor, overrides default rank.\n"); fprintf(stderr, " --rank-wk N LORA rank for wk tensor, overrides default rank.\n");
fprintf(stderr, " --rank-wv N LORA rank for wv tensor, overrides default rank.\n"); fprintf(stderr, " --rank-wv N LORA rank for wv tensor, overrides default rank.\n");
fprintf(stderr, " --rank-wo N LORA rank for wo tensor, overrides default rank.\n"); fprintf(stderr, " --rank-wo N LORA rank for wo tensor, overrides default rank.\n");
fprintf(stderr, " --rank-w1 N LORA rank for w1 tensor, overrides default rank.\n"); fprintf(stderr, " --rank-ffn_gate N LORA rank for ffn_gate tensor, overrides default rank.\n");
fprintf(stderr, " --rank-w2 N LORA rank for w2 tensor, overrides default rank.\n"); fprintf(stderr, " --rank-ffn_down N LORA rank for ffn_down tensor, overrides default rank.\n");
fprintf(stderr, " --rank-w3 N LORA rank for w3 tensor, overrides default rank.\n"); fprintf(stderr, " --rank-ffn_up N LORA rank for ffn_up tensor, overrides default rank.\n");
print_common_train_usage(argc, argv, &params->common); print_common_train_usage(argc, argv, &params->common);
} }
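A usage sketch under the renamed options (the invocation shape is assumed from the finetune example's usual conventions; only the `--rank-ffn_*` flag names come from this change):

```sh
# hypothetical invocation; only the --rank-ffn_* flag names are from this diff
./finetune \
    --model-base open-llama-3b-v2-q8_0.gguf \
    --train-data shakespeare.txt \
    --lora-r 4 \
    --rank-ffn_gate 8 --rank-ffn_down 8 --rank-ffn_up 8
```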
@ -1430,27 +1369,27 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par
} }
params->n_rank_wo = std::stoi(argv[i]); params->n_rank_wo = std::stoi(argv[i]);
params->custom_n_rank_wo = true; params->custom_n_rank_wo = true;
} else if (arg == "--rank-w1") { } else if (arg == "--rank-ffn_gate") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
break; break;
} }
params->n_rank_w1 = std::stoi(argv[i]); params->n_rank_ffn_gate = std::stoi(argv[i]);
params->custom_n_rank_w1 = true; params->custom_n_rank_ffn_gate = true;
} else if (arg == "--rank-w2") { } else if (arg == "--rank-ffn_down") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
break; break;
} }
params->n_rank_w2 = std::stoi(argv[i]); params->n_rank_ffn_down = std::stoi(argv[i]);
params->custom_n_rank_w2 = true; params->custom_n_rank_ffn_down = true;
} else if (arg == "--rank-w3") { } else if (arg == "--rank-ffn_up") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
break; break;
} }
params->n_rank_w3 = std::stoi(argv[i]); params->n_rank_ffn_up = std::stoi(argv[i]);
params->custom_n_rank_w3 = true; params->custom_n_rank_ffn_up = true;
} else { } else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
train_print_usage(argc, argv, &default_params); train_print_usage(argc, argv, &default_params);
@ -1513,12 +1452,12 @@ static int64_t get_parameter_count(struct my_llama_lora* lora) {
nx += ggml_nelements(layer.wo_b); nx += ggml_nelements(layer.wo_b);
nx += ggml_nelements(layer.ffn_norm_a); nx += ggml_nelements(layer.ffn_norm_a);
nx += ggml_nelements(layer.ffn_norm_b); nx += ggml_nelements(layer.ffn_norm_b);
nx += ggml_nelements(layer.w1_a); nx += ggml_nelements(layer.ffn_gate_a);
nx += ggml_nelements(layer.w1_b); nx += ggml_nelements(layer.ffn_gate_b);
nx += ggml_nelements(layer.w2_a); nx += ggml_nelements(layer.ffn_down_a);
nx += ggml_nelements(layer.w2_b); nx += ggml_nelements(layer.ffn_down_b);
nx += ggml_nelements(layer.w3_a); nx += ggml_nelements(layer.ffn_up_a);
nx += ggml_nelements(layer.w3_b); nx += ggml_nelements(layer.ffn_up_b);
} }
return nx; return nx;
} }
@ -1572,9 +1511,9 @@ int main(int argc, char ** argv) {
uint32_t n_rank_wv = params.custom_n_rank_wv ? params.n_rank_wv : params.lora_r; uint32_t n_rank_wv = params.custom_n_rank_wv ? params.n_rank_wv : params.lora_r;
uint32_t n_rank_wo = params.custom_n_rank_wo ? params.n_rank_wo : params.lora_r; uint32_t n_rank_wo = params.custom_n_rank_wo ? params.n_rank_wo : params.lora_r;
uint32_t n_rank_ffn_norm = params.custom_n_rank_ffn_norm ? params.n_rank_ffn_norm : 1; uint32_t n_rank_ffn_norm = params.custom_n_rank_ffn_norm ? params.n_rank_ffn_norm : 1;
uint32_t n_rank_w1 = params.custom_n_rank_w1 ? params.n_rank_w1 : params.lora_r; uint32_t n_rank_ffn_gate = params.custom_n_rank_ffn_gate ? params.n_rank_ffn_gate : params.lora_r;
uint32_t n_rank_w2 = params.custom_n_rank_w2 ? params.n_rank_w2 : params.lora_r; uint32_t n_rank_ffn_down = params.custom_n_rank_ffn_down ? params.n_rank_ffn_down : params.lora_r;
uint32_t n_rank_w3 = params.custom_n_rank_w3 ? params.n_rank_w3 : params.lora_r; uint32_t n_rank_ffn_up = params.custom_n_rank_ffn_up ? params.n_rank_ffn_up : params.lora_r;
uint32_t n_rank_tok_embeddings = params.custom_n_rank_tok_embeddings ? params.n_rank_tok_embeddings : params.lora_r; uint32_t n_rank_tok_embeddings = params.custom_n_rank_tok_embeddings ? params.n_rank_tok_embeddings : params.lora_r;
uint32_t n_rank_norm = params.custom_n_rank_norm ? params.n_rank_norm : 1; uint32_t n_rank_norm = params.custom_n_rank_norm ? params.n_rank_norm : 1;
uint32_t n_rank_output = params.custom_n_rank_output ? params.n_rank_output : params.lora_r; uint32_t n_rank_output = params.custom_n_rank_output ? params.n_rank_output : params.lora_r;
@ -1584,15 +1523,15 @@ int main(int argc, char ** argv) {
lora.hparams.n_rank_wv = n_rank_wv; lora.hparams.n_rank_wv = n_rank_wv;
lora.hparams.n_rank_wo = n_rank_wo; lora.hparams.n_rank_wo = n_rank_wo;
lora.hparams.n_rank_ffn_norm = n_rank_ffn_norm; lora.hparams.n_rank_ffn_norm = n_rank_ffn_norm;
lora.hparams.n_rank_w1 = n_rank_w1; lora.hparams.n_rank_ffn_gate = n_rank_ffn_gate;
lora.hparams.n_rank_w2 = n_rank_w2; lora.hparams.n_rank_ffn_down = n_rank_ffn_down;
lora.hparams.n_rank_w3 = n_rank_w3; lora.hparams.n_rank_ffn_up = n_rank_ffn_up;
lora.hparams.n_rank_tok_embeddings = n_rank_tok_embeddings; lora.hparams.n_rank_tok_embeddings = n_rank_tok_embeddings;
lora.hparams.n_rank_norm = n_rank_norm; lora.hparams.n_rank_norm = n_rank_norm;
lora.hparams.n_rank_output = n_rank_output; lora.hparams.n_rank_output = n_rank_output;
// set opt params from command line // set opt params from command line
opt->params = ggml_opt_default_params(GGML_OPT_ADAM); opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
opt->params.print_forward_graph = false; opt->params.print_forward_graph = false;
opt->params.print_backward_graph = false; opt->params.print_backward_graph = false;
opt->params.graph_size = LLAMA_TRAIN_MAX_NODES; opt->params.graph_size = LLAMA_TRAIN_MAX_NODES;
@ -1627,9 +1566,9 @@ int main(int argc, char ** argv) {
|| (lora.hparams.n_rank_wv != n_rank_wv) || (lora.hparams.n_rank_wv != n_rank_wv)
|| (lora.hparams.n_rank_wo != n_rank_wo) || (lora.hparams.n_rank_wo != n_rank_wo)
|| (lora.hparams.n_rank_ffn_norm != n_rank_ffn_norm) || (lora.hparams.n_rank_ffn_norm != n_rank_ffn_norm)
|| (lora.hparams.n_rank_w1 != n_rank_w1) || (lora.hparams.n_rank_ffn_gate != n_rank_ffn_gate)
|| (lora.hparams.n_rank_w2 != n_rank_w2) || (lora.hparams.n_rank_ffn_down != n_rank_ffn_down)
|| (lora.hparams.n_rank_w3 != n_rank_w3) || (lora.hparams.n_rank_ffn_up != n_rank_ffn_up)
|| (lora.hparams.n_rank_tok_embeddings != n_rank_tok_embeddings) || (lora.hparams.n_rank_tok_embeddings != n_rank_tok_embeddings)
|| (lora.hparams.n_rank_norm != n_rank_norm) || (lora.hparams.n_rank_norm != n_rank_norm)
|| (lora.hparams.n_rank_output != n_rank_output) || (lora.hparams.n_rank_output != n_rank_output)
@ -1663,7 +1602,7 @@ int main(int argc, char ** argv) {
printf("%s: seen train_samples %llu\n", __func__, (long long unsigned) train->train_samples); printf("%s: seen train_samples %llu\n", __func__, (long long unsigned) train->train_samples);
printf("%s: seen train_tokens %llu\n", __func__, (long long unsigned) train->train_tokens); printf("%s: seen train_tokens %llu\n", __func__, (long long unsigned) train->train_tokens);
printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs); printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs);
printf("%s: lora_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(lora.ctx) + lora.data.size()), (float) (ggml_used_mem(lora.ctx) + lora.data.size()) / (1024.0f*1024.0f)); printf("%s: lora_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(lora.ctx) + ggml_backend_buffer_get_size(lora.data)), (float) (ggml_used_mem(lora.ctx) + ggml_backend_buffer_get_size(lora.data)) / (1024.0f*1024.0f));
if (params.only_write_lora) { if (params.only_write_lora) {
save_train_files_data save_data; save_train_files_data save_data;
@ -1690,10 +1629,6 @@ int main(int argc, char ** argv) {
int n_vocab = model.hparams.n_vocab; int n_vocab = model.hparams.n_vocab;
int n_batch = params.common.n_batch; int n_batch = params.common.n_batch;
std::vector<uint8_t> mem_input_data;
std::vector<uint8_t> mem_compute_data;
// context for input tensors without their data // context for input tensors without their data
struct ggml_init_params ctx_input_params = { struct ggml_init_params ctx_input_params = {
ggml_tensor_overhead() * 2, // mem_size ggml_tensor_overhead() * 2, // mem_size
@ -1706,17 +1641,11 @@ int main(int argc, char ** argv) {
struct ggml_tensor * tokens_input = ggml_new_tensor_2d(ctx_input, GGML_TYPE_I32, n_tokens, n_batch); struct ggml_tensor * tokens_input = ggml_new_tensor_2d(ctx_input, GGML_TYPE_I32, n_tokens, n_batch);
struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
// measure required memory for input tensors
size_t max_input_size = GGML_PAD(ggml_nbytes(tokens_input), tensor_alignment) +
GGML_PAD(ggml_nbytes(target_probs), tensor_alignment) +
tensor_alignment;
printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));
// allocate input tensors // allocate input tensors
mem_input_data.resize(max_input_size); // measure required memory for input tensors
ggml_allocr_t alloc_inps = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment); ggml_backend_buffer_t input_data = ggml_backend_alloc_ctx_tensors_from_buft(ctx_input, ggml_backend_cpu_buffer_type());
ggml_allocr_alloc(alloc_inps, tokens_input); size_t max_input_size = ggml_backend_buffer_get_size(input_data);
ggml_allocr_alloc(alloc_inps, target_probs); printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));
// context for compute tensors without their data // context for compute tensors without their data
const size_t estimated_compute_size_wo_data = ( const size_t estimated_compute_size_wo_data = (
@ -1743,7 +1672,7 @@ int main(int argc, char ** argv) {
// find best evaluation order // find best evaluation order
for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) { for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
ctx_compute = ggml_init(ctx_compute_params); ctx_compute = ggml_init(ctx_compute_params);
ggml_allocr_t alloc = ggml_allocr_new_measure(tensor_alignment); ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
gf->order = (enum ggml_cgraph_eval_order) order; gf->order = (enum ggml_cgraph_eval_order) order;
gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
@ -1756,14 +1685,15 @@ int main(int argc, char ** argv) {
&logits, tokens_input, target_probs, &logits, tokens_input, target_probs,
n_tokens, n_batch, n_tokens, n_batch,
params.common.use_flash, params.common.use_flash,
params.common.use_checkpointing params.common.use_checkpointing,
true
); );
size_t max_compute_size = ggml_allocr_max_size(alloc) + tensor_alignment; size_t max_compute_size = ggml_gallocr_get_buffer_size(alloc, 0); // FIXME: this will still allocate the buffer
if (max_compute_size < best_compute_size) { if (max_compute_size < best_compute_size) {
best_compute_size = max_compute_size; best_compute_size = max_compute_size;
best_order = gf->order; best_order = gf->order;
} }
ggml_allocr_free(alloc); ggml_gallocr_free(alloc);
ggml_free(ctx_compute); ggml_free(ctx_compute);
} }
size_t max_compute_size = best_compute_size; size_t max_compute_size = best_compute_size;
@ -1774,9 +1704,8 @@ int main(int argc, char ** argv) {
"invalid"); "invalid");
// allocate compute tensors // allocate compute tensors
mem_compute_data.resize(max_compute_size);
ctx_compute = ggml_init(ctx_compute_params); ctx_compute = ggml_init(ctx_compute_params);
ggml_allocr_t alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment); ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
gf->order = best_order; gf->order = best_order;
gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
@ -1789,11 +1718,9 @@ int main(int argc, char ** argv) {
&logits, tokens_input, target_probs, &logits, tokens_input, target_probs,
n_tokens, n_batch, n_tokens, n_batch,
params.common.use_flash, params.common.use_flash,
params.common.use_checkpointing params.common.use_checkpointing,
false
); );
ggml_allocr_free(alloc);
ggml_allocr_free(alloc_inps);
// tokenize data // tokenize data
std::vector<llama_token> train_tokens; std::vector<llama_token> train_tokens;
@ -1908,6 +1835,8 @@ int main(int argc, char ** argv) {
ggml_free(ctx_work); ggml_free(ctx_work);
ggml_free(ctx_compute); ggml_free(ctx_compute);
ggml_free(ctx_input); ggml_free(ctx_input);
ggml_gallocr_free(alloc);
int64_t t1 = ggml_time_ms(); int64_t t1 = ggml_time_ms();
printf("%s: total training time: ", __func__); printf("%s: total training time: ", __func__);

View file

@ -568,7 +568,8 @@ int main(int argc, char ** argv) {
params.prompt = gpt_random_prompt(rng); params.prompt = gpt_random_prompt(rng);
} }
llama_backend_init(params.numa); llama_backend_init();
llama_numa_init(params.numa);
llama_model_params mparams = llama_model_params_from_gpt_params(params); llama_model_params mparams = llama_model_params_from_gpt_params(params);
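The same two-line change repeats across the examples in this commit: the NUMA flag was split out of backend initialization into a dedicated call, so call sites now read (with `params.numa` carried over unchanged from the surrounding code):

```cpp
// before: llama_backend_init(params.numa);
llama_backend_init();          // backend init no longer takes a NUMA flag
llama_numa_init(params.numa);  // NUMA strategy is now configured separately
```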

View file

@ -202,7 +202,8 @@ int main(int argc, char ** argv) {
std::mt19937 rng(params.seed); std::mt19937 rng(params.seed);
LOG("%s: llama backend init\n", __func__); LOG("%s: llama backend init\n", __func__);
llama_backend_init(params.numa); llama_backend_init();
llama_numa_init(params.numa);
llama_model * model; llama_model * model;
llama_context * ctx; llama_context * ctx;

View file

@ -87,7 +87,21 @@ class SchemaConverter:
elif schema_type == 'array' and 'items' in schema: elif schema_type == 'array' and 'items' in schema:
# TODO `prefixItems` keyword # TODO `prefixItems` keyword
item_rule_name = self.visit(schema['items'], f'{name}{"-" if name else ""}item') item_rule_name = self.visit(schema['items'], f'{name}{"-" if name else ""}item')
rule = f'"[" space ({item_rule_name} ("," space {item_rule_name})*)? "]" space' list_item_operator = f'("," space {item_rule_name})'
successive_items = ""
min_items = schema.get("minItems", 0)
if min_items > 0:
first_item = f"({item_rule_name})"
successive_items = list_item_operator * (min_items - 1)
min_items -= 1
else:
first_item = f"({item_rule_name})?"
max_items = schema.get("maxItems")
if max_items is not None and max_items > min_items:
successive_items += (list_item_operator + "?") * (max_items - min_items - 1)
else:
successive_items += list_item_operator + "*"
rule = f'"[" space {first_item} {successive_items} "]" space'
return self._add_rule(rule_name, rule) return self._add_rule(rule_name, rule)
else: else:

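Traced through, the new bounds handling works as follows: for an array schema with `minItems: 2` and `maxItems: 4`, the first item is emitted as mandatory, one comma-prefixed item covers the remainder of the minimum, and two optional comma-prefixed items cover the slack up to the maximum, yielding a rule of the shape `"[" space (item) ("," space item) ("," space item)? ("," space item)? "]" space` (where `item` stands in for the generated item rule name).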
View file

@ -157,9 +157,9 @@ static const char * output_format_str(output_formats format) {
static const char * split_mode_str(llama_split_mode mode) { static const char * split_mode_str(llama_split_mode mode) {
switch (mode) { switch (mode) {
case LLAMA_SPLIT_NONE: return "none"; case LLAMA_SPLIT_MODE_NONE: return "none";
case LLAMA_SPLIT_LAYER: return "layer"; case LLAMA_SPLIT_MODE_LAYER: return "layer";
case LLAMA_SPLIT_ROW: return "row"; case LLAMA_SPLIT_MODE_ROW: return "row";
default: GGML_ASSERT(!"invalid split mode"); default: GGML_ASSERT(!"invalid split mode");
} }
} }
@ -193,7 +193,7 @@ static const cmd_params cmd_params_defaults = {
/* type_v */ {GGML_TYPE_F16}, /* type_v */ {GGML_TYPE_F16},
/* n_threads */ {get_num_physical_cores()}, /* n_threads */ {get_num_physical_cores()},
/* n_gpu_layers */ {99}, /* n_gpu_layers */ {99},
/* split_mode */ {LLAMA_SPLIT_LAYER}, /* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
/* main_gpu */ {0}, /* main_gpu */ {0},
/* no_kv_offload */ {false}, /* no_kv_offload */ {false},
/* mul_mat_q */ {true}, /* mul_mat_q */ {true},
@ -358,11 +358,11 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
for (const auto & m : p) { for (const auto & m : p) {
llama_split_mode mode; llama_split_mode mode;
if (m == "none") { if (m == "none") {
mode = LLAMA_SPLIT_NONE; mode = LLAMA_SPLIT_MODE_NONE;
} else if (m == "layer") { } else if (m == "layer") {
mode = LLAMA_SPLIT_LAYER; mode = LLAMA_SPLIT_MODE_LAYER;
} else if (m == "row") { } else if (m == "row") {
mode = LLAMA_SPLIT_ROW; mode = LLAMA_SPLIT_MODE_ROW;
} else { } else {
invalid_param = true; invalid_param = true;
break; break;
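For reference, these modes surface on the bench tool's command line; a hedged invocation sketch (the `-sm` flag name is assumed from the tool's usage text, not shown in this hunk):

```sh
./llama-bench -m model.gguf -sm none   # LLAMA_SPLIT_MODE_NONE: single GPU
./llama-bench -m model.gguf -sm layer  # LLAMA_SPLIT_MODE_LAYER: split layers across GPUs
./llama-bench -m model.gguf -sm row    # LLAMA_SPLIT_MODE_ROW: split tensor rows across GPUs
```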
@ -1151,8 +1151,7 @@ int main(int argc, char ** argv) {
if (!params.verbose) { if (!params.verbose) {
llama_log_set(llama_null_log_callback, NULL); llama_log_set(llama_null_log_callback, NULL);
} }
bool numa = false; llama_backend_init();
llama_backend_init(numa);
// initialize printer // initialize printer
std::unique_ptr<printer> p; std::unique_ptr<printer> p;

View file

@ -274,8 +274,8 @@ Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint emb
extern "C" extern "C"
JNIEXPORT void JNICALL JNIEXPORT void JNICALL
Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject, jboolean numa) { Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject) {
llama_backend_init(numa); llama_backend_init();
} }
extern "C" extern "C"

View file

@ -51,7 +51,7 @@ actor LlamaContext {
} }
static func create_context(path: String) throws -> LlamaContext { static func create_context(path: String) throws -> LlamaContext {
llama_backend_init(false) llama_backend_init()
var model_params = llama_model_default_params() var model_params = llama_model_default_params()
#if targetEnvironment(simulator) #if targetEnvironment(simulator)

View file

@ -1,10 +1,12 @@
# LLaVA # LLaVA
Currently this implementation supports [llava-v1.5](https://huggingface.co/liuhaotian/llava-v1.5-7b) variants. Currently this implementation supports [llava-v1.5](https://huggingface.co/liuhaotian/llava-v1.5-7b) variants,
as well as [llava-v1.6](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2) variants.
The pre-converted [7b](https://huggingface.co/mys/ggml_llava-v1.5-7b) The pre-converted [7b](https://huggingface.co/mys/ggml_llava-v1.5-7b)
and [13b](https://huggingface.co/mys/ggml_llava-v1.5-13b) and [13b](https://huggingface.co/mys/ggml_llava-v1.5-13b)
models are available. models are available.
For llava-1.6, a variety of prepared gguf models are available as well: [7b-34b](https://huggingface.co/cmp-nct/llava-1.6-gguf)
After API is confirmed, more models will be supported / uploaded. After API is confirmed, more models will be supported / uploaded.
@ -18,10 +20,11 @@ After building, run: `./llava-cli` to see the usage. For example:
``` ```
**note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so. **note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so.
**note**: For GPU offloading, make sure to use the `-ngl` flag just like usual
## Model conversion ## LLaVA 1.5
- Clone `llava-v15-7b` and `clip-vit-large-patch14-336` locally: - Clone a LLaVA and a CLIP model ([available options](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)). For example:
```sh ```sh
git clone https://huggingface.co/liuhaotian/llava-v1.5-7b git clone https://huggingface.co/liuhaotian/llava-v1.5-7b
@ -50,13 +53,79 @@ python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-pa
5. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF: 5. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
```sh ```sh
python ./convert.py ../llava-v1.5-7b python ./convert.py ../llava-v1.5-7b --skip-unknown
``` ```
Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory. Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory.
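You can then run llava-cli against the converted pair; a typical invocation might look like this (the f16 file names are assumed from the converters' default outputs):

```console
./llava-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf \
    --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf \
    --image some-image.jpg --temp 0.1
```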
## LLaVA 1.6 gguf conversion
1) First clone a LLaVA 1.6 model:
```console
git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b
```
2) Use `llava-surgery-v2.py`, which supports llava-1.5 variants as well, for both pytorch and safetensor models:
```console
python examples/llava/llava-surgery-v2.py -C -m ../llava-v1.6-vicuna-7b/
```
- you will find a llava.projector and a llava.clip file in your model directory
3) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory:
```console
mkdir vit
cp ../llava-v1.6-vicuna-7b/llava.clip vit/pytorch_model.bin
cp ../llava-v1.6-vicuna-7b/llava.projector vit/
curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.json -o vit/config.json
```
4) Create the visual gguf model:
```console
python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
```
- This is similar to llava-1.5; the difference is that we tell the encoder that we are working with the pure vision model part of CLIP
5) Then convert the model to gguf format:
```console
python ./convert.py ../llava-v1.6-vicuna-7b/ --skip-unknown
```
6) And finally we can run llava-cli using the 1.6 model version:
```console
./llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096
```
**note** llava-1.6 needs more context than llava-1.5; at least 3000 tokens are needed (just run it at -c 4096)
**note** llava-1.6 greatly benefits from batched prompt processing (defaults work)
## llava-cli templating and llava-1.6 prompting
llava-1.5 models all use the same vicuna prompt; here you can just add your image question, e.g. `-p "Provide a full description."`
For llava-1.5 models which are not vicuna (mistral and Yi) you need to adapt the system prompt as well as the user prompt; for this purpose llava-cli has a basic templating system:
**For Mistral and using llava-cli binary:**
Add this: `-p "<image>\nUSER:\nProvide a full description.\nASSISTANT:\n"`
The mistral template for llava-1.6 appears to use no system prompt and a USER/ASSISTANT role.
**For the 34B this should work:**
Add this: `-e -p <|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n<image>\nProvide a full description.<|im_end|><|im_start|>assistant\n`
## How to know if you are running in llava-1.5 or llava-1.6 mode
When running llava-cli you will see visual information right before the prompt is processed:
**Llava-1.5:**
`encode_image_with_clip: image embedding created: 576 tokens`
**Llava-1.6 (anything above 576):**
`encode_image_with_clip: image embedding created: 2880 tokens`
Alternatively, just pay attention to how many "tokens" have been used for your prompt; llava-1.6 will also show 1000+ tokens there.
## TODO ## TODO
- [ ] Support non-CPU backend for the image encoding part. - [x] Support non-CPU backend for the image encoding part.
- [ ] Support different sampling methods. - [ ] Support different sampling methods.
- [ ] Support more model variants. - [ ] Support more model variants.

File diff suppressed because it is too large

View file

@ -24,25 +24,7 @@ struct clip_ctx;
extern "C" { extern "C" {
#endif #endif
struct clip_vision_hparams { struct clip_ctx;
int32_t image_size;
int32_t patch_size;
int32_t hidden_size;
int32_t n_intermediate;
int32_t projection_dim;
int32_t n_head;
int32_t n_layer;
float eps;
};
CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity);
CLIP_API void clip_free(struct clip_ctx * ctx);
CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
struct clip_image_u8_batch { struct clip_image_u8_batch {
struct clip_image_u8 * data; struct clip_image_u8 * data;
@ -54,18 +36,43 @@ struct clip_image_f32_batch {
size_t size; size_t size;
}; };
CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity);
CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity);
CLIP_API void clip_free(struct clip_ctx * ctx);
CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx);
CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx);
CLIP_API int32_t clip_hidden_size(const struct clip_ctx * ctx);
// TODO: should be enum, not string
CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
CLIP_API struct clip_image_u8 * clip_image_u8_init (); CLIP_API struct clip_image_u8 * clip_image_u8_init ();
CLIP_API struct clip_image_f32 * clip_image_f32_init(); CLIP_API struct clip_image_f32 * clip_image_f32_init();
CLIP_API void clip_image_u8_free (struct clip_image_u8 * img); CLIP_API void clip_image_u8_free (struct clip_image_u8 * img);
CLIP_API void clip_image_f32_free(struct clip_image_f32 * img); CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch & batch);
CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch & batch);
CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img); CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
/** interpret bytes as an image file with length bytes_length, and use the result to populate img */ /** interpret bytes as an image file with length bytes_length, and use the result to populate img */
CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img); CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
CLIP_API bool clip_image_preprocess (struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, bool pad2square); /** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs);
CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec); CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec); CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
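Taken together, the reworked header implies a load → preprocess → batch-encode flow. A hypothetical caller sketch under those declarations (includes and error handling elided; the buffer sizing is an assumption based on `clip_embd_nbytes` being per-image):

```cpp
// hypothetical caller for the API above; names are taken from the header
struct clip_ctx * ctx = clip_model_load("mmproj-model-f16.gguf", /*verbosity=*/1);

struct clip_image_u8 * img = clip_image_u8_init();
clip_image_load_from_file("some-image.jpg", img);

// preprocessing can now emit several patches per image (llava-1.6 "anyres")
struct clip_image_f32_batch imgs = {};
clip_image_preprocess(ctx, img, imgs);

// one embedding buffer slot per preprocessed patch
std::vector<float> embd(imgs.size * clip_embd_nbytes(ctx) / sizeof(float));
clip_image_batch_encode(ctx, /*n_threads=*/4, &imgs, embd.data());

clip_image_f32_batch_free(imgs);
clip_image_u8_free(img);
clip_free(ctx);
```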

View file

@ -71,25 +71,26 @@ def bytes_to_unicode():
return dict(zip(bs, cs)) return dict(zip(bs, cs))
ap = argparse.ArgumentParser(prog="convert_hf_to_gguf.py") ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True) ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16") ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
ap.add_argument("--text-only", action="store_true", required=False, ap.add_argument("--text-only", action="store_true", required=False,
help="Save a text-only model. It can't be used to encode images") help="Save a text-only model. It can't be used to encode images")
ap.add_argument("--vision-only", action="store_true", required=False, ap.add_argument("--vision-only", action="store_true", required=False,
help="Save a vision-only model. It can't be used to encode texts") help="Save a vision-only model. It can't be used to encode texts")
ap.add_argument("--clip_model_is_vision", action="store_true", required=False, ap.add_argument("--clip-model-is-vision", action="store_true", required=False,
help="The clip model is a pure vision model (ShareGPT4V vision extract for example)") help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
ap.add_argument("--clip-model-is-openclip", action="store_true", required=False,
help="The clip model is from openclip (for ViT-SO400M type))")
ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.") ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp") ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp")
ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values")
ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values")
ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None) ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711 # Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5
default_image_mean = [0.48145466, 0.4578275, 0.40821073] default_image_mean = [0.48145466, 0.4578275, 0.40821073]
default_image_std = [0.26862954, 0.26130258, 0.27577711] default_image_std = [0.26862954, 0.26130258, 0.27577711]
ap.add_argument('--image_mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None) ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
ap.add_argument('--image_std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None) ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
# with proper # with proper
args = ap.parse_args() args = ap.parse_args()
@ -105,7 +106,7 @@ if args.use_f32:
# output in the same directory as the model if output_dir is None # output in the same directory as the model if output_dir is None
dir_model = args.model_dir dir_model = args.model_dir
if args.clip_model_is_vision: if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip:
vocab = None vocab = None
tokens = None tokens = None
else: else:
@ -133,7 +134,7 @@ ftype = 1
if args.use_f32: if args.use_f32:
ftype = 0 ftype = 0
if args.clip_model_is_vision: if args.clip_model_is_vision or args.clip_model_is_openclip:
model = CLIPVisionModel.from_pretrained(dir_model) model = CLIPVisionModel.from_pretrained(dir_model)
processor = None processor = None
else: else:
@ -202,6 +203,57 @@ if has_vision_encoder:
fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"]) fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"])
block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"] block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"]
fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count) fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count)
# /**
# "image_grid_pinpoints": [
# [
# 336,
# 672
# ],
# [
# 672,
# 336
# ],
# [
# 672,
# 672
# ],
# [
# 1008,
# 336
# ],
# [
# 336,
# 1008
# ]
# ],
# Flattened:
# [
# 336, 672,
# 672, 336,
# 672, 672,
# 1008, 336,
# 336, 1008
# ]
# *
# */
if "image_grid_pinpoints" in v_hparams:
# flatten it
image_grid_pinpoints = []
for pinpoint in v_hparams["image_grid_pinpoints"]:
for p in pinpoint:
image_grid_pinpoints.append(p)
fout.add_array("clip.vision.image_grid_pinpoints", image_grid_pinpoints)
if "image_crop_resolution" in v_hparams:
fout.add_uint32("clip.vision.image_crop_resolution", v_hparams["image_crop_resolution"])
if "image_aspect_ratio" in v_hparams:
fout.add_string("clip.vision.image_aspect_ratio", v_hparams["image_aspect_ratio"])
if "image_split_resolution" in v_hparams:
fout.add_uint32("clip.vision.image_split_resolution", v_hparams["image_split_resolution"])
if "mm_patch_merge_type" in v_hparams:
fout.add_string("clip.vision.mm_patch_merge_type", v_hparams["mm_patch_merge_type"])
if "mm_projector_type" in v_hparams:
fout.add_string("clip.vision.mm_projector_type", v_hparams["mm_projector_type"])
if processor is not None: if processor is not None:
image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean

View file

@ -155,11 +155,29 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
system_prompt = prompt.substr(0, image_pos); system_prompt = prompt.substr(0, image_pos);
user_prompt = prompt.substr(image_pos + std::string("<image>").length()); user_prompt = prompt.substr(image_pos + std::string("<image>").length());
printf("system_prompt: %s\n", system_prompt.c_str()); printf("system_prompt: %s\n", system_prompt.c_str());
if (params->verbose_prompt) {
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
}
}
printf("user_prompt: %s\n", user_prompt.c_str()); printf("user_prompt: %s\n", user_prompt.c_str());
if (params->verbose_prompt) {
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
}
}
} else { } else {
// llava-1.5 native mode // llava-1.5 native mode
system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:"; system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
user_prompt = prompt + "\nASSISTANT:"; user_prompt = prompt + "\nASSISTANT:";
if (params->verbose_prompt) {
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
}
}
} }
eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, add_bos); eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, add_bos);
@ -171,13 +189,17 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
fprintf(stderr, "\n"); fprintf(stderr, "\n");
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams); struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
std::string response = "";
for (int i = 0; i < max_tgt_len; i++) { for (int i = 0; i < max_tgt_len; i++) {
const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past); const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
response += tmp;
if (strcmp(tmp, "</s>") == 0) break; if (strcmp(tmp, "</s>") == 0) break;
if (strstr(tmp, "###")) break; // Yi-VL behavior if (strstr(tmp, "###")) break; // Yi-VL behavior
printf("%s", tmp); printf("%s", tmp);
if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
fflush(stdout); fflush(stdout);
} }
@ -196,7 +218,8 @@ static struct llava_context * llava_init(gpt_params * params) {
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1); auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
llama_backend_init(params->numa); llama_backend_init();
llama_numa_init(params->numa);
llama_model_params model_params = llama_model_params_from_gpt_params(*params); llama_model_params model_params = llama_model_params_from_gpt_params(*params);

View file

@ -0,0 +1,155 @@
import argparse
import glob
import os
import torch
from safetensors.torch import load as safe_load, save as safe_save, safe_open, save_file
# Function to determine if file is a SafeTensor file
def is_safetensor_file(file_path):
return file_path.endswith('.safetensors')
# Unified loading function
def load_model(file_path):
if is_safetensor_file(file_path):
tensors = {}
with safe_open(file_path, framework="pt", device="cpu") as f:
for key in f.keys():
tensors[key] = f.get_tensor(key).clone()
# output shape
print(f"{key} : {tensors[key].shape}")
return tensors, 'safetensor'
else:
return torch.load(file_path, map_location=torch.device('cpu')), 'pytorch'
# Unified saving function
def save_model(model, file_path, file_type):
if file_type == 'safetensor':
# safe_save(model, file_path)
save_file(model, file_path)
else:
torch.save(model, file_path)
# Adapted function to clean vision tower from checkpoint
def clean_vision_tower_from_checkpoint(checkpoint_path):
checkpoint, file_type = load_model(checkpoint_path)
# file_type = 'pytorch'
model_path = os.path.dirname(checkpoint_path)
print(f"Searching for vision tower tensors in {checkpoint_path}")
clip_tensors = [k for k, v in checkpoint.items() if (k.startswith("model.vision_tower") or k.startswith("vit."))]
if len(clip_tensors) > 0:
print(f"Found {len(clip_tensors)} tensors to extract from {checkpoint_path}")
# Adapted for file type
clip_path = os.path.join(model_path, "llava.clip")
if os.path.exists(clip_path):
print(f"Loading existing llava.clip from {clip_path}")
existing_clip, _ = load_model(clip_path)
else:
print(f"Creating new llava.clip at {clip_path}")
existing_clip = {}
# Update existing_clip with new tensors, avoid duplicates
for name in clip_tensors:
simple_name = name[name.index('vision_model.'):] if 'vision_model.' in name else name
print(f"Adding {simple_name} to llava.clip")
if simple_name not in existing_clip:
existing_clip[simple_name] = checkpoint[name]
# Save the updated clip tensors back to llava.clip
save_model(existing_clip, clip_path, 'pytorch')
# Remove the tensors from the original checkpoint
for name in clip_tensors:
del checkpoint[name]
return True
return False
def find_relevant_checkpoints(checkpoint_paths, newline_criteria, projector):
newline_checkpoint_path = None
projector_checkpoint_path = None
for path in checkpoint_paths:
checkpoint, _ = load_model(path)
if newline_criteria(checkpoint) and newline_checkpoint_path is None:
newline_checkpoint_path = path
if projector(checkpoint):
projector_checkpoint_path = path
return newline_checkpoint_path, projector_checkpoint_path
def newline_criteria(checkpoint):
return any(k.startswith("model.image_newline") for k in checkpoint.keys())
def proj_criteria(checkpoint):
return any(k.startswith("model.mm_projector") or k.startswith("vision_proj.") for k in checkpoint.keys())
# Command-line interface setup
ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model", required=True, help="Path to LLaVA v1.5+ model")
ap.add_argument("-C", "--clean-vision-tower", action="store_true", help="Remove any vision tower from the model files")
args = ap.parse_args()
if args.clean_vision_tower:
# Generalized to handle both PyTorch and SafeTensors models
model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True)
# checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and path.startswith('pytorch')) or (path.endswith('.safetensors') and path.startswith('model'))]
checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])]
for projector_checkpoint_path in checkpoint_paths:
print(f"Cleaning {projector_checkpoint_path}")
if not clean_vision_tower_from_checkpoint(projector_checkpoint_path):
print(f"No vision tower found in {projector_checkpoint_path}")
# we break once none is found, so far all models append them at the end
# break
print("Done! All vision tower tensors are removed from the model files and stored in llava.clip file.")
# Now we look for the projector in the last checkpoint
model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True)
checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])]
# last_checkpoint_path = checkpoint_paths[0]
# first_checkpoint_path = checkpoint_paths[-1]
newline_checkpoint_path, projector_checkpoint_path = find_relevant_checkpoints(checkpoint_paths, newline_criteria, proj_criteria)
print(f"Taking projector from {projector_checkpoint_path}")
first_mm_tensors = []
first_checkpoint = None
if newline_checkpoint_path is not None:
print(f"Taking newline from {newline_checkpoint_path}")
first_checkpoint, file_type = load_model(newline_checkpoint_path)
first_mm_tensors = [k for k, v in first_checkpoint.items() if k.startswith("model.image_newline")]
# Load the checkpoint
mm_tensors = []
last_checkpoint = None
if projector_checkpoint_path is not None:
last_checkpoint, file_type = load_model(projector_checkpoint_path)
mm_tensors = [k for k, v in last_checkpoint.items() if k.startswith("model.mm_projector") or k.startswith("vision_proj.")]
if len(mm_tensors) == 0:
if last_checkpoint is not None:
for k, v in last_checkpoint.items():
print(k)
print(f"Found {len(mm_tensors)} tensors to extract out of {len(last_checkpoint)} tensors.")
print("No tensors found. Is this a LLaVA model?")
exit()
print(f"Found {len(mm_tensors)} tensors to extract.")
print(f"Found additional {len(first_mm_tensors)} tensors to extract.")
# projector = {name: checkpoint.[name].float() for name in mm_tensors}
projector = {}
for name in mm_tensors:
projector[name] = last_checkpoint[name].float()
for name in first_mm_tensors:
projector[name] = first_checkpoint[name].float()
if len(projector) > 0:
save_model(projector, f"{args.model}/llava.projector", 'pytorch')
print("Done!")
print(f"Now you can convert {args.model} to a a regular LLaMA GGUF file.")
print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.")

View file

@ -19,19 +19,12 @@ mm_tensors = [k for k, v in checkpoint.items() if k.startswith("model.mm_project
projector = {name: checkpoint[name].float() for name in mm_tensors}
torch.save(projector, f"{args.model}/llava.projector")

-# remove these tensors from the checkpoint and save it again
-for name in mm_tensors:
-    del checkpoint[name]

# BakLLaVA models contain CLIP tensors in it
clip_tensors = [k for k, v in checkpoint.items() if k.startswith("model.vision_tower")]
if len(clip_tensors) > 0:
    clip = {name.replace("vision_tower.vision_tower.", ""): checkpoint[name].float() for name in clip_tensors}
    torch.save(clip, f"{args.model}/llava.clip")

-    # remove these tensors
-    for name in clip_tensors:
-        del checkpoint[name]

    # added tokens should be removed to be able to convert Mistral models
    if os.path.exists(f"{args.model}/added_tokens.json"):
@ -39,7 +32,6 @@ if len(clip_tensors) > 0:
        f.write("{}\n")

-torch.save(checkpoint, path)

print("Done!")
print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")

View file

@ -2,32 +2,296 @@
#include "common.h"
#include "llama.h"
#include "llava.h"
-#include "base64.hpp"

#include <cstdio>
#include <cstdlib>
#include <vector>
+#include <numeric>
// RGB uint8 image
struct clip_image_u8 {
int nx;
int ny;
std::vector<uint8_t> buf;
};
// RGB float32 image (NHWC)
// Memory layout: RGBRGBRGB...
struct clip_image_f32 {
int nx;
int ny;
std::vector<float> buf;
};
struct clip_image_grid_shape {
int first;
int second;
};
/**
* Selects the best resolution from a list of possible resolutions based on the original size.
*
* @param original_size The original size of the image in the format (width, height).
* @param possible_resolutions A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].
* @return The best fit resolution in the format (width, height).
*/
static std::pair<int, int> select_best_resolution(const std::pair<int, int>& original_size, const std::vector<std::pair<int, int>>& possible_resolutions) {
int original_width = original_size.first;
int original_height = original_size.second;
std::pair<int, int> best_fit;
int max_effective_resolution = 0;
int min_wasted_resolution = std::numeric_limits<int>::max();
for (const auto& resolution : possible_resolutions) {
int width = resolution.first;
int height = resolution.second;
float scale = std::min(static_cast<float>(width) / original_width, static_cast<float>(height) / original_height);
int downscaled_width = static_cast<int>(original_width * scale);
int downscaled_height = static_cast<int>(original_height * scale);
int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
int wasted_resolution = (width * height) - effective_resolution;
// fprintf(stderr, "resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
max_effective_resolution = effective_resolution;
min_wasted_resolution = wasted_resolution;
best_fit = resolution;
}
}
return best_fit;
}
/**
* @brief Get the anyres image grid shape object
*
* @param image_size
* @param grid_pinpoints
* @param image_patch_size
* @return <int, int>
*/
static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<int, int> & image_size, const std::vector<std::pair<int, int>> & grid_pinpoints, int image_patch_size) {
/**
Conversion from gguf flat array to vector:
std::vector<std::pair<int, int>> possible_resolutions;
for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) {
possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
}
*/
auto best_resolution = select_best_resolution(image_size, grid_pinpoints);
return {best_resolution.first / image_patch_size, best_resolution.second / image_patch_size};
}
// Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
struct {
struct ggml_tensor * newline;
struct ggml_context * ctx;
} model;
const int32_t image_size = clip_image_size(ctx_clip);
const int32_t patch_size = clip_patch_size(ctx_clip);
int32_t num_patches_per_side = image_size / patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches)
int num_patches_width = grid_shape.first; // grid 1-4
int num_patches_height = grid_shape.second; // grid 1-4
const size_t num_images = num_patches_width * num_patches_height + 1;
// TODO: the required size is not actually calculated here - it's only tens of MB
size_t ctx_size = 0;
{
ctx_size += clip_embd_nbytes(ctx_clip) * num_images * 8; // image_features
ctx_size += 1024*1024 * ggml_type_size(GGML_TYPE_F32);
}
struct ggml_init_params params {
/*.mem_size =*/ ctx_size,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ false, // NOTE: this should be false when using the legacy API
};
// Python reference code for full unpad:
/*
base_image_feature = image_feature[0]
image_feature = image_feature[1:]
image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
image_feature = image_feature.flatten(1, 2).flatten(2, 3)
image_feature = unpad_image(image_feature, image_sizes[image_idx])
image_feature = torch.cat((
image_feature,
self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1)
), dim=-1)
image_feature = image_feature.flatten(1, 2).transpose(0, 1)
image_feature = torch.cat((base_image_feature, image_feature), dim=0)
*/
// We now have two options: unpad or no unpad. Unpad removes tokens for faster llm eval.
// In terms of result quality it appears to make no difference, so we'll start with the easier approach given 5D tensors are not supported in ggml yet.
// Without unpad we have to split the sub-image embeddings into patches of 24 features each and permute them.
// Once all images are processed, the base_image_features are prepended without any changes.
// Pytorch reference simplified, modified for ggml compatibility - confirmed identical output in python (for a 2x2 grid image (676x676 scaling))
/*
image_feature = image_feature.view(2, 2, 24, 24, 4096)
image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous()
image_feature = image_feature.view(2, 24, 2, 24, 4096)
image_feature = image_feature.flatten(0, 3)
// Reshape to 4D tensor by merging the last two dimensions
image_feature = image_feature.view(2, 2, 24, 24*4096)
image_feature = image_feature.permute(0, 2, 1, 3).contiguous()
image_feature = image_feature.view(-1, 4096)
*/
model.ctx = ggml_init(params);
ggml_tensor * newline_tmp = clip_get_newline_tensor(ctx_clip);
model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]);
if (newline_tmp->backend != GGML_BACKEND_TYPE_CPU) {
if (newline_tmp->buffer == NULL) {
printf("newline_tmp tensor buffer is NULL\n");
}
ggml_backend_tensor_get(newline_tmp, model.newline->data, 0, ggml_nbytes(newline_tmp));
} else {
model.newline->data = newline_tmp->data;
if (model.newline->data == NULL) {
printf("newline_tmp tensor data is NULL\n");
}
}
struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
// ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
// fill it with the image embeddings, ignoring the base
for (size_t i = 1; i < num_images; i++) {
size_t offset = (i-1) * clip_embd_nbytes(ctx_clip);
memcpy((uint8_t *)(image_features->data) + offset, image_embd_v[i], clip_embd_nbytes(ctx_clip));
}
struct ggml_cgraph * gf = ggml_new_graph(model.ctx);
size_t size_ele = ggml_type_size(GGML_TYPE_F32);
struct ggml_tensor *image_features_patchview = ggml_view_4d(model.ctx, image_features,
num_patches_per_side * clip_n_mmproj_embd(ctx_clip),
num_patches_per_side,
num_patches_width,
num_patches_height,
size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip),
size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side,
size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side * num_patches_width, 0);
// ggml_tensor_printf(image_features_patchview,"image_features_patchview",__LINE__,false,false);
struct ggml_tensor *permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, image_features_patchview, 0, 2, 1, 3));
/**
At the end of each row we have to add the row_end embeddings, which are the same as the newline embeddings
image_feature = torch.cat((
image_feature,
self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.device)
), dim=-1)
*
*/
// ggml_tensor_printf(permuted_cont,"permuted_cont",__LINE__,false,false);
struct ggml_tensor *flatten = ggml_view_2d(model.ctx, permuted_cont, clip_n_mmproj_embd(ctx_clip), num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, size_ele * clip_n_mmproj_embd(ctx_clip), 0);
// ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
ggml_build_forward_expand(gf, flatten);
ggml_graph_compute_with_ctx(model.ctx, gf, 1);
struct ggml_tensor* result = gf->nodes[gf->n_nodes - 1];
memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
// append without newline tokens (default behavior in llava_arch when not using unpad ):
memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
*n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_patches(ctx_clip));
// Debug: Test single segments
// Current findings: sending base image, sending a segment embedding all works similar to python
// However, permuted embeddings do not work yet (stride issue?)
// memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as context
// memcpy(image_embd_out, (float*)prepared_cont->data, clip_embd_nbytes(ctx_clip)); // main image as context
// *n_img_pos_out=576;
ggml_free(model.ctx);
return true;
}
#include "base64.hpp"
static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
-    clip_image_f32 * img_res = clip_image_f32_init();
-    if (!clip_image_preprocess(ctx_clip, img, img_res, /*pad2square =*/ true)) {
+    // std::vector<clip_image_f32*> img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336
+    clip_image_f32_batch img_res_v;
+    img_res_v.size = 0;
+    img_res_v.data = nullptr;
+    if (!clip_image_preprocess(ctx_clip, img, img_res_v)) {
         fprintf(stderr, "%s: unable to preprocess image\n", __func__);
-        clip_image_f32_free(img_res);
+        delete[] img_res_v.data;
         return false;
     }

-    *n_img_pos = clip_n_patches(ctx_clip);
-
     const int64_t t_img_enc_start_us = ggml_time_us();

-    bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd);
-    clip_image_f32_free(img_res);
-    if (!encoded) {
-        fprintf(stderr, "Unable to encode image\n");
-        return false;
-    }
+    const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);
if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
// flat / default llava-1.5 type embedding
*n_img_pos = clip_n_patches(ctx_clip);
bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
delete[] img_res_v.data;
if (!encoded) {
fprintf(stderr, "Unable to encode image\n");
return false;
}
} else {
// spatial_unpad llava-1.6 type embedding
// TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working
std::vector<float *> image_embd_v;
image_embd_v.resize(img_res_v.size);
for (size_t i = 0; i < img_res_v.size; i++) {
image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
if (!encoded) {
fprintf(stderr, "Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
return false;
}
}
const int64_t t_img_enc_batch_us = ggml_time_us();
printf("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
const int32_t * image_grid = clip_image_grid(ctx_clip);
std::vector<std::pair<int, int>> grid_pinpoints;
for (int i = 0; i < 32 && image_grid[i] != 0; i += 2) {
grid_pinpoints.push_back({image_grid[i], image_grid[i+1]});
}
// free all img_res_v - not needed anymore
delete[] img_res_v.data;
img_res_v.size = 0;
img_res_v.data = nullptr;
const int32_t image_size = clip_image_size(ctx_clip);
struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size);
int n_img_pos_out;
clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out);
*n_img_pos = n_img_pos_out;
for (size_t i = 0; i < image_embd_v.size(); i++) {
free(image_embd_v[i]);
}
image_embd_v.clear();
// debug image/segment/normalization content:
// clip_image_u8 * tmp = clip_image_u8_init();
// clip_image_convert_f32_to_u8(*image_feature, *tmp);
// clip_image_save_to_bmp(*tmp, "image_feature.bmp");
    }

    printf("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);

    const int64_t t_img_enc_end_us = ggml_time_us();
    float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
@ -47,11 +311,10 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
    return true;
}

-static bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
+bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
-    float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip));
+    float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model
    if (!image_embd) {
        fprintf(stderr, "Unable to allocate memory for image embeddings\n");
-        free(image_embd);
        return false;
    }
@ -85,7 +348,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
    return true;
}

-LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) {
+struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) {
    clip_image_u8 * img = clip_image_u8_init();
    if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
        clip_image_u8_free(img);
@ -142,7 +405,7 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long
    return true;
}

-LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) {
+struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) {
    unsigned char* image_bytes;
    long image_bytes_length;
    auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
@ -151,13 +414,13 @@ LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct
        return NULL;
    }

-    auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length);
+    llava_image_embed *embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length);
    free(image_bytes);
    return embed;
}

-LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed) {
+void llava_image_embed_free(struct llava_image_embed * embed) {
    free(embed->embed);
    free(embed);
}

View file

@ -3,7 +3,6 @@
#include "ggml.h"

#ifdef LLAMA_SHARED
#    if defined(_WIN32) && !defined(__MINGW32__)
#        ifdef LLAMA_BUILD
@ -32,6 +31,8 @@ struct llava_image_embed {
/** sanity check for clip <-> llava embed size match */
LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip);

+LLAVA_API bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);

/** build an image embed from image file bytes */
LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
/** build an image embed from a path to an image filename */
@ -42,7 +43,6 @@ LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
/** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);

#ifdef __cplusplus
}
#endif

View file

@ -54,7 +54,8 @@ int main(int argc, char ** argv) {
#endif // LOG_DISABLE_LOGS

    // init llama.cpp
-    llama_backend_init(params.numa);
+    llama_backend_init();
+    llama_numa_init(params.numa);

    llama_model * model = NULL;
    llama_context * ctx = NULL;

View file

@ -1,7 +1,9 @@
#include "common.h"
+#include "ggml.h"
#include "llama.h"

#include <cmath>
+#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>
@ -29,7 +31,8 @@ int main(int argc, char ** argv){
#endif // LOG_DISABLE_LOGS

    // init llama.cpp
-    llama_backend_init(params.numa);
+    llama_backend_init();
+    llama_numa_init(params.numa);

    llama_model * model = NULL;
    llama_context * ctx = NULL;
@ -73,6 +76,8 @@ int main(int argc, char ** argv){
    int n_drafted = 0;
    int n_accept  = 0;

+    int64_t t_draft_us = 0;

    int n_past = inp.size();
    bool has_eos = false;
@ -160,7 +165,7 @@ int main(int argc, char ** argv){
    // generate n_pred tokens through prompt lookup
    auto prompt_lookup = [&]() -> void {
-        int inp_size = inp.size();
+        const int inp_size = inp.size();
        for (int ngram_size = ngram_max ; ngram_size > ngram_min; --ngram_size){
            const llama_token * ngram = &inp[inp_size - ngram_size];
@ -191,8 +196,12 @@ int main(int argc, char ** argv){
            return;
        };

+        const int64_t t_start_draft_us = ggml_time_us();
        prompt_lookup();
+        t_draft_us += ggml_time_us() - t_start_draft_us;

        llama_decode(ctx, batch_tgt);
        ++n_past;
@ -210,6 +219,8 @@ int main(int argc, char ** argv){
    LOG_TEE("n_draft   = %d\n", n_draft);
    LOG_TEE("n_predict = %d\n", n_predict);
    LOG_TEE("n_drafted = %d\n", n_drafted);
+    LOG_TEE("t_draft   = %.2f ms, %.2f us per token, %.2f tokens per second\n",
+            t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
    LOG_TEE("n_accept  = %d\n", n_accept);
    LOG_TEE("accept    = %.3f%%\n", 100.0f * n_accept / n_drafted);
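For intuition, the drafting step timed by `t_draft_us` above is just an n-gram lookup over the tokens already in the context. A minimal Python sketch of the same idea (simplified; the real example operates on llama.cpp token buffers and batches):

```python
def prompt_lookup_draft(inp: list[int], ngram_min: int, ngram_max: int, n_draft: int) -> list[int]:
    """Propose draft tokens by finding an earlier occurrence of the most
    recent n-gram and returning the tokens that followed it."""
    for ngram_size in range(ngram_max, ngram_min, -1):  # mirrors the C++ loop above
        if ngram_size > len(inp):
            continue
        ngram = inp[len(inp) - ngram_size:]
        # range end excludes the trailing occurrence, so the n-gram never matches itself
        for i in range(len(inp) - ngram_size):
            if inp[i:i + ngram_size] == ngram:
                draft = inp[i + ngram_size:i + ngram_size + n_draft]
                if draft:
                    return draft
    return []

print(prompt_lookup_draft([1, 2, 3, 9, 1, 2, 3], ngram_min=1, ngram_max=3, n_draft=2))  # -> [9, 1]
```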

View file

@ -283,7 +283,11 @@ These options help improve the performance and memory usage of the LLaMA models.
### NUMA support

-- `--numa`: Attempt optimizations that help on some systems with non-uniform memory access. This currently consists of pinning an equal proportion of the threads to the cores on each NUMA node, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop_caches' as root.
+- `--numa distribute`: Pin an equal proportion of the threads to the cores on each NUMA node. This will spread the load amongst all cores on the system, utilizing all memory channels at the expense of potentially requiring memory to travel over the slow links between nodes.
+- `--numa isolate`: Pin all threads to the NUMA node that the program starts on. This limits the number of cores and amount of memory that can be used, but guarantees all memory access remains local to the NUMA node.
+- `--numa numactl`: Pin threads to the CPUMAP that is passed to the program by starting it with the numactl utility. This is the most flexible mode, and allows arbitrary core usage patterns, for example a map that uses all the cores on one NUMA node, and just enough cores on a second node to saturate the inter-node memory bus.
+
+These flags attempt optimizations that help on some systems with non-uniform memory access. This currently consists of one of the above strategies, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop_caches' as root.
### Memory Float 32

View file

@ -98,7 +98,7 @@ static void write_logfile(
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
static void sigint_handler(int signo) {
    if (signo == SIGINT) {
-        if (!is_interacting) {
+        if (!is_interacting && g_params->interactive) {
            is_interacting = true;
        } else {
            console::cleanup();
@ -185,7 +185,8 @@ int main(int argc, char ** argv) {
    }

    LOG("%s: llama backend init\n", __func__);
-    llama_backend_init(params.numa);
+    llama_backend_init();
+    llama_numa_init(params.numa);

    llama_model * model;
    llama_context * ctx;
@ -333,6 +334,8 @@ int main(int argc, char ** argv) {
    // number of tokens to keep when resetting context
    if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct || params.chatml) {
        params.n_keep = (int)embd_inp.size();
+    } else {
+        params.n_keep += add_bos; // always keep the BOS token
    }
// prefix & suffix for instruct mode // prefix & suffix for instruct mode
@ -382,8 +385,8 @@ int main(int argc, char ** argv) {
        }
    }

-    if (params.n_keep > 0) {
+    if (params.n_keep > add_bos) {
        LOG_TEE("%s: static prompt based on n_keep: '", __func__);
        for (int i = 0; i < params.n_keep; i++) {
            LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
        }
@ -392,7 +395,8 @@ int main(int argc, char ** argv) {
        LOG_TEE("\n");
    }

-    if (params.interactive) {
+    // ctrl+C handling
+    {
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
        struct sigaction sigint_action;
        sigint_action.sa_handler = sigint_handler;
@ -405,7 +409,9 @@ int main(int argc, char ** argv) {
        };
        SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif
+    }
+
+    if (params.interactive) {
        LOG_TEE("%s: interactive mode on.\n", __func__);

        if (!params.antiprompt.empty()) {
@ -536,14 +542,14 @@ int main(int argc, char ** argv) {
                break;
            }

-            const int n_left    = n_past - params.n_keep - 1;
+            const int n_left    = n_past - params.n_keep;
            const int n_discard = n_left/2;

            LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
                n_past, n_left, n_ctx, params.n_keep, n_discard);

-            llama_kv_cache_seq_rm   (ctx, 0, params.n_keep + 1            , params.n_keep + n_discard + 1);
-            llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
+            llama_kv_cache_seq_rm   (ctx, 0, params.n_keep            , params.n_keep + n_discard);
+            llama_kv_cache_seq_shift(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);

            n_past -= n_discard;

View file

@ -122,7 +122,8 @@ int main(int argc, char ** argv) {
#endif // LOG_DISABLE_LOGS

    // init llama.cpp
-    llama_backend_init(params.numa);
+    llama_backend_init();
+    llama_numa_init(params.numa);

    llama_model * model = NULL;
    llama_context * ctx = NULL;

View file

@ -71,7 +71,8 @@ int main(int argc, char ** argv) {
    // init LLM

-    llama_backend_init(params.numa);
+    llama_backend_init();
+    llama_numa_init(params.numa);

    // initialize the model

View file

@ -309,7 +309,7 @@ static void process_logits(int n_vocab, const float * logits, const int * tokens
}

static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
-    // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
+    // Download: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
    // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
    // Output: `perplexity: 13.5106 [114/114]`
    // BOS tokens will be added for each chunk before eval
@ -447,7 +447,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
        return perplexity_v2(ctx, params);
    }

-    // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
+    // Download: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
    // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
    // Output: `perplexity: 13.5106 [114/114]`
    // BOS tokens will be added for each chunk before eval
@ -1623,7 +1623,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
    uint32_t n_ctx;
    in.read((char *)&n_ctx, sizeof(n_ctx));
    if (n_ctx > llama_n_ctx(ctx)) {
-        fprintf(stderr, "%s: %s has been computed with %d, while the current context is %d. Increase it with -c and retry\n",
+        fprintf(stderr, "%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n",
                __func__, params.logits_file.c_str(), n_ctx, params.n_ctx);
    }
@ -1809,7 +1809,8 @@ int main(int argc, char ** argv) {
        params.prompt = gpt_random_prompt(rng);
    }

-    llama_backend_init(params.numa);
+    llama_backend_init();
+    llama_numa_init(params.numa);

    llama_model * model;
    llama_context * ctx;

View file

@ -23,14 +23,18 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
    { "Q5_1",   LLAMA_FTYPE_MOSTLY_Q5_1,   " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
    { "IQ2_XXS",LLAMA_FTYPE_MOSTLY_IQ2_XXS," 2.06 bpw quantization", },
    { "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", },
+   { "IQ1_S",  LLAMA_FTYPE_MOSTLY_IQ1_S,  " 1.56 bpw quantization", },
    { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
    { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
    { "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization", },
+   { "IQ3_S",  LLAMA_FTYPE_MOSTLY_IQ3_S,  " 3.44 bpw quantization", },
+   { "IQ3_M",  LLAMA_FTYPE_MOSTLY_IQ3_M,  " 3.66 bpw quantization mix", },
    { "Q3_K",   LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
    { "Q3_K_XS",LLAMA_FTYPE_MOSTLY_Q3_K_XS,"3-bit extra small quantization" , },
    { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
    { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
    { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },
+   { "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.25 bpw non-linear quantization", },
    { "Q4_K",   LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
    { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.59G, +0.0992 ppl @ LLaMA-v1-7B", },
    { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0532 ppl @ LLaMA-v1-7B", },
@ -237,7 +241,7 @@ int main(int argc, char ** argv) {
        params.imatrix = &imatrix_data;
    }

-    llama_backend_init(false);
+    llama_backend_init();

    // parse command line arguments
    const std::string fname_inp = argv[arg_idx];
@ -287,9 +291,10 @@ int main(int argc, char ** argv) {
        }
    }

-    if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) && imatrix_data.empty()) {
+    if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
+         params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) && imatrix_data.empty()) {
        fprintf(stderr, "\n===============================================================================================\n");
-        fprintf(stderr, "Please do not use IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
+        fprintf(stderr, "Please do not use IQ1_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
        fprintf(stderr, "===============================================================================================\n\n\n");
        return 1;
    }

View file

@ -16,6 +16,13 @@ Command line options:
- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
- `--numa STRATEGY`: Attempt one of the below optimization strategies that help on some NUMA systems
- `--numa distribute`: Spread execution evenly over all nodes
- `--numa isolate`: Only spawn threads on CPUs on the node that execution started on
- `--numa numactl`: Use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggerganov/llama.cpp/issues/1437
-- `--numa`: Attempt optimizations that help on some NUMA systems.
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
@ -32,6 +39,12 @@ Command line options:
- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend (default: 1 = disabled), used together with group attention width `--grp-attn-w`
- `--grp-attn-w`: Set the group attention width to extend context size through self-extend (default: 512), used together with group attention factor `--grp-attn-n`
- `-n N, --n-predict N`: Set the maximum tokens to predict (default: -1)
- `--slots-endpoint-disable`: Disable the slots state monitoring endpoint. Slots state may contain user data, prompts included.
- `--metrics`: Enable a Prometheus-compatible `/metrics` endpoint (default: disabled)
- `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name (default: template taken from model's metadata). We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
- `--log-disable`: Output logs to stdout only, default: enabled.
- `--log-format FORMAT`: Define the log output to FORMAT: json or text (default: json)
## Build
@ -88,6 +101,12 @@ curl --request POST \
  --data '{"prompt": "Building a website can be done in 10 simple steps:","n_predict": 128}'
```
## Advanced testing
We implemented a [server test framework](./tests/README.md) using human-readable scenarios.
*Before submitting an issue, please try to reproduce it with this format.*
## Node JS Test

You need to have [Node.js](https://nodejs.org/en) installed.
@ -125,9 +144,13 @@ node index.js
## API Endpoints

- **GET** `/health`: Returns the current state of the server:
-  - `{"status": "loading model"}` if the model is still being loaded.
-  - `{"status": "error"}` if the model failed to load.
-  - `{"status": "ok"}` if the model is successfully loaded and the server is ready for further requests mentioned below.
+  - 503 -> `{"status": "loading model"}` if the model is still being loaded.
+  - 500 -> `{"status": "error"}` if the model failed to load.
+  - 200 -> `{"status": "ok", "slots_idle": 1, "slots_processing": 2 }` if the model is successfully loaded and the server is ready for further requests mentioned below.
+  - 200 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slots are currently available.
+  - 503 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if the query parameter `fail_on_no_slot` is provided and no slots are currently available.

  If the query parameter `include_slots` is passed, the `slots` field will contain internal slots data, except if `--slots-endpoint-disable` is set.
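For example, a client-side readiness probe against the `/health` status codes above might look like the following sketch (it assumes the third-party `requests` package and the default `localhost:8080` address):

```python
import time
import requests  # third-party: pip install requests

def wait_until_ready(base_url: str = "http://localhost:8080", timeout: float = 60.0) -> None:
    """Poll GET /health until the server reports status 'ok'."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            r = requests.get(f"{base_url}/health", timeout=5)
            # 503 ("loading model" / "no slot available") simply falls through and retries
            if r.status_code == 200 and r.json().get("status") == "ok":
                return
        except requests.RequestException:
            pass  # server not up yet
        time.sleep(1)
    raise TimeoutError("server did not become healthy in time")
```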
- **POST** `/completion`: Given a `prompt`, it returns the predicted completion.
@ -137,7 +160,7 @@ node index.js
    `temperature`: Adjust the randomness of the generated text (default: 0.8).

-    `dynatemp_range`: Dynamic temperature range (default: 0.0, 0.0 = disabled).
+    `dynatemp_range`: Dynamic temperature range. The final temperature will be in the range of `[temperature - dynatemp_range; temperature + dynatemp_range]` (default: 0.0, 0.0 = disabled).

    `dynatemp_exponent`: Dynamic temperature exponent (default: 1.0).
@ -185,18 +208,22 @@ node index.js
    `ignore_eos`: Ignore end of stream token and continue generating (default: false).

-    `logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced (default: []).
+    `logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings, e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does. (default: []).

    `n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)

    `min_keep`: If greater than 0, force samplers to return N possible tokens at minimum (default: 0)

    `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be referenced in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.

    `slot_id`: Assign the completion task to a specific slot. If it is -1, the task will be assigned to an idle slot (default: -1)

-    `cache_prompt`: Save the prompt and generation for avoid reprocess entire prompt if a part of this isn't change (default: false)
+    `cache_prompt`: Re-use the previously cached prompt from the last request if possible. This may prevent re-caching the prompt from scratch. (default: false)

    `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)

    `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. (default: `["top_k", "tfs_z", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values)
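Putting several of these fields together, a completion request could look like the sketch below (the string-form `logit_bias` and the other values are illustrative, not recommendations; a server on `localhost:8080` is assumed):

```python
import requests

payload = {
    "prompt": "Building a website can be done in 10 simple steps:",
    "n_predict": 64,
    "cache_prompt": True,                     # re-use the previously cached prompt if possible
    "logit_bias": [["Hello, World!", -0.5]],  # string form documented above
    "n_probs": 3,                             # include top-3 token probabilities per token
    "samplers": ["top_k", "top_p", "temperature"],
}
r = requests.post("http://localhost:8080/completion", json=payload, timeout=600)
r.raise_for_status()
print(r.json()["content"])
```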
### Result JSON
- Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.
@ -224,7 +251,7 @@ Notice that each `probs` is an array of length `n_probs`.
- `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
- `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options)
-- `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`
+- `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`. These options may differ from the original ones in some way (e.g. bad values filtered out, strings converted to tokens, etc.).
- `model`: The path to the model loaded with `-m`
- `prompt`: The provided `prompt`
- `stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token
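A sketch of consuming the streaming variant follows. It assumes the server emits server-sent-event style `data: {...}` lines when `"stream": true` is set, as the bundled web client does; only `content` and `stop` are present in each chunk until the final one (see the note above):

```python
import json
import requests

with requests.post(
    "http://localhost:8080/completion",
    json={"prompt": "Once upon a time", "n_predict": 32, "stream": True},
    stream=True,
) as r:
    for line in r.iter_lines():
        if not line.startswith(b"data: "):
            continue
        chunk = json.loads(line[len(b"data: "):])
        print(chunk["content"], end="", flush=True)
        if chunk["stop"]:  # generation finished
            break
```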
@ -370,6 +397,81 @@ Notice that each `probs` is an array of length `n_probs`.
}'
```
- **GET** `/slots`: Returns the current slots processing state. Can be disabled with `--slots-endpoint-disable`.
### Result JSON
```json
[
{
"dynatemp_exponent": 1.0,
"dynatemp_range": 0.0,
"frequency_penalty": 0.0,
"grammar": "",
"id": 0,
"ignore_eos": false,
"logit_bias": [],
"min_p": 0.05000000074505806,
"mirostat": 0,
"mirostat_eta": 0.10000000149011612,
"mirostat_tau": 5.0,
"model": "llama-2-7b-32k-instruct.Q2_K.gguf",
"n_ctx": 2048,
"n_keep": 0,
"n_predict": 100000,
"n_probs": 0,
"next_token": {
"has_next_token": true,
"n_remain": -1,
"num_tokens_predicted": 0,
"stopped_eos": false,
"stopped_limit": false,
"stopped_word": false,
"stopping_word": ""
},
"penalize_nl": true,
"penalty_prompt_tokens": [],
"presence_penalty": 0.0,
"prompt": "Say hello to llama.cpp",
"repeat_last_n": 64,
"repeat_penalty": 1.100000023841858,
"samplers": [
"top_k",
"tfs_z",
"typical_p",
"top_p",
"min_p",
"temperature"
],
"seed": 42,
"state": 1,
"stop": [
"\n"
],
"stream": false,
"task_id": 0,
"temperature": 0.0,
"tfs_z": 1.0,
"top_k": 40,
"top_p": 0.949999988079071,
"typical_p": 1.0,
"use_penalty_prompt_tokens": false
}
]
```
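A quick way to inspect this from a script (a sketch; it assumes the endpoint has not been disabled with `--slots-endpoint-disable` and uses only fields shown in the example above):

```python
import requests

slots = requests.get("http://localhost:8080/slots", timeout=5).json()
for slot in slots:
    busy = slot["next_token"]["has_next_token"]  # per-slot generation state
    print(f"slot {slot['id']}: model={slot['model']} has_next_token={busy}")
```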
- **GET** `/metrics`: [Prometheus](https://prometheus.io/) compatible metrics exporter endpoint if `--metrics` is enabled:
Available metrics:
- `llamacpp:prompt_tokens_total`: Number of prompt tokens processed.
- `llamacpp:tokens_predicted_total`: Number of generation tokens processed.
- `llamacpp:prompt_tokens_seconds`: Average prompt throughput in tokens/s.
- `llamacpp:predicted_tokens_seconds`: Average generation throughput in tokens/s.
- `llamacpp:kv_cache_usage_ratio`: KV-cache usage. 1 means 100 percent usage.
- `llamacpp:kv_cache_tokens`: KV-cache tokens.
  - `llamacpp:requests_processing`: Number of requests currently processing.
  - `llamacpp:requests_deferred`: Number of requests deferred.
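Since the endpoint speaks the plain-text Prometheus exposition format (one `name value` pair per line), it can also be checked without a Prometheus server; a small sketch:

```python
import requests

text = requests.get("http://localhost:8080/metrics", timeout=5).text
for line in text.splitlines():
    if line.startswith("llamacpp:"):  # the metric names listed above
        print(line)
```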
## More examples

### Change system prompt on runtime

View file

@ -15,7 +15,9 @@
using json = nlohmann::json;

inline static json oaicompat_completion_params_parse(
-    const json &body /* openai api json semantics */)
+    const struct llama_model * model,
+    const json &body, /* openai api json semantics */
+    const std::string &chat_template)
{
    json llama_params;
@ -30,7 +32,7 @@ inline static json oaicompat_completion_params_parse(
    // https://platform.openai.com/docs/api-reference/chat/create
    llama_sampling_params default_sparams;
    llama_params["model"] = json_value(body, "model", std::string("unknown"));
-    llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
+    llama_params["prompt"] = format_chat(model, chat_template, body["messages"]);
    llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
    llama_params["temperature"] = json_value(body, "temperature", 0.0);
    llama_params["top_k"] = json_value(body, "top_k", default_sparams.top_k);
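On the client side nothing changes: the OpenAI-compatible endpoint keeps accepting the usual `messages` payload, and the server now renders it with the model's chat template (or one passed via `--chat-template`). A minimal sketch assuming a server on `localhost:8080`:

```python
import requests

resp = requests.post(
    "http://localhost:8080/v1/chat/completions",
    json={
        "model": "unknown",  # informational; the loaded model is used regardless
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Say hello to llama.cpp"},
        ],
        "temperature": 0.7,
    },
    timeout=600,
)
print(resp.json()["choices"][0]["message"]["content"])
```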

View file

@ -234,6 +234,7 @@
    mirostat_eta: 0.1, // learning rate
    grammar: '',
    n_probs: 0, // no completion_probabilities,
+   min_keep: 0, // min probs from each sampler,
    image_data: [],
    cache_prompt: true,
    api_key: ''
@ -791,6 +792,9 @@
    <fieldset>
        ${IntField({ label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs })}
    </fieldset>
+   <fieldset>
+       ${IntField({ label: "Min Probabilities from each Sampler", max: 10, min: 0, name: "min_keep", value: params.value.min_keep })}
+   </fieldset>
    <fieldset>
        <label for="api_key">API Key</label>
        <input type="text" name="api_key" value="${params.value.api_key}" placeholder="Enter API key" oninput=${updateParams} />

File diff suppressed because it is too large

View file

@ -0,0 +1,47 @@
# Server tests
Python based server test scenarios using [BDD](https://en.wikipedia.org/wiki/Behavior-driven_development) and [behave](https://behave.readthedocs.io/en/latest/):
* [issues.feature](./features/issues.feature) Pending issues scenario
* [parallel.feature](./features/parallel.feature) Scenario involving multi slots and concurrent requests
* [security.feature](./features/security.feature) Security, CORS and API Key
* [server.feature](./features/server.feature) Server base scenario: completion, embedding, tokenization, etc...
Tests target GitHub workflow job runners with 4 vCPUs.

Requests are made with [aiohttp](https://docs.aiohttp.org/en/stable/client_reference.html), an [asyncio](https://docs.python.org/fr/3/library/asyncio.html)-based HTTP client.

Note: If the host's inference speed is faster than that of the GitHub runners, the parallel scenarios may randomly fail. To mitigate this, you can increase the `n_predict` and `kv_size` values.
### Install dependencies
`pip install -r requirements.txt`
### Run tests
1. Build the server
```shell
cd ../../..
mkdir build
cd build
cmake ../
cmake --build . --target server
```
2. Download the required models:
1. `../../../scripts/hf.sh --repo ggml-org/models --file tinyllamas/stories260K.gguf`
3. Start the test: `./tests.sh`
It's possible to override some scenario step values with environment variables:
- `PORT` -> `context.server_port` to set the listening port of the server during scenario, default: `8080`
- `LLAMA_SERVER_BIN_PATH` -> to change the server binary path, default: `../../../build/bin/server`
- `DEBUG` -> "ON" to enable steps and server verbose mode `--verbose`
- `SERVER_LOG_FORMAT_JSON` -> if set switch server logs to json format
### Run @bug, @wip or @wrong_usage annotated scenario
Feature or Scenario must be annotated with `@llama.cpp` to be included in the default scope.
- `@bug` annotation aims to link a scenario with a GitHub issue.
- `@wrong_usage` are meant to show user issue that are actually an expected behavior
- `@wip` to focus on a scenario working in progress
To run a scenario annotated with `@bug`, start:
`DEBUG=ON ./tests.sh --no-skipped --tags bug`
After changing logic in `steps.py`, ensure that the `@bug` and `@wrong_usage` scenarios are updated.

View file

@ -0,0 +1,69 @@
import os
import socket
import subprocess
import time
from contextlib import closing
from signal import SIGKILL
def before_scenario(context, scenario):
print(f"\x1b[33;42mStarting new scenario: {scenario.name}!\x1b[0m")
port = 8080
if 'PORT' in os.environ:
port = int(os.environ['PORT'])
if is_server_listening("localhost", port):
assert False, "Server already started"
def after_scenario(context, scenario):
if context.server_process is None:
return
if scenario.status == "failed":
if 'GITHUB_ACTIONS' in os.environ:
print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n\n")
if os.path.isfile('llama.log'):
with closing(open('llama.log', 'r')) as f:
for line in f:
print(line)
if not is_server_listening(context.server_fqdn, context.server_port):
print("\x1b[33;101mERROR: Server stopped listening\x1b[0m")
if not pid_exists(context.server_process.pid):
assert False, f"Server not running pid={context.server_process.pid} ..."
print(f"stopping server pid={context.server_process.pid} ...")
context.server_process.kill()
# Wait few for socket to free up
time.sleep(0.05)
attempts = 0
while is_server_listening(context.server_fqdn, context.server_port):
print(f"stopping server pid={context.server_process.pid} ...")
os.kill(context.server_process.pid, SIGKILL)
time.sleep(0.1)
attempts += 1
if attempts > 5:
print(f"Server dangling exits, killing all {context.server_path} ...")
process = subprocess.run(['killall', '-9', context.server_path],
stderr=subprocess.PIPE,
universal_newlines=True)
print(process)
def is_server_listening(server_fqdn, server_port):
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
result = sock.connect_ex((server_fqdn, server_port))
return result == 0
def pid_exists(pid):
"""Check whether pid exists in the current process table."""
import errno
if pid < 0:
return False
try:
os.kill(pid, 0)
except OSError as e:
return e.errno == errno.EPERM
else:
return True

View file

@ -0,0 +1,4 @@
# List of ongoing issues
@bug
Feature: Issues
# No confirmed issue at the moment

View file

@ -0,0 +1,123 @@
@llama.cpp
Feature: Parallel
Background: Server startup
Given a server listening on localhost:8080
And a model file stories260K.gguf
And a model alias tinyllama-2
And 42 as server seed
And 64 KV cache size
And 2 slots
And embeddings extraction
And continuous batching
Then the server is starting
Then the server is healthy
Scenario Outline: Multi users completion
Given a prompt:
"""
Write a very long story about AI.
"""
And a prompt:
"""
Write another very long music lyrics.
"""
And <n_predict> max tokens to predict
Given concurrent completion requests
Then the server is busy
Then the server is idle
And all slots are idle
Then all prompts are predicted with <n_predict> tokens
Examples:
| n_predict |
| 128 |
Scenario Outline: Multi users OAI completions compatibility
Given a system prompt You are a writer.
And a model tinyllama-2
Given a prompt:
"""
Write a very long book.
"""
And a prompt:
"""
Write another a poem.
"""
And <n_predict> max tokens to predict
And streaming is <streaming>
Given concurrent OAI completions requests
Then the server is busy
Then the server is idle
Then all prompts are predicted with <n_predict> tokens
Examples:
| streaming | n_predict |
| disabled | 128 |
| enabled | 64 |
Scenario: Multi users with total number of tokens to predict exceeds the KV Cache size #3969
Given a prompt:
"""
Write a very long story about AI.
"""
And a prompt:
"""
Write another very long music lyrics.
"""
And a prompt:
"""
Write a very long poem.
"""
And a prompt:
"""
Write a very long joke.
"""
And 128 max tokens to predict
Given concurrent completion requests
Then the server is busy
Then the server is idle
Then all prompts are predicted
Scenario: Multi users embeddings
Given a prompt:
"""
Write a very long story about AI.
"""
And a prompt:
"""
Write another very long music lyrics.
"""
And a prompt:
"""
Write a very long poem.
"""
And a prompt:
"""
Write a very long joke.
"""
Given concurrent embedding requests
Then the server is busy
Then the server is idle
Then all embeddings are generated
Scenario: Multi users OAI compatibility embeddings
Given a prompt:
"""
In which country Paris is located ?
"""
And a prompt:
"""
Is Madrid the capital of Spain ?
"""
And a prompt:
"""
What is the biggest US city ?
"""
And a prompt:
"""
What is the capital of Bulgaria ?
"""
And a model tinyllama-2
Given concurrent OAI embedding requests
Then the server is busy
Then the server is idle
Then all embeddings are generated


@@ -0,0 +1,50 @@
@llama.cpp
Feature: Security
Background: Server startup with an api key defined
Given a server listening on localhost:8080
And a model file stories260K.gguf
And a server api key llama.cpp
Then the server is starting
Then the server is healthy
Scenario Outline: Completion with some user api key
Given a prompt test
And a user api key <api_key>
And 4 max tokens to predict
And a completion request with <api_error> api error
Examples: Prompts
| api_key | api_error |
| llama.cpp | no |
| llama.cpp | no |
| hackeme | raised |
| | raised |
Scenario Outline: OAI Compatibility
Given a system prompt test
And a user prompt test
And a model test
And 2 max tokens to predict
And streaming is disabled
And a user api key <api_key>
Given an OAI compatible chat completions request with <api_error> api error
Examples: Prompts
| api_key | api_error |
| llama.cpp | no |
| llama.cpp | no |
| hackme | raised |
Scenario Outline: CORS Options
When an OPTIONS request is sent from <origin>
Then CORS header <cors_header> is set to <cors_header_value>
Examples: Headers
| origin | cors_header | cors_header_value |
| localhost | Access-Control-Allow-Origin | localhost |
| web.mydomain.fr | Access-Control-Allow-Origin | web.mydomain.fr |
| origin | Access-Control-Allow-Credentials | true |
| web.mydomain.fr | Access-Control-Allow-Methods | POST |
| web.mydomain.fr | Access-Control-Allow-Headers | * |


@@ -0,0 +1,84 @@
@llama.cpp
Feature: llama.cpp server
Background: Server startup
Given a server listening on localhost:8080
And a model file stories260K.gguf
And a model alias tinyllama-2
And 42 as server seed
# KV Cache corresponds to the total amount of tokens
# that can be stored across all independent sequences: #4130
# see --ctx-size and #5568
And 32 KV cache size
And 1 slots
And embeddings extraction
And 32 server max tokens to predict
And prometheus compatible metrics exposed
Then the server is starting
Then the server is healthy
Scenario: Health
Then the server is ready
And all slots are idle
Scenario Outline: Completion
Given a prompt <prompt>
And <n_predict> max tokens to predict
And a completion request with no api error
Then <n_predicted> tokens are predicted matching <re_content>
And prometheus metrics are exposed
Examples: Prompts
| prompt | n_predict | re_content | n_predicted |
| I believe the meaning of life is | 8 | (read<or>going)+ | 8 |
| Write a joke about AI | 64 | (park<or>friends<or>scared<or>always)+ | 32 |
Scenario Outline: OAI Compatibility
Given a model <model>
And a system prompt <system_prompt>
And a user prompt <user_prompt>
And <max_tokens> max tokens to predict
And streaming is <enable_streaming>
Given an OAI compatible chat completions request with no api error
Then <n_predicted> tokens are predicted matching <re_content>
Examples: Prompts
| model | system_prompt | user_prompt | max_tokens | re_content | n_predicted | enable_streaming |
| llama-2 | Book | What is the best book | 8 | (Mom<or>what)+ | 8 | disabled |
| codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 64 | (thanks<or>happy<or>bird)+ | 32 | enabled |
Scenario: Embedding
When embeddings are computed for:
"""
What is the capital of Bulgaria ?
"""
Then embeddings are generated
Scenario: OAI Embeddings compatibility
Given a model tinyllama-2
When an OAI compatible embeddings computation request for:
"""
What is the capital of Spain ?
"""
Then embeddings are generated
Scenario: OAI Embeddings compatibility with multiple inputs
Given a model tinyllama-2
Given a prompt:
"""
In which country Paris is located ?
"""
And a prompt:
"""
Is Madrid the capital of Spain ?
"""
When an OAI compatible embeddings computation request for multiple inputs
Then embeddings are generated
Scenario: Tokenize / Detokenize
When tokenizing:
"""
What is the capital of France ?
"""
Then tokens can be detokenized
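
# A minimal sketch of the round trip this scenario exercises, assuming a
# server already listening on localhost:8080 (requests is used here purely
# for illustration; the step definitions below drive the same endpoints
# with aiohttp):
#
#   import requests
#   base = "http://localhost:8080"
#   text = "What is the capital of France ?"
#   tokens = requests.post(f"{base}/tokenize", json={"content": text}).json()["tokens"]
#   detok  = requests.post(f"{base}/detokenize", json={"tokens": tokens}).json()["content"]
#   assert detok.strip() == text  # SPM tokenizers add a leading space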


@@ -0,0 +1,801 @@
import asyncio
import collections
import json
import os
import re
import socket
import subprocess
import time
from contextlib import closing
from re import RegexFlag
import aiohttp
import openai
from behave import step
from behave.api.async_step import async_run_until_complete
from prometheus_client import parser
@step(u"a server listening on {server_fqdn}:{server_port}")
def step_server_config(context, server_fqdn, server_port):
context.server_fqdn = server_fqdn
context.server_port = int(server_port)
if 'PORT' in os.environ:
context.server_port = int(os.environ['PORT'])
print(f"$PORT set, overriding server port with to {context.server_port}")
context.base_url = f'http://{context.server_fqdn}:{context.server_port}'
context.debug = 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON'
context.model_alias = None
context.n_ctx = None
context.n_predict = None
context.n_server_predict = None
context.n_slots = None
context.server_api_key = None
context.server_continuous_batching = False
context.server_embeddings = False
context.server_metrics = False
context.server_process = None
context.server_seed = None
context.user_api_key = None
context.tasks_result = []
context.concurrent_tasks = []
context.prompts = []
@step(u'a model file {model_file}')
def step_model_file(context, model_file):
context.model_file = model_file
@step(u'a model alias {model_alias}')
def step_model_alias(context, model_alias):
context.model_alias = model_alias
@step(u'{seed} as server seed')
def step_seed(context, seed):
context.server_seed = int(seed)
@step(u'{n_ctx} KV cache size')
def step_n_ctx(context, n_ctx):
context.n_ctx = int(n_ctx)
@step(u'{n_slots} slots')
def step_n_slots(context, n_slots):
context.n_slots = int(n_slots)
@step(u'{n_predict} server max tokens to predict')
def step_server_n_predict(context, n_predict):
context.n_server_predict = int(n_predict)
@step(u'continuous batching')
def step_server_continuous_batching(context):
context.server_continuous_batching = True
@step(u'embeddings extraction')
def step_server_embeddings(context):
context.server_embeddings = True
@step(u'prometheus compatible metrics exposed')
def step_server_metrics(context):
context.server_metrics = True
@step(u"the server is starting")
def step_start_server(context):
start_server_background(context)
attempts = 0
while True:
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
result = sock.connect_ex((context.server_fqdn, context.server_port))
if result == 0:
print("\x1b[33;46mserver started!\x1b[0m")
return
attempts += 1
if attempts > 20:
assert False, "server not started"
print(f"waiting for server to start, connect error code = {result}...")
time.sleep(0.1)
@step(u"the server is {expecting_status}")
@async_run_until_complete
async def step_wait_for_the_server_to_be_started(context, expecting_status):
match expecting_status:
case 'healthy':
await wait_for_health_status(context, context.base_url, 200, 'ok')
case 'ready' | 'idle':
await wait_for_health_status(context, context.base_url, 200, 'ok',
params={'fail_on_no_slot': 0, 'include_slots': 0},
slots_idle=context.n_slots,
slots_processing=0,
expected_slots=[{'id': slot_id, 'state': 0}
for slot_id in range(context.n_slots)])
case 'busy':
await wait_for_health_status(context, context.base_url, 503,
'no slot available',
params={'fail_on_no_slot': 0, 'include_slots': 0},
slots_idle=0,
slots_processing=context.n_slots,
expected_slots=[{'id': slot_id, 'state': 1}
for slot_id in range(context.n_slots)])
case _:
assert False, "unknown status"
@step(u'all slots are {expected_slot_status_string}')
@async_run_until_complete
async def step_all_slots_status(context, expected_slot_status_string):
match expected_slot_status_string:
case 'idle':
expected_slot_status = 0
case 'busy':
expected_slot_status = 1
case _:
assert False, "unknown status"
expected_slots = [{'id': slot_id, 'state': expected_slot_status}
for slot_id in range(context.n_slots)]
await request_slots_status(context, expected_slots)
@step(u'a completion request with {api_error} api error')
@async_run_until_complete
async def step_request_completion(context, api_error):
expect_api_error = api_error == 'raised'
completion = await request_completion(context.prompts.pop(),
context.base_url,
debug=context.debug,
n_predict=context.n_predict,
server_seed=context.server_seed,
expect_api_error=expect_api_error,
user_api_key=context.user_api_key)
context.tasks_result.append(completion)
if context.debug:
print(f"Completion response: {completion}")
if expect_api_error:
assert completion == 401, f"completion must be an 401 status code: {completion}"
@step(u'{predicted_n} tokens are predicted matching {re_content}')
def step_n_tokens_predicted_with_content(context, predicted_n, re_content):
assert_n_tokens_predicted(context.tasks_result.pop(), int(predicted_n), re_content)
@step(u'{predicted_n} tokens are predicted')
def step_n_tokens_predicted(context, predicted_n):
assert_n_tokens_predicted(context.tasks_result.pop(), int(predicted_n))
@step(u'a user prompt {user_prompt}')
def step_user_prompt(context, user_prompt):
context.prompts.append(user_prompt)
@step(u'a system prompt {system_prompt}')
def step_system_prompt(context, system_prompt):
context.system_prompt = system_prompt
@step(u'a model {model}')
def step_model(context, model):
context.model = model
@step(u'{max_tokens} max tokens to predict')
def step_max_tokens(context, max_tokens):
context.n_predict = int(max_tokens)
@step(u'streaming is {enable_streaming}')
def step_streaming(context, enable_streaming):
context.enable_streaming = enable_streaming == 'enabled'
@step(u'a user api key {user_api_key}')
def step_user_api_key(context, user_api_key):
context.user_api_key = user_api_key
@step(u'no user api key')
def step_no_user_api_key(context):
context.user_api_key = None
@step(u'a user api key ')
def step_no_user_api_key_space(context):
context.user_api_key = None
@step(u'a server api key {server_api_key}')
def step_server_api_key(context, server_api_key):
context.server_api_key = server_api_key
@step(u'an OAI compatible chat completions request with {api_error} api error')
@async_run_until_complete
async def step_oai_chat_completions(context, api_error):
if context.debug:
print(f"Submitting OAI compatible completions request...")
expect_api_error = api_error == 'raised'
completion = await oai_chat_completions(context.prompts.pop(),
context.system_prompt,
context.base_url,
False,
model=context.model if hasattr(context, 'model') else None,
n_predict=context.n_predict
if hasattr(context, 'n_predict') else None,
enable_streaming=context.enable_streaming
if hasattr(context, 'enable_streaming') else None,
server_seed=context.server_seed
if hasattr(context, 'server_seed') else None,
user_api_key=context.user_api_key
if hasattr(context, 'user_api_key') else None,
expect_api_error=expect_api_error)
context.tasks_result.append(completion)
if context.debug:
print(f"Completion response: {completion}")
if expect_api_error:
assert completion == 401, f"completion must be an 401 status code: {completion}"
if context.debug:
print(f"Completion response: {completion}")
@step(u'a prompt')
def step_a_prompt(context):
context.prompts.append(context.text)
@step(u'a prompt {prompt}')
def step_a_prompt_prompt(context, prompt):
context.prompts.append(prompt)
@step(u'concurrent completion requests')
@async_run_until_complete()
async def step_concurrent_completion_requests(context):
await concurrent_requests(context,
request_completion,
# prompt is inserted automatically
context.base_url,
debug=context.debug,
n_predict=context.n_predict if hasattr(context, 'n_predict') else None,
server_seed=context.server_seed if hasattr(context, 'server_seed') else None,
user_api_key=context.user_api_key if hasattr(context,
'user_api_key') else None)
@step(u'concurrent OAI completions requests')
@async_run_until_complete
async def step_concurrent_oai_chat_completions(context):
await concurrent_requests(context, oai_chat_completions,
# user_prompt is inserted automatically
context.system_prompt,
context.base_url,
True, # async_client
model=context.model
if hasattr(context, 'model') else None,
n_predict=context.n_predict
if hasattr(context, 'n_predict') else None,
enable_streaming=context.enable_streaming
if hasattr(context, 'enable_streaming') else None,
server_seed=context.server_seed
if hasattr(context, 'server_seed') else None,
user_api_key=context.user_api_key
if hasattr(context, 'user_api_key') else None)
@step(u'all prompts are predicted')
@async_run_until_complete
async def step_all_prompts_are_predicted(context):
await all_prompts_are_predicted(context)
@step(u'all prompts are predicted with {n_predict} tokens')
@async_run_until_complete
async def step_all_prompts_are_predicted_with_n_tokens(context, n_predict):
expected_predicted_n = int(n_predict)
await all_prompts_are_predicted(context, expected_predicted_n)
async def all_prompts_are_predicted(context, expected_predicted_n=None):
n_completions = await gather_tasks_results(context)
assert n_completions > 0
for i in range(n_completions):
assert_n_tokens_predicted(context.tasks_result.pop(), expected_predicted_n=expected_predicted_n)
assert len(context.concurrent_tasks) == 0, f"{len(context.concurrent_tasks)} pending requests"
@step(u'embeddings are computed for')
@async_run_until_complete
async def step_compute_embedding(context):
context.embeddings = await request_embedding(context.text, base_url=context.base_url)
@step(u'embeddings are generated')
def step_assert_embeddings(context):
if len(context.prompts) == 0:
assert_embeddings(context.embeddings)
else:
assert len(context.embeddings) == len(context.prompts), (f"unexpected response:\n"
f"context.prompts={context.prompts}\n"
f"context.embeddings={context.embeddings}")
for embedding in context.embeddings:
context.prompts.pop()
assert_embeddings(embedding)
@step(u'an OAI compatible embeddings computation request for')
@async_run_until_complete
async def step_oai_compute_embeddings(context):
context.embeddings = await request_oai_embeddings(context.text,
base_url=context.base_url,
user_api_key=context.user_api_key,
model=context.model)
@step(u'an OAI compatible embeddings computation request for multiple inputs')
@async_run_until_complete
async def step_oai_compute_embeddings_multiple_inputs(context):
context.embeddings = await request_oai_embeddings(context.prompts,
base_url=context.base_url,
user_api_key=context.user_api_key,
model=context.model)
@step(u'concurrent embedding requests')
@async_run_until_complete()
async def step_concurrent_embedding_requests(context):
await concurrent_requests(context,
request_embedding,
# prompt is inserted automatically
base_url=context.base_url)
@step(u'concurrent OAI embedding requests')
@async_run_until_complete()
async def step_concurrent_oai_embedding_requests(context):
await concurrent_requests(context,
request_oai_embeddings,
# prompt is inserted automatically
base_url=context.base_url,
async_client=True,
model=context.model)
@step(u'all embeddings are generated')
@async_run_until_complete()
async def all_embeddings_are_generated(context):
n_embedding_requests = await gather_tasks_results(context)
assert n_embedding_requests > 0
for i in range(n_embedding_requests):
assert_embeddings(context.tasks_result.pop())
@step(u'tokenizing')
@async_run_until_complete
async def step_tokenize(context):
context.tokenized_text = context.text
async with aiohttp.ClientSession() as session:
async with session.post(f'{context.base_url}/tokenize',
json={
"content": context.tokenized_text,
}) as response:
assert response.status == 200
tokenize_json = await response.json()
context.tokens = tokenize_json['tokens']
@step(u'tokens can be detokenized')
@async_run_until_complete
async def step_detokenize(context):
assert len(context.tokens) > 0
async with aiohttp.ClientSession() as session:
async with session.post(f'{context.base_url}/detokenize',
json={
"tokens": context.tokens,
}) as response:
assert response.status == 200
detokenize_json = await response.json()
# SPM tokenizer adds a whitespace prefix: https://github.com/google/sentencepiece/issues/15
assert context.tokenized_text == detokenize_json['content'].strip()
@step(u'an OPTIONS request is sent from {origin}')
@async_run_until_complete
async def step_options_request(context, origin):
async with aiohttp.ClientSession() as session:
async with session.options(f'{context.base_url}/v1/chat/completions',
headers={"Origin": origin}) as response:
assert response.status == 200
context.options_response = response
@step(u'CORS header {cors_header} is set to {cors_header_value}')
def step_check_options_header_value(context, cors_header, cors_header_value):
assert context.options_response.headers[cors_header] == cors_header_value
@step(u'prometheus metrics are exposed')
@async_run_until_complete
async def step_prometheus_metrics_exported(context):
async with aiohttp.ClientSession() as session:
async with await session.get(f'{context.base_url}/metrics') as metrics_response:
assert metrics_response.status == 200
assert metrics_response.headers['Content-Type'] == "text/plain; version=0.0.4"
metrics_raw = await metrics_response.text()
metric_exported = False
for metric in parser.text_string_to_metric_families(metrics_raw):
match metric.name:
case "llamacpp:kv_cache_usage_ratio":
assert len(metric.samples) > 0
metric_exported = True
assert metric_exported, "No metrics exported"
async def concurrent_requests(context, f_completion, *args, **kwargs):
n_prompts = len(context.prompts)
if context.debug:
print(f"starting {n_prompts} concurrent completion requests...")
assert n_prompts > 0
for prompt_no in range(n_prompts):
shifted_args = [context.prompts.pop(), *args]
context.concurrent_tasks.append(asyncio.create_task(f_completion(*shifted_args, **kwargs)))
await asyncio.sleep(0.1)
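# Each task pops one prompt from context.prompts and runs as a separate
# asyncio task; results are drained later by gather_tasks_results(), which
# lets the busy/idle health checks observe the server while requests are in
# flight.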
async def request_completion(prompt,
base_url,
debug=False,
n_predict=None,
server_seed=None,
expect_api_error=None,
user_api_key=None):
if debug:
print(f"Sending completion request: {prompt}")
origin = "my.super.domain"
headers = {
'Origin': origin
}
if user_api_key is not None:
if debug:
print(f"Set user_api_key: {user_api_key}")
headers['Authorization'] = f'Bearer {user_api_key}'
async with aiohttp.ClientSession() as session:
async with session.post(f'{base_url}/completion',
json={
"prompt": prompt,
"n_predict": int(n_predict) if n_predict is not None else -1,
"seed": server_seed if server_seed is not None else 42
},
headers=headers) as response:
if expect_api_error is None or not expect_api_error:
assert response.status == 200
assert response.headers['Access-Control-Allow-Origin'] == origin
return await response.json()
else:
return response.status
async def oai_chat_completions(user_prompt,
system_prompt,
base_url,
async_client,
debug=False,
model=None,
n_predict=None,
enable_streaming=None,
server_seed=None,
user_api_key=None,
expect_api_error=None):
if debug:
print(f"Sending OAI Chat completions request: {user_prompt}")
# openai client always expects an api key
user_api_key = user_api_key if user_api_key is not None else 'nope'
seed = server_seed if server_seed is not None else 42
enable_streaming = enable_streaming if enable_streaming is not None else False
payload = {
"messages": [
{
"role": "system",
"content": system_prompt,
},
{
"role": "user",
"content": user_prompt,
}
],
"model": model,
"max_tokens": n_predict,
"stream": enable_streaming,
"seed": seed
}
completion_response = {
'content': '',
'timings': {
'predicted_n': 0
}
}
if async_client:
origin = 'llama.cpp'
headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin}
async with aiohttp.ClientSession() as session:
async with session.post(f'{base_url}/v1/chat/completions',
json=payload,
headers=headers) as response:
if enable_streaming:
assert response.status == 200
assert response.headers['Access-Control-Allow-Origin'] == origin
assert response.headers['Content-Type'] == "text/event-stream"
event_received = True
while event_received:
event_received = False
async for line_in_bytes in response.content:
line = line_in_bytes.decode('utf8')
line = line.rstrip('\n').rstrip('\r')
if line == '':
continue
event_data = line.split(': ', 1)
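# server-sent events arrive as "data: {json}" frames; split only on the
# first ': ' so colons inside the JSON payload survive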
assert event_data[0] == 'data', f'Bad event code received: ```{event_data}```'
chunk_raw = event_data[1]
chunk = json.loads(chunk_raw)
assert len(chunk['choices']) == 1, f"no choices provided, line ```{line}```"
delta = chunk['choices'][0]['delta']
if 'content' in delta:
completion_response['content'] += delta['content']
completion_response['timings']['predicted_n'] += 1
else:
if expect_api_error is None or not expect_api_error:
assert response.status == 200
assert response.headers['Access-Control-Allow-Origin'] == origin
assert response.headers['Content-Type'] == "application/json; charset=utf-8"
chat_completion_raw = await response.json()
completion_response = {
'content': chat_completion_raw['choices'][0]['message'],
'timings': {
'predicted_n': chat_completion_raw['usage']['completion_tokens']
}
}
else:
return response.status
else:
try:
openai.api_key = user_api_key
openai.api_base = f'{base_url}/v1/chat'
chat_completion = openai.Completion.create(
messages=payload['messages'],
model=model,
max_tokens=n_predict,
stream=enable_streaming,
seed=seed
)
except openai.error.APIError as e:
if expect_api_error is not None and expect_api_error:
return 401
else:
assert False, f'error raised: {e}'
if enable_streaming:
for chunk in chat_completion:
assert len(chunk.choices) == 1
delta = chunk.choices[0].delta
if 'content' in delta:
completion_response['content'] += delta['content']
completion_response['timings']['predicted_n'] += 1
else:
assert len(chat_completion.choices) == 1
completion_response = {
'content': chat_completion.choices[0].message.content,
'timings': {
'predicted_n': chat_completion.usage.completion_tokens
}
}
if debug:
print("OAI response formatted to llama.cpp:", completion_response)
return completion_response
async def request_embedding(content, base_url=None):
async with aiohttp.ClientSession() as session:
async with session.post(f'{base_url}/embedding',
json={
"content": content,
}) as response:
assert response.status == 200
response_json = await response.json()
return response_json['embedding']
async def request_oai_embeddings(input,
base_url=None, user_api_key=None,
model=None, async_client=False):
# openai client always expects an api_key
user_api_key = user_api_key if user_api_key is not None else 'nope'
if async_client:
origin = 'llama.cpp'
if user_api_key is not None:
headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin}
async with aiohttp.ClientSession() as session:
async with session.post(f'{base_url}/v1/embeddings',
json={
"input": input,
"model": model,
},
headers=headers) as response:
assert response.status == 200, f"received status code not expected: {response.status}"
assert response.headers['Access-Control-Allow-Origin'] == origin
assert response.headers['Content-Type'] == "application/json; charset=utf-8"
response_json = await response.json()
assert response_json['model'] == model, f"invalid model received: {response_json['model']}"
assert response_json['object'] == 'list'
return response_json['data']
else:
openai.api_key = user_api_key
openai.api_base = f'{base_url}/v1'
oai_embeddings = openai.Embedding.create(
model=model,
input=input,
)
if isinstance(input, collections.abc.Sequence):
embeddings = []
for an_oai_embeddings in oai_embeddings.data:
embeddings.append(an_oai_embeddings.embedding)
else:
embeddings = oai_embeddings.data.embedding
return embeddings
def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re_content=None):
content = completion_response['content']
n_predicted = completion_response['timings']['predicted_n']
assert len(content) > 0, "no token predicted"
if expected_predicted_n is not None:
assert n_predicted == expected_predicted_n, (f'invalid number of tokens predicted:'
f' {n_predicted} <> {expected_predicted_n}')
if re_content is not None:
re_content = '^.*' + re_content.replace('<or>', '|') + '.*$'
assert re.match(re_content, content, flags=RegexFlag.IGNORECASE | RegexFlag.MULTILINE | RegexFlag.DOTALL), (
f'invalid tokens predicted:'
f' ```\n{content}\n``` do not match /{re_content}/')
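# '<or>' stands in for '|' in the feature tables (where '|' delimits cells)
# and is turned back into a regex alternation here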
async def gather_tasks_results(context):
n_tasks = len(context.concurrent_tasks)
if context.debug:
print(f"Waiting for all {n_tasks} tasks results...")
for task_no in range(n_tasks):
context.tasks_result.append(await context.concurrent_tasks.pop())
n_completions = len(context.tasks_result)
return n_completions
async def wait_for_health_status(context,
base_url,
expected_http_status_code,
expected_health_status,
params=None,
slots_idle=None,
slots_processing=None,
expected_slots=None):
if context.debug:
print(f"Starting checking for health for expected_health_status={expected_health_status}")
timeout = 3 # seconds
interval = 0.5
counter = 0
async with aiohttp.ClientSession() as session:
while True:
async with await session.get(f'{base_url}/health', params=params) as health_response:
status_code = health_response.status
health = await health_response.json()
if context.debug:
print(f"HEALTH - response for expected health status='{expected_health_status}' on "
f"'{base_url}/health'?{params} is {health}")
if (status_code == expected_http_status_code
and health['status'] == expected_health_status
and (slots_idle is None or health['slots_idle'] == slots_idle)
and (slots_processing is None or health['slots_processing'] == slots_processing)):
if expected_slots is not None:
assert_slots_status(health['slots'], expected_slots)
return
await asyncio.sleep(interval)
counter += interval
if counter >= timeout:
# Sometimes health requests are triggered after completions are predicted
if expected_http_status_code == 503:
if len(context.tasks_result) == 0:
print("\x1b[5;37;43mWARNING: forcing concurrent tasks,"
" busy health check missed, probably too fast inference\x1b[0m")
n_completions = await gather_tasks_results(context)
if n_completions > 0:
return
assert False, 'timeout exceeded'
def assert_embeddings(embeddings):
assert len(embeddings) > 0
embeddings_computed = False
for emb in embeddings:
if emb != 0:
embeddings_computed = True
assert embeddings_computed, f"Embeddings: {embeddings}"
async def request_slots_status(context, expected_slots):
async with aiohttp.ClientSession() as session:
async with await session.get(f'{context.base_url}/slots') as slots_response:
assert slots_response.status == 200
slots = await slots_response.json()
assert_slots_status(slots, expected_slots)
def assert_slots_status(slots, expected_slots):
assert len(slots) == len(expected_slots)
for slot_id, (expected, slot) in enumerate(zip(expected_slots, slots)):
for key in expected:
assert expected[key] == slot[key], (f"invalid slot {slot_id}"
f" expected[{key}] != slot[{key}]"
f" = {expected[key]} != {slot[key]}")
def start_server_background(context):
context.server_path = '../../../build/bin/server'
if 'LLAMA_SERVER_BIN_PATH' in os.environ:
context.server_path = os.environ['LLAMA_SERVER_BIN_PATH']
server_args = [
'--host', context.server_fqdn,
'--port', context.server_port,
'--model', context.model_file
]
if context.server_continuous_batching:
server_args.append('--cont-batching')
if context.server_embeddings:
server_args.append('--embedding')
if context.server_metrics:
server_args.append('--metrics')
if context.model_alias is not None:
server_args.extend(['--alias', context.model_alias])
if context.n_ctx is not None:
server_args.extend(['--ctx-size', context.n_ctx])
if context.n_slots is not None:
server_args.extend(['--parallel', context.n_slots])
if context.n_server_predict is not None:
server_args.extend(['--n-predict', context.n_server_predict])
if context.server_api_key is not None:
server_args.extend(['--api-key', context.server_api_key])
if context.debug:
server_args.append('--verbose')
if 'SERVER_LOG_FORMAT_JSON' not in os.environ:
server_args.extend(['--log-format', "text"])
print(f"starting server with: {context.server_path}", *server_args)
context.server_process = subprocess.Popen(
[str(arg) for arg in [context.server_path, *server_args]],
close_fds=True)
print(f"server pid={context.server_process.pid}")


@@ -0,0 +1,21 @@
# run with ./test.sh --tags wrong_usage
@wrong_usage
Feature: Wrong usage of llama.cpp server
#3969 The user must always set --n-predict option
# to cap the number of tokens any completion request can generate
# or pass n_predict/max_tokens in the request.
Scenario: Infinite loop
Given a server listening on localhost:8080
And a model file stories260K.gguf
# Uncomment below to fix the issue
#And 64 server max tokens to predict
Then the server is starting
Given a prompt:
"""
Go to: infinite loop
"""
# Uncomment below to fix the issue
#And 128 max tokens to predict
Given concurrent completion requests
Then all prompts are predicted
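
# A client-side sketch of the same cap, assuming the server above on
# localhost:8080 (illustrative only):
#
#   import requests
#   requests.post("http://localhost:8080/completion",
#                 json={"prompt": "Go to: infinite loop", "n_predict": 128})
#
# Either the server-side --n-predict or the per-request n_predict is enough
# to bound the generation.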


@@ -0,0 +1,4 @@
aiohttp~=3.9.3
behave~=1.2.6
openai~=0.25.0
prometheus-client~=0.20.0

12
examples/server/tests/tests.sh Executable file

@@ -0,0 +1,12 @@
#!/bin/bash
set -eu
if [ $# -lt 1 ]
then
# Start @llama.cpp scenario
behave --summary --stop --no-capture --exclude 'issues|wrong_usages' --tags llama.cpp
else
behave "$@"
fi


@@ -14,6 +14,7 @@
 using json = nlohmann::json;

 extern bool server_verbose;
+extern bool server_log_json;

 #ifndef SERVER_VERBOSE
 #define SERVER_VERBOSE 1
@@ -27,14 +28,14 @@ extern bool server_verbose;
     { \
         if (server_verbose) \
         { \
-            server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \
+            server_log("VERB", __func__, __LINE__, MSG, __VA_ARGS__); \
         } \
     } while (0)
 #endif

-#define LOG_ERROR( MSG, ...) server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__)
-#define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
+#define LOG_ERROR( MSG, ...) server_log("ERR", __func__, __LINE__, MSG, __VA_ARGS__)
+#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
 #define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)

 //
 // parallel
@@ -49,7 +50,8 @@ enum server_state {
 enum task_type {
     TASK_TYPE_COMPLETION,
     TASK_TYPE_CANCEL,
-    TASK_TYPE_NEXT_RESPONSE
+    TASK_TYPE_NEXT_RESPONSE,
+    TASK_TYPE_METRICS
 };

 struct task_server {
@@ -132,26 +134,48 @@ struct completion_token_output
     std::string text_to_send;
 };

-static inline void server_log(const char *level, const char *function, int line,
-                              const char *message, const nlohmann::ordered_json &extra)
+static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra)
 {
-    nlohmann::ordered_json log
-    {
+    std::stringstream ss_tid;
+    ss_tid << std::this_thread::get_id();
+    json log = nlohmann::ordered_json{
+        {"tid", ss_tid.str()},
         {"timestamp", time(nullptr)},
-        {"level", level},
-        {"function", function},
-        {"line", line},
-        {"message", message},
     };

-    if (!extra.empty())
-    {
-        log.merge_patch(extra);
-    }
+    if (server_log_json) {
+        log.merge_patch(
+            {
+                {"level", level},
+                {"function", function},
+                {"line", line},
+                {"msg", message},
+            });
+        if (!extra.empty()) {
+            log.merge_patch(extra);
+        }

-    const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
-    printf("%.*s\n", (int)str.size(), str.data());
-    fflush(stdout);
+        std::cout << log.dump(-1, ' ', false, json::error_handler_t::replace) << "\n" << std::flush;
+    } else {
+        char buf[1024];
+        snprintf(buf, 1024, "%4s [%24s] %s", level, function, message);

+        if (!extra.empty()) {
+            log.merge_patch(extra);
+        }
+        std::stringstream ss;
+        ss << buf << " |";
+        for (const auto& el : log.items())
+        {
+            const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace);
+            snprintf(buf, 1024, " %s=%s", el.key().c_str(), value.c_str());
+            ss << buf;
+        }
+        const std::string str = ss.str();
+        printf("%.*s\n", (int)str.size(), str.data());
+        fflush(stdout);
+    }
 }

 //
@@ -167,20 +191,47 @@ static T json_value(const json &body, const std::string &key, const T &default_v
         : default_value;
 }

-inline std::string format_chatml(std::vector<json> messages)
-{
-    std::ostringstream chatml_msgs;
+// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
+inline bool verify_custom_template(const std::string & tmpl) {
+    llama_chat_message chat[] = {{"user", "test"}};
+    std::vector<char> buf(1);
+    int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, buf.data(), buf.size());
+    return res >= 0;
+}

-    for (auto it = messages.begin(); it != messages.end(); ++it) {
-        chatml_msgs << "<|im_start|>"
-                    << json_value(*it, "role", std::string("user")) << '\n';
-        chatml_msgs << json_value(*it, "content", std::string(""))
-                    << "<|im_end|>\n";
+// Format given chat. If tmpl is empty, we take the template from model metadata
+inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages)
+{
+    size_t alloc_size = 0;
+    // vector holding all allocated string to be passed to llama_chat_apply_template
+    std::vector<std::string> str(messages.size() * 2);
+    std::vector<llama_chat_message> chat(messages.size());

+    for (size_t i = 0; i < messages.size(); ++i) {
+        auto &curr_msg = messages[i];
+        str[i*2 + 0] = json_value(curr_msg, "role", std::string(""));
+        str[i*2 + 1] = json_value(curr_msg, "content", std::string(""));
+        alloc_size += str[i*2 + 1].length();
+        chat[i].role = str[i*2 + 0].c_str();
+        chat[i].content = str[i*2 + 1].c_str();
     }

-    chatml_msgs << "<|im_start|>assistant" << '\n';
+    const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
+    std::vector<char> buf(alloc_size * 2);

-    return chatml_msgs.str();
+    // run the first time to get the total output length
+    int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
+
+    // if it turns out that our buffer is too small, we resize it
+    if ((size_t) res > buf.size()) {
+        buf.resize(res);
+        res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
+    }
+
+    std::string formatted_chat(buf.data(), res);
+    LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
+
+    return formatted_chat;
 }

 //
@@ -190,6 +241,7 @@ inline std::string format_chatml(std::vector<json> messages)
 struct llama_server_queue {
     int id = 0;
     std::mutex mutex_tasks;
+    bool running;
     // queues
     std::vector<task_server> queue_tasks;
     std::vector<task_server> queue_tasks_deferred;
@@ -205,6 +257,7 @@ struct llama_server_queue {
         std::unique_lock<std::mutex> lock(mutex_tasks);
         if (task.id == -1) {
             task.id = id++;
+            LOG_VERBOSE("new task id", {{"new_id", task.id}});
         }
         queue_tasks.push_back(std::move(task));
         condition_tasks.notify_one();
@@ -220,7 +273,9 @@ struct llama_server_queue {
     // Get the next id for creating a new task
     int get_new_id() {
         std::unique_lock<std::mutex> lock(mutex_tasks);
-        return id++;
+        int new_id = id++;
+        LOG_VERBOSE("new task id", {{"new_id", new_id}});
+        return new_id;
     }

     // Register function to process a new task
@@ -248,12 +303,20 @@ struct llama_server_queue {
         queue_tasks_deferred.clear();
     }

-    // Start the main loop. This call is blocking
-    [[noreturn]]
+    // end the start_loop routine
+    void terminate() {
+        {
+            std::unique_lock<std::mutex> lock(mutex_tasks);
+            running = false;
+        }
+        condition_tasks.notify_all();
+    }
+
+    // Start the main loop.
     void start_loop() {
+        running = true;
         while (true) {
-            // new task arrived
-            LOG_VERBOSE("have new task", {});
+            LOG_VERBOSE("new task may arrive", {});
             {
                 while (true)
                 {
@@ -265,7 +328,7 @@ struct llama_server_queue {
                     task_server task = queue_tasks.front();
                     queue_tasks.erase(queue_tasks.begin());
                     lock.unlock();
-                    LOG_VERBOSE("callback_new_task", {});
+                    LOG_VERBOSE("callback_new_task", {{"task_id", task.id}});
                     callback_new_task(task);
                 }
                 LOG_VERBOSE("callback_all_task_finished", {});
@@ -294,8 +357,12 @@ struct llama_server_queue {
             {
                 std::unique_lock<std::mutex> lock(mutex_tasks);
                 if (queue_tasks.empty()) {
+                    if (!running) {
+                        LOG_VERBOSE("ending start_loop", {});
+                        return;
+                    }
                     condition_tasks.wait(lock, [&]{
-                        return !queue_tasks.empty();
+                        return (!queue_tasks.empty() || !running);
                     });
                 }
             }
@@ -342,11 +409,13 @@ struct llama_server_response {
     std::condition_variable condition_results;

     void add_waiting_task_id(int task_id) {
+        LOG_VERBOSE("waiting for task id", {{"task_id", task_id}});
         std::unique_lock<std::mutex> lock(mutex_results);
         waiting_task_ids.insert(task_id);
     }

     void remove_waiting_task_id(int task_id) {
+        LOG_VERBOSE("remove waiting for task id", {{"task_id", task_id}});
         std::unique_lock<std::mutex> lock(mutex_results);
         waiting_task_ids.erase(task_id);
     }
@@ -359,7 +428,6 @@ struct llama_server_response {
             condition_results.wait(lock, [&]{
                 return !queue_results.empty();
             });
-            LOG_VERBOSE("condition_results unblock", {});

             for (int i = 0; i < (int) queue_results.size(); i++)
             {
@@ -384,22 +452,22 @@ struct llama_server_response {
     // Send a new result to a waiting task_id
     void send(task_result result) {
         std::unique_lock<std::mutex> lock(mutex_results);
-        LOG_VERBOSE("send new result", {});
+        LOG_VERBOSE("send new result", {{"task_id", result.id}});
        for (auto& task_id : waiting_task_ids) {
            // LOG_TEE("waiting task id %i \n", task_id);
            // for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
            if (result.multitask_id == task_id)
            {
-                LOG_VERBOSE("callback_update_multitask", {});
+                LOG_VERBOSE("callback_update_multitask", {{"task_id", task_id}});
                callback_update_multitask(task_id, result.id, result);
                continue;
            }

            if (result.id == task_id)
            {
-                LOG_VERBOSE("queue_results.push_back", {});
+                LOG_VERBOSE("queue_results.push_back", {{"task_id", task_id}});
                queue_results.push_back(result);
-                condition_results.notify_one();
+                condition_results.notify_all();
                return;
            }
        }
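
For reference, the two output shapes the reworked server_log produces, with illustrative values: with server_log_json enabled, one JSON object per line, e.g. {"tid":"140519","timestamp":1708870000,"level":"INFO","function":"main","line":42,"msg":"server started"}; otherwise the text form built by the snprintf/stringstream path, e.g. INFO [                    main] server started | tid="140519" timestamp=1708870000.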


@@ -31,7 +31,8 @@ int main(int argc, char ** argv) {

     // init LLM

-    llama_backend_init(params.numa);
+    llama_backend_init();
+    llama_numa_init(params.numa);

     // initialize the model


@@ -50,7 +50,8 @@ int main(int argc, char ** argv) {
 #endif // LOG_DISABLE_LOGS

     // init llama.cpp
-    llama_backend_init(params.numa);
+    llama_backend_init();
+    llama_numa_init(params.numa);

     llama_model * model_tgt = NULL;
     llama_model * model_dft = NULL;


@@ -17,7 +17,7 @@ int main(int argc, char ** argv) {

     const bool printing_ids = argc > 3 && std::string(argv[3]) == "--ids";

-    llama_backend_init(false);
+    llama_backend_init();

     llama_model_params model_params = llama_model_default_params();
     model_params.vocab_only = true;


@ -1,5 +1,6 @@
#include "ggml.h" #include "ggml.h"
#include "ggml-alloc.h" #include "ggml-alloc.h"
#include "ggml-backend.h"
#include "common.h" #include "common.h"
#include "train.h" #include "train.h"
#include "llama.h" #include "llama.h"
@ -19,8 +20,6 @@
#pragma warning(disable: 4244 4267) // possible loss of data #pragma warning(disable: 4244 4267) // possible loss of data
#endif #endif
static const size_t tensor_alignment = 32;
struct my_llama_hparams { struct my_llama_hparams {
uint32_t n_vocab = 32000; uint32_t n_vocab = 32000;
uint32_t n_ctx = 512; uint32_t n_ctx = 512;
@ -51,14 +50,14 @@ struct my_llama_layer {
struct ggml_tensor * ffn_norm; struct ggml_tensor * ffn_norm;
// ff // ff
struct ggml_tensor * w1; struct ggml_tensor * ffn_gate; // w1
struct ggml_tensor * w2; struct ggml_tensor * ffn_down; // w2
struct ggml_tensor * w3; struct ggml_tensor * ffn_up; // w3
}; };
struct my_llama_model { struct my_llama_model {
struct ggml_context * ctx = NULL; struct ggml_context * ctx = NULL;
std::vector<uint8_t> data; ggml_backend_buffer_t data = NULL;
my_llama_hparams hparams; my_llama_hparams hparams;
@ -112,13 +111,13 @@ static const char * LLM_TENSOR_FFN_DOWN = "blk.%d.ffn_down";
static const char * LLM_TENSOR_FFN_UP = "blk.%d.ffn_up"; static const char * LLM_TENSOR_FFN_UP = "blk.%d.ffn_up";
static void print_params(struct my_llama_hparams * params) { static void print_params(struct my_llama_hparams * params) {
printf("%s: n_vocab: %d\n", __func__, params->n_vocab); printf("%s: n_vocab: %u\n", __func__, params->n_vocab);
printf("%s: n_ctx: %d\n", __func__, params->n_ctx); printf("%s: n_ctx: %u\n", __func__, params->n_ctx);
printf("%s: n_embd: %d\n", __func__, params->n_embd); printf("%s: n_embd: %u\n", __func__, params->n_embd);
printf("%s: n_head: %d\n", __func__, params->n_head); printf("%s: n_head: %u\n", __func__, params->n_head);
printf("%s: n_ff: %d\n", __func__, params->n_ff); printf("%s: n_ff: %u\n", __func__, params->n_ff);
printf("%s: n_layer: %d\n", __func__, params->n_layer); printf("%s: n_layer: %u\n", __func__, params->n_layer);
printf("%s: n_rot: %d\n", __func__, params->n_rot); printf("%s: n_rot: %u\n", __func__, params->n_rot);
} }
static void set_param_model(struct my_llama_model * model) { static void set_param_model(struct my_llama_model * model) {
@ -141,42 +140,9 @@ static void set_param_model(struct my_llama_model * model) {
ggml_set_param(ctx, layer.wv); ggml_set_param(ctx, layer.wv);
ggml_set_param(ctx, layer.wo); ggml_set_param(ctx, layer.wo);
ggml_set_param(ctx, layer.ffn_norm); ggml_set_param(ctx, layer.ffn_norm);
ggml_set_param(ctx, layer.w1); ggml_set_param(ctx, layer.ffn_gate);
ggml_set_param(ctx, layer.w2); ggml_set_param(ctx, layer.ffn_down);
ggml_set_param(ctx, layer.w3); ggml_set_param(ctx, layer.ffn_up);
}
}
static void alloc_model(struct ggml_allocr * alloc, struct my_llama_model * model) {
ggml_allocr_alloc(alloc, model->tok_embeddings);
ggml_allocr_alloc(alloc, model->norm);
ggml_allocr_alloc(alloc, model->output);
for (uint32_t i = 0; i < model->layers.size(); ++i) {
auto & layer = model->layers[i];
ggml_allocr_alloc(alloc, layer.attention_norm);
ggml_allocr_alloc(alloc, layer.wq);
ggml_allocr_alloc(alloc, layer.wk);
ggml_allocr_alloc(alloc, layer.wv);
ggml_allocr_alloc(alloc, layer.wo);
ggml_allocr_alloc(alloc, layer.ffn_norm);
ggml_allocr_alloc(alloc, layer.w1);
ggml_allocr_alloc(alloc, layer.w2);
ggml_allocr_alloc(alloc, layer.w3);
}
ggml_allocr_alloc(alloc, model->tok_embeddings->grad);
ggml_allocr_alloc(alloc, model->norm->grad);
ggml_allocr_alloc(alloc, model->output->grad);
for (uint32_t i = 0; i < model->layers.size(); ++i) {
auto & layer = model->layers[i];
ggml_allocr_alloc(alloc, layer.attention_norm->grad);
ggml_allocr_alloc(alloc, layer.wq->grad);
ggml_allocr_alloc(alloc, layer.wk->grad);
ggml_allocr_alloc(alloc, layer.wv->grad);
ggml_allocr_alloc(alloc, layer.wo->grad);
ggml_allocr_alloc(alloc, layer.ffn_norm->grad);
ggml_allocr_alloc(alloc, layer.w1->grad);
ggml_allocr_alloc(alloc, layer.w2->grad);
ggml_allocr_alloc(alloc, layer.w3->grad);
} }
} }
@ -232,9 +198,9 @@ static void init_model(struct my_llama_model * model) {
layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); layer.ffn_gate = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd); layer.ffn_down = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); layer.ffn_up = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
ggml_set_name(layer.attention_norm, tni(LLM_TENSOR_ATTN_NORM, i)); ggml_set_name(layer.attention_norm, tni(LLM_TENSOR_ATTN_NORM, i));
@ -245,24 +211,15 @@ static void init_model(struct my_llama_model * model) {
ggml_set_name(layer.ffn_norm, tni(LLM_TENSOR_FFN_NORM, i)); ggml_set_name(layer.ffn_norm, tni(LLM_TENSOR_FFN_NORM, i));
ggml_set_name(layer.w1, tni(LLM_TENSOR_FFN_GATE, i)); ggml_set_name(layer.ffn_gate, tni(LLM_TENSOR_FFN_GATE, i));
ggml_set_name(layer.w2, tni(LLM_TENSOR_FFN_DOWN, i)); ggml_set_name(layer.ffn_down, tni(LLM_TENSOR_FFN_DOWN, i));
ggml_set_name(layer.w3, tni(LLM_TENSOR_FFN_UP, i)); ggml_set_name(layer.ffn_up, tni(LLM_TENSOR_FFN_UP, i));
} }
set_param_model(model); set_param_model(model);
// measure data size
size_t size = 0;
for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
size += GGML_PAD(ggml_nbytes(t), tensor_alignment);
}
// allocate data // allocate data
struct ggml_allocr * alloc = NULL; model->data = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());
model->data.resize(size + tensor_alignment);
alloc = ggml_allocr_new(model->data.data(), model->data.size(), tensor_alignment);
alloc_model(alloc, model);
} }
static void randomize_model(struct my_llama_model * model, int seed, float mean, float std, float min, float max) { static void randomize_model(struct my_llama_model * model, int seed, float mean, float std, float min, float max) {
@ -287,9 +244,9 @@ static void randomize_model(struct my_llama_model * model, int seed, float mean,
randomize_tensor_normal(layer.ffn_norm, rnd); randomize_tensor_normal(layer.ffn_norm, rnd);
randomize_tensor_normal(layer.w1, rnd); randomize_tensor_normal(layer.ffn_gate, rnd);
randomize_tensor_normal(layer.w2, rnd); randomize_tensor_normal(layer.ffn_down, rnd);
randomize_tensor_normal(layer.w3, rnd); randomize_tensor_normal(layer.ffn_up, rnd);
} }
free_random_normal_distribution(rnd); free_random_normal_distribution(rnd);
@ -297,7 +254,7 @@ static void randomize_model(struct my_llama_model * model, int seed, float mean,
static struct ggml_tensor * llama_build_train_graphs( static struct ggml_tensor * llama_build_train_graphs(
struct my_llama_model * model, struct my_llama_model * model,
struct ggml_allocr * alloc, ggml_gallocr_t alloc,
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_cgraph * gf, struct ggml_cgraph * gf,
struct ggml_cgraph * gb, struct ggml_cgraph * gb,
@ -308,7 +265,8 @@ static struct ggml_tensor * llama_build_train_graphs(
const int n_tokens, const int n_tokens,
const int n_batch, const int n_batch,
const bool enable_flash_attn, const bool enable_flash_attn,
const bool enable_checkpointing) { const bool enable_checkpointing,
const bool measure_only) {
ggml_set_scratch(ctx, { 0, 0, nullptr, }); ggml_set_scratch(ctx, { 0, 0, nullptr, });
const int n_past = 0; const int n_past = 0;
@ -334,13 +292,7 @@ static struct ggml_tensor * llama_build_train_graphs(
// KQ_pos - contains the positions // KQ_pos - contains the positions
struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N); struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
ggml_allocr_alloc(alloc, KQ_pos); ggml_set_input(KQ_pos);
if (!ggml_allocr_is_measure(alloc)) {
int * data = (int *) KQ_pos->data;
for (int i = 0; i < N; ++i) {
data[i] = n_past + i;
}
}
// rope has so much parameters that we make a custom function for it // rope has so much parameters that we make a custom function for it
auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale] auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale]
@ -404,11 +356,11 @@ static struct ggml_tensor * llama_build_train_graphs(
struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, f_norm_rms_eps); set_name(t22, "t22"); assert_shape_2d(t22, n_embd, N*n_batch); struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, f_norm_rms_eps); set_name(t22, "t22"); assert_shape_2d(t22, n_embd, N*n_batch);
struct ggml_tensor * t23 = ggml_repeat (ctx, layer.ffn_norm, t22); set_name(t23, "t23"); assert_shape_2d(t23, n_embd, N*n_batch); struct ggml_tensor * t23 = ggml_repeat (ctx, layer.ffn_norm, t22); set_name(t23, "t23"); assert_shape_2d(t23, n_embd, N*n_batch);
struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); set_name(t24, "t24"); assert_shape_2d(t24, n_embd, N*n_batch); struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); set_name(t24, "t24"); assert_shape_2d(t24, n_embd, N*n_batch);
struct ggml_tensor * t25 = ggml_mul_mat (ctx, layer.w3, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch); struct ggml_tensor * t25 = ggml_mul_mat (ctx, layer.ffn_up, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch);
struct ggml_tensor * t26 = ggml_mul_mat (ctx, layer.w1, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch); struct ggml_tensor * t26 = ggml_mul_mat (ctx, layer.ffn_gate, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch);
struct ggml_tensor * t27 = ggml_silu (ctx, t26); set_name(t27, "t27"); assert_shape_2d(t27, n_ff, N*n_batch); struct ggml_tensor * t27 = ggml_silu (ctx, t26); set_name(t27, "t27"); assert_shape_2d(t27, n_ff, N*n_batch);
struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); set_name(t28, "t28"); assert_shape_2d(t28, n_ff, N*n_batch); struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); set_name(t28, "t28"); assert_shape_2d(t28, n_ff, N*n_batch);
struct ggml_tensor * t29 = ggml_mul_mat (ctx, layer.w2, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch); struct ggml_tensor * t29 = ggml_mul_mat (ctx, layer.ffn_down, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch);
struct ggml_tensor * t30 = ggml_add (ctx, t29, t21); set_name(t30, "t30"); assert_shape_2d(t30, n_embd, N*n_batch); struct ggml_tensor * t30 = ggml_add (ctx, t29, t21); set_name(t30, "t30"); assert_shape_2d(t30, n_embd, N*n_batch);
cur = t30; cur = t30;
checkpoints.push_back(cur); checkpoints.push_back(cur);
@ -448,21 +400,31 @@ static struct ggml_tensor * llama_build_train_graphs(
// KQ_pos // KQ_pos
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, 1.0f)); ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, 1.0f));
GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL); GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL);
ggml_set_input(t36->grad);
ggml_allocr_alloc(alloc, t36->grad);
// allocating checkpoints in one block to reduce memory fragmentation // allocating checkpoints in one block to reduce memory fragmentation
// note: they will be freed in reverse order // note: they will be freed in reverse order
for (int i = 0; i < (int) checkpoints.size(); ++i) { for (int i = 0; i < (int) checkpoints.size(); ++i) {
if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) { if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) {
ggml_allocr_alloc(alloc, checkpoints[i]); ggml_set_input(checkpoints[i]);
} }
} }
//int n_leafs_after = gb->n_leafs; //int n_leafs_after = gb->n_leafs;
//int n_nodes_after = gb->n_nodes; //int n_nodes_after = gb->n_nodes;
if (measure_only) {
// FIXME: will still allocate
ggml_gallocr_reserve(alloc, gb);
} else {
ggml_gallocr_alloc_graph(alloc, gb);
ggml_allocr_alloc_graph(alloc, gb); if (!measure_only) {
int * data = (int *) KQ_pos->data;
for (int i = 0; i < N; ++i) {
data[i] = n_past + i;
}
}
}
// remove the additional nodes and leafs // remove the additional nodes and leafs
for (int i = n_leafs_before; i < gb->n_leafs; ++i) { for (int i = n_leafs_before; i < gb->n_leafs; ++i) {
@ -559,9 +521,9 @@ static void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_contex
copy_tensor_by_name(layer.wv, f_ggml_ctx, tni(LLM_TENSOR_ATTN_V, i)); copy_tensor_by_name(layer.wv, f_ggml_ctx, tni(LLM_TENSOR_ATTN_V, i));
copy_tensor_by_name(layer.wo, f_ggml_ctx, tni(LLM_TENSOR_ATTN_OUT, i)); copy_tensor_by_name(layer.wo, f_ggml_ctx, tni(LLM_TENSOR_ATTN_OUT, i));
copy_tensor_by_name(layer.ffn_norm, f_ggml_ctx, tni(LLM_TENSOR_FFN_NORM, i)); copy_tensor_by_name(layer.ffn_norm, f_ggml_ctx, tni(LLM_TENSOR_FFN_NORM, i));
copy_tensor_by_name(layer.w1, f_ggml_ctx, tni(LLM_TENSOR_FFN_GATE, i)); copy_tensor_by_name(layer.ffn_gate, f_ggml_ctx, tni(LLM_TENSOR_FFN_GATE, i));
copy_tensor_by_name(layer.w2, f_ggml_ctx, tni(LLM_TENSOR_FFN_DOWN, i)); copy_tensor_by_name(layer.ffn_down, f_ggml_ctx, tni(LLM_TENSOR_FFN_DOWN, i));
copy_tensor_by_name(layer.w3, f_ggml_ctx, tni(LLM_TENSOR_FFN_UP, i)); copy_tensor_by_name(layer.ffn_up, f_ggml_ctx, tni(LLM_TENSOR_FFN_UP, i));
} }
} }
@ -702,9 +664,9 @@ static void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vo
gguf_add_tensor(fctx, layer.wv); gguf_add_tensor(fctx, layer.wv);
gguf_add_tensor(fctx, layer.wo); gguf_add_tensor(fctx, layer.wo);
gguf_add_tensor(fctx, layer.ffn_norm); gguf_add_tensor(fctx, layer.ffn_norm);
gguf_add_tensor(fctx, layer.w1); gguf_add_tensor(fctx, layer.ffn_gate);
gguf_add_tensor(fctx, layer.w2); gguf_add_tensor(fctx, layer.ffn_down);
gguf_add_tensor(fctx, layer.w3); gguf_add_tensor(fctx, layer.ffn_up);
} }
} }
@ -953,9 +915,9 @@ static int64_t get_parameter_count(struct my_llama_model* model) {
nx += ggml_nelements(layer.wv); nx += ggml_nelements(layer.wv);
nx += ggml_nelements(layer.wo); nx += ggml_nelements(layer.wo);
nx += ggml_nelements(layer.ffn_norm); nx += ggml_nelements(layer.ffn_norm);
nx += ggml_nelements(layer.w1); nx += ggml_nelements(layer.ffn_gate);
nx += ggml_nelements(layer.w2); nx += ggml_nelements(layer.ffn_down);
nx += ggml_nelements(layer.w3); nx += ggml_nelements(layer.ffn_up);
} }
return nx; return nx;
} }
@ -998,7 +960,7 @@ int main(int argc, char ** argv) {
struct ggml_opt_context * opt = train->opt; struct ggml_opt_context * opt = train->opt;
// set opt params from command line // set opt params from command line
opt->params = ggml_opt_default_params(GGML_OPT_ADAM); opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
opt->params.print_forward_graph = false; opt->params.print_forward_graph = false;
opt->params.print_backward_graph = false; opt->params.print_backward_graph = false;
opt->params.graph_size = LLAMA_TRAIN_MAX_NODES; opt->params.graph_size = LLAMA_TRAIN_MAX_NODES;
@@ -1046,7 +1008,7 @@ int main(int argc, char ** argv) {
     printf("%s: seen train_samples %llu\n", __func__, (long long unsigned) train->train_samples);
     printf("%s: seen train_tokens %llu\n", __func__, (long long unsigned) train->train_tokens);
     printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs);
-    printf("%s: model_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(model.ctx) + model.data.size()), (float) (ggml_used_mem(model.ctx) + model.data.size()) / (1024.0f*1024.0f));
+    printf("%s: model_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(model.ctx) + ggml_backend_buffer_get_size(model.data)), (float) (ggml_used_mem(model.ctx) + ggml_backend_buffer_get_size(model.data)) / (1024.0f*1024.0f));
 
     if (params.only_write_model) {
         save_train_files_data save_data;
@@ -1073,11 +1035,6 @@ int main(int argc, char ** argv) {
     int n_vocab = model.hparams.n_vocab;
     int n_batch = params.common.n_batch;
 
-    std::vector<uint8_t> mem_input_data;
-    std::vector<uint8_t> mem_compute_data;
-
-    ggml_allocr * alloc = NULL;
-
     // context for input tensors without their data
     struct ggml_init_params ctx_input_params = {
         ggml_tensor_overhead() * 2, // mem_size
@@ -1091,16 +1048,10 @@ int main(int argc, char ** argv) {
     struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
 
     // measure required memory for input tensors
-    size_t max_input_size = GGML_PAD(ggml_nbytes(tokens_input), tensor_alignment) +
-                            GGML_PAD(ggml_nbytes(target_probs), tensor_alignment) +
-                            tensor_alignment;
-    printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));
-
     // allocate input tensors
-    mem_input_data.resize(max_input_size);
-    alloc = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment);
-    ggml_allocr_alloc(alloc, tokens_input);
-    ggml_allocr_alloc(alloc, target_probs);
+    ggml_backend_buffer_t input_data = ggml_backend_alloc_ctx_tensors_from_buft(ctx_input, ggml_backend_cpu_buffer_type());
+    size_t max_input_size = ggml_backend_buffer_get_size(input_data);
+    printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));
 
     // context for compute tensors without their data
     const size_t estimated_compute_size_wo_data = (
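For readers following the API change in the hunk above: the old flow measured sizes by hand (GGML_PAD over each tensor plus alignment slack) and then bound tensors into a caller-owned byte vector, while the new flow lets the backend do both steps in one call. A minimal sketch of the new pattern, assuming only the public ggml headers and a context created with no_alloc = true (the 512-token shape and the alloc_inputs name are illustrative, not part of the example program):

#include <stdio.h>
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// sketch: create input tensors without data, then have the backend size and
// allocate a single CPU buffer holding every tensor in the context
static ggml_backend_buffer_t alloc_inputs(struct ggml_context * ctx_input,
                                          struct ggml_tensor ** tokens_out) {
    struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx_input, GGML_TYPE_I32, 512);
    *tokens_out = tokens;

    // replaces the manual GGML_PAD + ggml_allocr_new + ggml_allocr_alloc sequence
    ggml_backend_buffer_t buf =
        ggml_backend_alloc_ctx_tensors_from_buft(ctx_input, ggml_backend_cpu_buffer_type());
    printf("input buffer: %zu bytes\n", ggml_backend_buffer_get_size(buf));
    return buf;
}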
@@ -1127,7 +1078,7 @@ int main(int argc, char ** argv) {
     // find best evaluation order
     for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
         ctx_compute = ggml_init(ctx_compute_params);
-        alloc = ggml_allocr_new_measure(tensor_alignment);
+        ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
         gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
         gf->order = (enum ggml_cgraph_eval_order) order;
         gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
@@ -1140,9 +1091,10 @@ int main(int argc, char ** argv) {
             &logits, tokens_input, target_probs,
             n_tokens, n_batch,
             params.common.use_flash,
-            params.common.use_checkpointing
+            params.common.use_checkpointing,
+            true
         );
-        size_t max_compute_size = ggml_allocr_max_size(alloc) + tensor_alignment;
+        size_t max_compute_size = ggml_gallocr_get_buffer_size(alloc, 0); // FIXME: this will still allocate the buffer
         if (max_compute_size < best_compute_size) {
             best_compute_size = max_compute_size;
             best_order = gf->order;
@@ -1157,9 +1109,8 @@ int main(int argc, char ** argv) {
         "invalid");
 
     // allocate compute tensors
-    mem_compute_data.resize(max_compute_size);
     ctx_compute = ggml_init(ctx_compute_params);
-    alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment);
+    ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
     gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
     gf->order = best_order;
     gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
@@ -1172,7 +1123,8 @@ int main(int argc, char ** argv) {
         &logits, tokens_input, target_probs,
         n_tokens, n_batch,
         params.common.use_flash,
-        params.common.use_checkpointing
+        params.common.use_checkpointing,
+        false
     );
 
     std::vector<llama_token> train_tokens;
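The order-search loop above now measures each candidate evaluation order with a throwaway graph allocator rather than a measure allocator. A compact sketch of the same pattern (build_graph is a hypothetical stand-in for the example's own graph construction, not a ggml API):

#include <stdint.h> // SIZE_MAX

// sketch: keep the evaluation order whose compute buffer comes out smallest
size_t best_size = SIZE_MAX;
enum ggml_cgraph_eval_order best_order = GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT;
for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
    ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
    struct ggml_cgraph * gb = build_graph((enum ggml_cgraph_eval_order) order); // hypothetical helper
    ggml_gallocr_reserve(alloc, gb);                    // sizes (and, per the FIXME above, allocates) the buffer
    size_t sz = ggml_gallocr_get_buffer_size(alloc, 0); // single-buffer case: buffer id 0
    if (sz < best_size) {
        best_size  = sz;
        best_order = (enum ggml_cgraph_eval_order) order;
    }
    ggml_gallocr_free(alloc);                           // drop the measuring allocator
}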

flake.lock (generated)

@@ -20,11 +20,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1706732774,
-        "narHash": "sha256-hqJlyJk4MRpcItGYMF+3uHe8HvxNETWvlGtLuVpqLU0=",
+        "lastModified": 1708118438,
+        "narHash": "sha256-kk9/0nuVgA220FcqH/D2xaN6uGyHp/zoxPNUmPCMmEE=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "b8b232ae7b8b144397fdb12d20f592e5e7c1a64d",
+        "rev": "5863c27340ba4de8f83e7e3c023b9599c3cb3c80",
         "type": "github"
       },
       "original": {


@ -150,6 +150,7 @@
packages = packages =
{ {
default = config.legacyPackages.llamaPackages.llama-cpp; default = config.legacyPackages.llamaPackages.llama-cpp;
vulkan = config.packages.default.override { useVulkan = true; };
} }
// lib.optionalAttrs pkgs.stdenv.isLinux { // lib.optionalAttrs pkgs.stdenv.isLinux {
opencl = config.packages.default.override { useOpenCL = true; }; opencl = config.packages.default.override { useOpenCL = true; };
@ -157,7 +158,6 @@
mpi-cpu = config.packages.default.override { useMpi = true; }; mpi-cpu = config.packages.default.override { useMpi = true; };
mpi-cuda = config.packages.default.override { useMpi = true; }; mpi-cuda = config.packages.default.override { useMpi = true; };
vulkan = config.packages.default.override { useVulkan = true; };
} }
// lib.optionalAttrs (system == "x86_64-linux") { // lib.optionalAttrs (system == "x86_64-linux") {
rocm = config.legacyPackages.llamaPackagesRocm.llama-cpp; rocm = config.legacyPackages.llamaPackagesRocm.llama-cpp;

(file diff suppressed because it is too large)

@@ -6,88 +6,62 @@
 extern "C" {
 #endif
 
-struct ggml_backend;
-struct ggml_backend_buffer;
-struct ggml_backend_buffer_type;
-
-//
-// Legacy API
-//
-
-typedef struct ggml_allocr * ggml_allocr_t;
-
-// initialize allocator for use with CPU backend only
-GGML_API ggml_allocr_t ggml_allocr_new(void * data, size_t size, size_t alignment);
-GGML_API ggml_allocr_t ggml_allocr_new_measure(size_t alignment);
-
-// initialize allocator for use with ggml-backend
-GGML_API ggml_allocr_t ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer);
-GGML_API ggml_allocr_t ggml_allocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
-GGML_API ggml_allocr_t ggml_allocr_new_measure_from_backend(struct ggml_backend * backend);
-
-GGML_API struct ggml_backend_buffer * ggml_allocr_get_buffer(ggml_allocr_t alloc);
-
-// tell the allocator to parse nodes following the order described in the list
-// you should call this if your graph are optimized to execute out-of-order
-GGML_API void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n);
-
-GGML_API void   ggml_allocr_free       (ggml_allocr_t alloc);
-GGML_API bool   ggml_allocr_is_measure (ggml_allocr_t alloc);
-GGML_API void   ggml_allocr_reset      (ggml_allocr_t alloc);
-GGML_API void   ggml_allocr_alloc      (ggml_allocr_t alloc, struct ggml_tensor * tensor);
-GGML_API size_t ggml_allocr_max_size   (ggml_allocr_t alloc);
-GGML_API size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph);
-
-//
-// ggml-backend v2 API
-//
-
-// Separate tensor and graph allocator objects
-// This is necessary for multi-backend allocation because the graph allocator needs to use multiple tensor allocators
-// The original API is kept as a wrapper around the new API
+typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
+typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
+typedef struct ggml_backend * ggml_backend_t;
 
 // Tensor allocator
 typedef struct ggml_tallocr * ggml_tallocr_t;
 
-GGML_API ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment);
-GGML_API ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment);
-GGML_API ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size);
-GGML_API ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
-GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer);
-GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft);
-GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend);
-
-GGML_API struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t talloc);
-
-GGML_API void   ggml_tallocr_free       (ggml_tallocr_t talloc);
-GGML_API bool   ggml_tallocr_is_measure (ggml_tallocr_t talloc);
-GGML_API void   ggml_tallocr_reset      (ggml_tallocr_t talloc);
-GGML_API void   ggml_tallocr_alloc      (ggml_tallocr_t talloc, struct ggml_tensor * tensor);
-GGML_API size_t ggml_tallocr_max_size   (ggml_tallocr_t talloc);
+GGML_API ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer);
+GGML_API void           ggml_tallocr_free(ggml_tallocr_t talloc);
+GGML_API void           ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor);
 
 // Graph allocator
+/*
+  Example usage:
+    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
+
+    // optional: create a worst-case graph and reserve the buffers to avoid reallocations
+    ggml_gallocr_reserve(galloc, build_graph(max_batch));
+
+    // allocate the graph
+    struct ggml_cgraph * graph = build_graph(batch);
+    ggml_gallocr_alloc_graph(galloc, graph);
+    printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0));
+
+    // evaluate the graph
+    ggml_backend_graph_compute(backend, graph);
+*/
+
+// special tensor flags for use with the graph allocator:
+//   ggml_set_input(): all input tensors are allocated at the beginning of the graph in non-overlapping addresses
+//   ggml_set_output(): output tensors are never freed and never overwritten
+
 typedef struct ggml_gallocr * ggml_gallocr_t;
 
-GGML_API ggml_gallocr_t ggml_gallocr_new(void);
-GGML_API void   ggml_gallocr_free(ggml_gallocr_t galloc);
+GGML_API ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft);
+GGML_API ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs);
+GGML_API void           ggml_gallocr_free(ggml_gallocr_t galloc);
 
-GGML_API void   ggml_gallocr_set_parse_seq(ggml_gallocr_t galloc, const int * list, int n);
-GGML_API size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, struct ggml_cgraph * graph);
+// pre-allocate buffers from a measure graph - does not allocate or modify the graph
+// call with a worst-case graph to avoid buffer reallocations
+// not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
+// returns false if the buffer allocation failed
+GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
+GGML_API bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids);
 
-// Allocate tensors from the allocators given by the hash table
-GGML_API void ggml_gallocr_alloc_graph_n(
-    ggml_gallocr_t galloc,
-    struct ggml_cgraph * graph,
-    struct ggml_hash_set hash_set,
-    ggml_tallocr_t * hash_node_talloc);
+// automatic reallocation if the topology changes when using a single buffer
+// returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)
+GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
+
+GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
 
 // Utils
 // Create a buffer and allocate all the tensors in a ggml_context
-GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, struct ggml_backend_buffer_type * buft);
-GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, struct ggml_backend * backend);
+GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
+GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
 
 #ifdef __cplusplus
 }
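The header comment above gives the shape of the new API; a self-contained sketch of the single-buffer flow on the CPU backend follows (the tiny mat-mul graph and the context sizing are illustrative, and error checks are omitted):

#include <stdio.h>
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

int main(void) {
    // metadata-only context: tensor data will live in backend buffers
    struct ggml_init_params ip = {
        /*.mem_size   =*/ ggml_tensor_overhead()*8 + ggml_graph_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 4);
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 4);
    ggml_set_input(a);                       // inputs get stable, non-overlapping addresses
    ggml_set_input(b);
    struct ggml_tensor * c = ggml_mul_mat(ctx, a, b);
    ggml_set_output(c);                      // never freed or overwritten by the allocator

    struct ggml_cgraph * graph = ggml_new_graph(ctx);
    ggml_build_forward_expand(graph, c);

    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
    ggml_gallocr_alloc_graph(galloc, graph); // single buffer: reallocates automatically
    printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0));

    float ones[16];
    for (int i = 0; i < 16; ++i) ones[i] = 1.0f;
    ggml_backend_tensor_set(a, ones, 0, sizeof(ones));
    ggml_backend_tensor_set(b, ones, 0, sizeof(ones));

    ggml_backend_t backend = ggml_backend_cpu_init();
    ggml_backend_graph_compute(backend, graph);

    ggml_backend_free(backend);
    ggml_gallocr_free(galloc);
    ggml_free(ctx);
    return 0;
}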

(file diff suppressed because it is too large)

@@ -83,8 +83,9 @@ extern "C" {
     GGML_API ggml_backend_t ggml_backend_cpu_init(void);
 
     GGML_API GGML_CALL bool ggml_backend_is_cpu                (ggml_backend_t backend);
-    GGML_API           void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
+    GGML_API           void ggml_backend_cpu_set_n_threads     (ggml_backend_t backend_cpu, int n_threads);
+    GGML_API           void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
 
     // Create a backend buffer from an existing pointer
     GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
@@ -129,11 +130,7 @@ extern "C" {
         // in build_graph:
         build_graph(...) {
-            // allocating tensors in a specific backend (optional, recommended: pre-allocate inputs in a different buffer)
-            alloc_cpu = ggml_backend_sched_get_allocr(sched, backend_cpu);
-            ggml_allocr_alloc(alloc_cpu, tensor);
-
-            // manually assigning nodes to a backend (optional, shouldn't be needed in most cases)
+            // manually assign nodes to a backend (optional, should not be needed in most cases)
             struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
             ggml_backend_sched_set_node_backend(sched, node, backend_gpu);
         }
@@ -163,20 +160,19 @@ extern "C" {
     GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
     GGML_API void                 ggml_backend_sched_free(ggml_backend_sched_t sched);
     // Initialize backend buffers from a measure graph
-    GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
+    GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
     // Get the number of splits of the last graph
     GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
 
-    GGML_API ggml_tallocr_t        ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend);
-    GGML_API ggml_backend_buffer_t ggml_backend_sched_get_buffer (ggml_backend_sched_t sched, ggml_backend_t backend);
+    GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
 
     GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
     GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
 
     // Allocate and compute graph on the backend scheduler
-    GGML_API void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+    GGML_API bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
 
-    // Reset all assignments and allocators - must be called before using the sched allocators to allocate inputs
+    // Reset all assignments and allocators - must be called before changing the node backends
     GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
 
     // Set a callback to be called for each resulting node during graph compute
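Both scheduler entry points that used to return void now report failure, so callers can recover from allocation errors instead of aborting. A sketch of the updated call sequence, assuming backend_gpu, backend_cpu, and the two build_graph* helpers already exist in the caller (they are not ggml APIs), and that passing NULL buffer types selects each backend's default:

ggml_backend_t backends[2] = { backend_gpu, backend_cpu };
ggml_backend_sched_t sched = ggml_backend_sched_new(backends, NULL, 2, GGML_DEFAULT_GRAPH_SIZE);

// size the per-backend buffers once, from a worst-case graph
if (!ggml_backend_sched_reserve(sched, build_graph_worst_case())) {
    fprintf(stderr, "failed to reserve scheduler buffers\n");
}

struct ggml_cgraph * graph = build_graph();
if (!ggml_backend_sched_graph_compute(sched, graph)) { // bool instead of void now
    fprintf(stderr, "graph compute failed\n");
}

printf("gpu buffer: %zu bytes\n", ggml_backend_sched_get_buffer_size(sched, backend_gpu));
ggml_backend_sched_reset(sched); // required before changing node->backend assignments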

(file diff suppressed because it is too large)

@@ -53,11 +53,23 @@ extern "C" {
 //
 
 #include <arm_neon.h>
 
-#define GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x))
-#define GGML_COMPUTE_FP32_TO_FP16(x) (x)
+#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
 
-#define GGML_FP16_TO_FP32(x) ((float) (x))
-#define GGML_FP32_TO_FP16(x) (x)
+#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+
+static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
+    __fp16 tmp;
+    memcpy(&tmp, &h, sizeof(ggml_fp16_t));
+    return (float)tmp;
+}
+
+static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
+    ggml_fp16_t res;
+    __fp16 tmp = f;
+    memcpy(&res, &tmp, sizeof(ggml_fp16_t));
+    return res;
+}
 
 #else
 
@@ -214,8 +226,7 @@ extern float ggml_table_f32_f16[1 << 16];
 // On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
 // so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
 // This is also true for POWER9.
-#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
+#if !defined(GGML_FP16_TO_FP32)
 inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
     uint16_t s;
     memcpy(&s, &f, sizeof(uint16_t));
@@ -223,8 +234,10 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
 }
 
 #define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
-#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+#endif
 
+#if !defined(GGML_FP32_TO_FP16)
+#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
 #endif
 
 #define GGML_HASHTABLE_FULL ((size_t)-1)
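On targets without native fp16 conversion, the non-NEON branch pairs GGML_COMPUTE_FP16_TO_FP32 with the 64K-entry ggml_table_f32_f16 so the hot path is a single load. A sketch of how the table and the lookup above fit together (table_f32_f16 stands in for ggml_table_f32_f16, which ggml fills once during initialization):

#include <stdint.h>
#include <string.h>

static float table_f32_f16[1 << 16]; // stand-in for ggml_table_f32_f16

static void init_fp16_table(void) {
    for (uint32_t i = 0; i < (1u << 16); ++i) {
        uint16_t bits = (uint16_t) i;
        ggml_fp16_t h;
        memcpy(&h, &bits, sizeof(h));
        table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(h); // slow path, run once per entry
    }
}

static inline float fp16_to_fp32_fast(ggml_fp16_t f) {
    uint16_t s;
    memcpy(&s, &f, sizeof(s)); // same bit-copy idiom as ggml_lookup_fp16_to_fp32
    return table_f32_f16[s];
}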


@@ -61,6 +61,9 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS,
     GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS,
     GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL,
     GGML_METAL_KERNEL_TYPE_GET_ROWS_I32,
     GGML_METAL_KERNEL_TYPE_RMS_NORM,
     GGML_METAL_KERNEL_TYPE_GROUP_NORM,
@@ -83,6 +86,9 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32,
   //GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16,
     GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32,
@@ -101,6 +107,9 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32,
@@ -116,6 +125,9 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32,
@@ -131,6 +143,9 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32,
     GGML_METAL_KERNEL_TYPE_ROPE_F32,
     GGML_METAL_KERNEL_TYPE_ROPE_F16,
     GGML_METAL_KERNEL_TYPE_ALIBI_F32,
@@ -176,7 +191,7 @@ struct ggml_metal_context {
 // MSL code
 // TODO: move the contents here when ready
 //       for now it is easier to work in a separate file
-//static NSString * const msl_library_source = @"see metal.metal";
+// static NSString * const msl_library_source = @"see metal.metal";
 
 // Here to assist with NSBundle Path Hack
 @interface GGMLMetalClass : NSObject
@@ -272,6 +287,14 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
                 return NULL;
             }
         } else {
+#if GGML_METAL_EMBED_LIBRARY
+            GGML_METAL_LOG_INFO("%s: using embedded metal library\n", __func__);
+
+            extern const char ggml_metallib_start[];
+            extern const char ggml_metallib_end[];
+
+            NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
+#else
             GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
 
             NSString * sourcePath;
@@ -294,6 +317,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
                 GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
                 return NULL;
             }
+#endif
 
         @autoreleasepool {
             // dictionary of preprocessor macros
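The embedded-library branch above expects the linker to provide ggml_metallib_start/ggml_metallib_end bracketing the raw .metal source. One common way to produce such symbols is an assembler stub that .incbin's the file; a hedged sketch of a build-time generator for that stub follows (the section name and the Mach-O underscore prefix are assumptions about the build in use, not something this diff specifies):

#include <stdio.h>

// emits an assembly stub on stdout; assemble the output and link it in
int main(void) {
    printf(".section __DATA,__ggml_metallib\n");
    printf(".globl _ggml_metallib_start\n");
    printf("_ggml_metallib_start:\n");
    printf(".incbin \"ggml-metal.metal\"\n");
    printf(".globl _ggml_metallib_end\n");
    printf("_ggml_metallib_end:\n");
    return 0;
}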
@@ -433,6 +457,9 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS, get_rows_iq2_xxs, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS, get_rows_iq2_xs, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS, get_rows_iq3_xxs, true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S, get_rows_iq3_s, true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S, get_rows_iq1_s, true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL, get_rows_iq4_nl, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, get_rows_i32, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM, rms_norm, ctx->support_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM, group_norm, ctx->support_simdgroup_reduction);
@@ -455,6 +482,9 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32, mul_mv_iq2_xxs_f32, ctx->support_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32, mul_mv_iq2_xs_f32, ctx->support_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32, mul_mv_iq3_xxs_f32, ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32, mul_mv_iq3_s_f32, ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32, mul_mv_iq1_s_f32, ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32, mul_mv_iq4_nl_f32, ctx->support_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, mul_mv_id_f32_f32, ctx->support_simdgroup_reduction);
       //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16, mul_mv_id_f16_f16, ctx->support_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32, mul_mv_id_f16_f32, ctx->support_simdgroup_reduction);
@@ -473,6 +503,9 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32, mul_mv_id_iq2_xxs_f32, ctx->support_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32, mul_mv_id_iq2_xs_f32, ctx->support_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32, mul_mv_id_iq3_xxs_f32, ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32, mul_mv_id_iq3_s_f32, ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32, mul_mv_id_iq1_s_f32, ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32, mul_mv_id_iq4_nl_f32, ctx->support_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, mul_mm_f32_f32, ctx->support_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32, mul_mm_f16_f32, ctx->support_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32, mul_mm_q4_0_f32, ctx->support_simdgroup_mm);
@@ -488,6 +521,9 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32, mul_mm_iq2_xxs_f32, ctx->support_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32, mul_mm_iq2_xs_f32, ctx->support_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32, mul_mm_iq3_xxs_f32, ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32, mul_mm_iq3_s_f32, ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32, mul_mm_iq1_s_f32, ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, mul_mm_iq4_nl_f32, ctx->support_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, mul_mm_id_f32_f32, ctx->support_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32, mul_mm_id_f16_f32, ctx->support_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32, mul_mm_id_q4_0_f32, ctx->support_simdgroup_mm);
@@ -503,6 +539,9 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32, mul_mm_id_iq2_xxs_f32, ctx->support_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, mul_mm_id_iq2_xs_f32, ctx->support_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, mul_mm_id_iq3_xxs_f32, ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32, mul_mm_id_iq3_s_f32, ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, mul_mm_id_iq1_s_f32, ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, mul_mm_id_iq4_nl_f32, ctx->support_simdgroup_mm);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F32, rope_f32, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F16, rope_f16, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ALIBI_F32, alibi_f32, true);
@@ -687,6 +726,7 @@ static bool ggml_metal_graph_compute(
         struct ggml_metal_context * ctx,
                struct ggml_cgraph * gf) {
 
+    @autoreleasepool {
     MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor;
     edesc.dispatchType = MTLDispatchTypeSerial;
@@ -727,6 +767,7 @@ static bool ggml_metal_graph_compute(
         size_t offs_src0 = 0;
         size_t offs_src1 = 0;
+        size_t offs_src2 = 0;
         size_t offs_dst  = 0;
 
         id<MTLCommandBuffer> command_buffer  = command_buffers[cb_idx];
@@ -745,6 +786,7 @@ static bool ggml_metal_graph_compute(
                 struct ggml_tensor * src0 = gf->nodes[i]->src[0];
                 struct ggml_tensor * src1 = gf->nodes[i]->src[1];
+                struct ggml_tensor * src2 = gf->nodes[i]->src[2];
                 struct ggml_tensor * dst  = gf->nodes[i];
 
                 switch (dst->op) {
@@ -806,6 +848,7 @@ static bool ggml_metal_graph_compute(
                 id<MTLBuffer> id_src0 = src0 ? ggml_metal_get_buffer(src0, &offs_src0) : nil;
                 id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(src1, &offs_src1) : nil;
+                id<MTLBuffer> id_src2 = src2 ? ggml_metal_get_buffer(src2, &offs_src2) : nil;
                 id<MTLBuffer> id_dst  = dst  ? ggml_metal_get_buffer(dst,  &offs_dst)  : nil;
 
                 //GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op));
@@ -1187,7 +1230,16 @@ static bool ggml_metal_graph_compute(
                             pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX].pipeline;
                         }
 
                         const float scale = ((float *) dst->op_params)[0];
+                        const float max_bias = ((float *) dst->op_params)[1];
+
+                        const int64_t nrows_x = ggml_nrows(src0);
+                        const int64_t nrows_y = src0->ne[1];
+                        const uint32_t n_head_kv = nrows_x/nrows_y;
+                        const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
+
+                        const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
+                        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
 
                         [encoder setComputePipelineState:pipeline];
                         [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
@@ -1196,11 +1248,20 @@ static bool ggml_metal_graph_compute(
                         } else {
                             [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
                         }
-                        [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
-                        [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
-                        [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
-                        [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
-                        [encoder setBytes:&scale length:sizeof(scale) atIndex:6];
+                        if (id_src2) {
+                            [encoder setBuffer:id_src2 offset:offs_src2 atIndex:2];
+                        } else {
+                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:2];
+                        }
+                        [encoder setBuffer:id_dst offset:offs_dst atIndex:3];
+                        [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:4];
+                        [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:5];
+                        [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:6];
+                        [encoder setBytes:&scale length:sizeof(scale) atIndex:7];
+                        [encoder setBytes:&max_bias length:sizeof(max_bias) atIndex:8];
+                        [encoder setBytes:&m0 length:sizeof(m0) atIndex:9];
+                        [encoder setBytes:&m1 length:sizeof(m1) atIndex:10];
+                        [encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:11];
 
                         [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
                         [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
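The m0/m1 factors staged into the soft-max kernel above expand into one slope per attention head, following the usual ALiBi scheme: the first n_head_log2 heads take successive powers of m0, and the remainder take odd powers of m1. Restated as a standalone sketch (illustrative C mirroring what the Metal kernel computes per head; not a ggml API):

#include <math.h>
#include <stdint.h>

static float alibi_slope(uint32_t n_head_log2, float m0, float m1, uint32_t h) {
    return h < n_head_log2
        ? powf(m0, (float)(h + 1))                    // first group: m0^1, m0^2, ...
        : powf(m1, (float)(2*(h - n_head_log2) + 1)); // second group: m1^1, m1^3, ...
}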
@@ -1296,6 +1357,9 @@ static bool ggml_metal_graph_compute(
                             case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32].pipeline; break;
                             case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break;
                             case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32].pipeline; break;
+                            case GGML_TYPE_IQ3_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32  ].pipeline; break;
+                            case GGML_TYPE_IQ1_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32  ].pipeline; break;
+                            case GGML_TYPE_IQ4_NL:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break;
                             default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
                         }
@@ -1430,6 +1494,24 @@ static bool ggml_metal_graph_compute(
                                     nth1 = 16;
                                     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32].pipeline;
                                 } break;
+                            case GGML_TYPE_IQ3_S:
+                                {
+                                    nth0 = 4;
+                                    nth1 = 16;
+                                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32].pipeline;
+                                } break;
+                            case GGML_TYPE_IQ1_S:
+                                {
+                                    nth0 = 4;
+                                    nth1 = 16;
+                                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32].pipeline;
+                                } break;
+                            case GGML_TYPE_IQ4_NL:
+                                {
+                                    nth0 = 4;
+                                    nth1 = 16;
+                                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32].pipeline;
+                                } break;
                             default:
                                 {
                                     GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
@@ -1464,7 +1546,7 @@ static bool ggml_metal_graph_compute(
                         if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
                             src0t == GGML_TYPE_Q5_0 || src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 ||
-                            src0t == GGML_TYPE_Q2_K) { // || src0t == GGML_TYPE_Q4_K) {
+                            src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_IQ1_S) { // || src0t == GGML_TYPE_Q4_K) {
                             [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                         }
                         else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) {
@@ -1472,11 +1554,16 @@ static bool ggml_metal_graph_compute(
                             [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
                             [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                         }
-                        else if (src0t == GGML_TYPE_IQ3_XXS) {
-                            const int mem_size = 256*4+128;
+                        else if (src0t == GGML_TYPE_IQ3_XXS || src0t == GGML_TYPE_IQ3_S) {
+                            const int mem_size = src0t == GGML_TYPE_IQ3_XXS ? 256*4+128 : 512*4;
                             [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
                             [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                         }
+                        else if (src0t == GGML_TYPE_IQ4_NL) {
+                            const int mem_size = 32*sizeof(float);
+                            [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
+                            [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                        }
                        else if (src0t == GGML_TYPE_Q4_K) {
                             [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                         }
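For reference, the threadgroup scratch sizes chosen in the dispatch above work out as follows (the byte arithmetic is taken directly from the code; what each kernel stores in its scratch is not restated here):

// threadgroup memory per IQ variant, as dispatched above
enum {
    MEM_IQ3_XXS = 256*4 + 128,      // 1152 bytes
    MEM_IQ3_S   = 512*4,            // 2048 bytes
    MEM_IQ4_NL  = 32*sizeof(float), //  128 bytes
};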
@@ -1513,8 +1600,6 @@ static bool ggml_metal_graph_compute(
                         // max size of the src1ids array in the kernel stack
                         GGML_ASSERT(ne11 <= 512);
 
-                        struct ggml_tensor * src2 = gf->nodes[i]->src[2];
-
                         const int64_t  ne20 = src2 ? src2->ne[0] : 0;
                         const int64_t  ne21 = src2 ? src2->ne[1] : 0;
                         const int64_t  ne22 = src2 ? src2->ne[2] : 0;
@@ -1572,6 +1657,9 @@ static bool ggml_metal_graph_compute(
                             case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32].pipeline; break;
                             case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break;
                             case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32].pipeline; break;
+                            case GGML_TYPE_IQ3_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32  ].pipeline; break;
+                            case GGML_TYPE_IQ1_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32  ].pipeline; break;
+                            case GGML_TYPE_IQ4_NL:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32 ].pipeline; break;
                             default: GGML_ASSERT(false && "MUL_MAT_ID not implemented");
                         }
@@ -1709,6 +1797,24 @@ static bool ggml_metal_graph_compute(
                                     nth1 = 16;
                                     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32].pipeline;
                                 } break;
+                            case GGML_TYPE_IQ3_S:
+                                {
+                                    nth0 = 4;
+                                    nth1 = 16;
+                                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32].pipeline;
+                                } break;
+                            case GGML_TYPE_IQ1_S:
+                                {
+                                    nth0 = 4;
+                                    nth1 = 16;
+                                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32].pipeline;
+                                } break;
+                            case GGML_TYPE_IQ4_NL:
+                                {
+                                    nth0 = 4;
+                                    nth1 = 16;
+                                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32].pipeline;
+                                } break;
                             default:
                                 {
                                     GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t);
@@ -1759,7 +1865,7 @@ static bool ggml_metal_graph_compute(
                         if (src2t == GGML_TYPE_Q4_0 || src2t == GGML_TYPE_Q4_1 ||
                             src2t == GGML_TYPE_Q5_0 || src2t == GGML_TYPE_Q5_1 || src2t == GGML_TYPE_Q8_0 ||
-                            src2t == GGML_TYPE_Q2_K) { // || src2t == GGML_TYPE_Q4_K) {
+                            src2t == GGML_TYPE_Q2_K || src2t == GGML_TYPE_IQ1_S) { // || src2t == GGML_TYPE_Q4_K) {
                             [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                         }
                         else if (src2t == GGML_TYPE_IQ2_XXS || src2t == GGML_TYPE_IQ2_XS) {
@@ -1767,11 +1873,16 @@ static bool ggml_metal_graph_compute(
                             [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
                             [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                         }
-                        else if (src2t == GGML_TYPE_IQ3_XXS) {
-                            const int mem_size = 256*4+128;
+                        else if (src2t == GGML_TYPE_IQ3_XXS || src2t == GGML_TYPE_IQ3_S) {
+                            const int mem_size = src2t == GGML_TYPE_IQ3_XXS ? 256*4+128 : 512*4;
                             [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
                             [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                         }
+                        else if (src2t == GGML_TYPE_IQ4_NL) {
+                            const int mem_size = 32*sizeof(float);
+                            [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
+                            [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                        }
                         else if (src2t == GGML_TYPE_Q4_K) {
                             [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                         }
@@ -1813,6 +1924,9 @@ static bool ggml_metal_graph_compute(
                     case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS].pipeline; break;
                     case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS ].pipeline; break;
                     case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS].pipeline; break;
+                    case GGML_TYPE_IQ3_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S  ].pipeline; break;
+                    case GGML_TYPE_IQ1_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S  ].pipeline; break;
+                    case GGML_TYPE_IQ4_NL:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL ].pipeline; break;
                     case GGML_TYPE_I32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32    ].pipeline; break;
                     default: GGML_ASSERT(false && "not implemented");
                 }
@@ -2148,8 +2262,8 @@ static bool ggml_metal_graph_compute(
                         id<MTLComputePipelineState> pipeline = nil;
 
                         switch (order) {
-                            case GGML_SORT_ASC:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline;  break;
-                            case GGML_SORT_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break;
+                            case GGML_SORT_ORDER_ASC:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline;  break;
+                            case GGML_SORT_ORDER_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break;
                             default: GGML_ASSERT(false);
                         };
@@ -2272,6 +2386,7 @@ static bool ggml_metal_graph_compute(
             [[MTLCaptureManager sharedCaptureManager] stopCapture];
         }
     }
+    }
 
     return true;
 }

(file diff suppressed because it is too large)

@@ -1354,7 +1354,7 @@ static void ggml_cl_pool_free(cl_mem mem, size_t size) {
 }
 
 void ggml_cl_free_data(const struct ggml_tensor* tensor) {
-    if (tensor->backend != GGML_BACKEND_GPU) {
+    if (tensor->backend != GGML_BACKEND_TYPE_GPU) {
         return;
     }
@@ -1412,7 +1412,7 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
 }
 
 static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
@@ -1476,7 +1476,7 @@ void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src
 }
 
 static void ggml_cl_add_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
@@ -1566,13 +1566,13 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     size_t y_size;
     size_t d_size;
     cl_mem d_X;
-    if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
+    if (src0->backend == GGML_BACKEND_TYPE_GPU) { // NOLINT
         d_X = (cl_mem) src0->extra;
     } else {
         d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
     }
-    cl_mem d_Y = src1->backend == GGML_BACKEND_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
-    cl_mem d_D = dst->backend == GGML_BACKEND_GPU ? (cl_mem) dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
+    cl_mem d_Y = src1->backend == GGML_BACKEND_TYPE_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
+    cl_mem d_D = dst->backend == GGML_BACKEND_TYPE_GPU ? (cl_mem) dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
 
     size_t x_offset = 0;
@@ -1580,7 +1580,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
         // TODO: copy src0 here when r3>1
         for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
             for (int64_t i02 = 0; i02 < ne02; i02++) {
-                if (src0->backend == GGML_BACKEND_GPU) {
+                if (src0->backend == GGML_BACKEND_TYPE_GPU) {
                     x_offset = (i03 * ne02 + i02) * x_ne;
                 } else {
                     // copy src0 to device
@@ -1589,7 +1589,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
                 for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
                     // copy src1 to device
-                    if (src1->backend == GGML_BACKEND_CPU) {
+                    if (src1->backend == GGML_BACKEND_TYPE_CPU) {
                         CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
                     }
@@ -1612,7 +1612,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
                     }
 
                     // copy dst to host
-                    if (dst->backend == GGML_BACKEND_CPU) {
+                    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
                         float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
                         CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
                     }
@@ -1621,13 +1621,13 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
         }
     }
 
-    if (src0->backend != GGML_BACKEND_GPU) {
+    if (src0->backend != GGML_BACKEND_TYPE_GPU) {
         ggml_cl_pool_free(d_X, x_size);
     }
-    if (src1->backend != GGML_BACKEND_GPU) {
+    if (src1->backend != GGML_BACKEND_TYPE_GPU) {
         ggml_cl_pool_free(d_Y, y_size);
     }
-    if (dst->backend != GGML_BACKEND_GPU) {
+    if (dst->backend != GGML_BACKEND_TYPE_GPU) {
         ggml_cl_pool_free(d_D, d_size);
     }
 }
@@ -1670,7 +1670,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     size_t y_size;
     size_t d_size;
     cl_mem d_X;
-    if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
+    if (src0->backend == GGML_BACKEND_TYPE_GPU) { // NOLINT
         d_X = (cl_mem) src0->extra;
     } else {
         d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
@@ -1687,7 +1687,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
         // TODO: copy src0 here when r3>1
         for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
             for (int64_t i02 = 0; i02 < ne02; i02++) {
-                if (src0->backend == GGML_BACKEND_GPU) {
+                if (src0->backend == GGML_BACKEND_TYPE_GPU) {
                     x_offset = (i03 * ne02 + i02) * x_ne;
                 } else {
                     // copy src0 to device
@@ -1741,7 +1741,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
                     }
 
                     // copy dst to host, then convert to float
-                    if (dst->backend == GGML_BACKEND_CPU) {
+                    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
                         CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
                         float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
                         ggml_fp16_to_fp32_row(tmp, d, d_ne);
@@ -1753,7 +1753,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
         }
     }
 
-    if (src0->backend != GGML_BACKEND_GPU) {
+    if (src0->backend != GGML_BACKEND_TYPE_GPU) {
         ggml_cl_pool_free(d_X, x_size);
     }
     ggml_cl_pool_free(d_Y, y_size);
@@ -1798,7 +1798,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
     cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
     cl_mem d_Q;
-    if (src0->backend == GGML_BACKEND_CPU) {
+    if (src0->backend == GGML_BACKEND_TYPE_CPU) {
        d_Q = ggml_cl_pool_malloc(q_sz, &q_size);
     }
@ -1817,10 +1817,10 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) { for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
for (int64_t i02 = 0; i02 < ne02; i02++) { for (int64_t i02 = 0; i02 < ne02; i02++) {
// copy src0 to device if necessary // copy src0 to device if necessary
if (src0->backend == GGML_BACKEND_CPU) { if (src0->backend == GGML_BACKEND_TYPE_CPU) {
events.emplace_back(); events.emplace_back();
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++)); CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
} else if (src0->backend == GGML_BACKEND_GPU) { } else if (src0->backend == GGML_BACKEND_TYPE_GPU) {
d_Q = (cl_mem) src0->extra; d_Q = (cl_mem) src0->extra;
} else { } else {
GGML_ASSERT(false); GGML_ASSERT(false);
@ -1829,7 +1829,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
if (!mul_mat_vec) { if (!mul_mat_vec) {
// convert src0 to fp32 on device // convert src0 to fp32 on device
const size_t global = x_ne / global_denom; const size_t global = x_ne / global_denom;
const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0; const size_t offset = src0->backend == GGML_BACKEND_TYPE_GPU ? (i03 * ne02 + i02) * x_bps : 0;
CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q)); CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X)); CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL)); CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
@ -1843,7 +1843,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
// compute // compute
const size_t global = ne01 * local; const size_t global = ne01 * local;
const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0; const size_t offset = src0->backend == GGML_BACKEND_TYPE_GPU ? (i03 * ne02 + i02) * x_bps : 0;
const cl_int ncols = ne00; const cl_int ncols = ne00;
events.emplace_back(); events.emplace_back();
CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q)); CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
@ -1895,7 +1895,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
} }
ggml_cl_pool_free(d_Y, y_size); ggml_cl_pool_free(d_Y, y_size);
ggml_cl_pool_free(d_D, d_size); ggml_cl_pool_free(d_D, d_size);
if (src0->backend == GGML_BACKEND_CPU) { if (src0->backend == GGML_BACKEND_TYPE_CPU) {
ggml_cl_pool_free(d_Q, q_size); ggml_cl_pool_free(d_Q, q_size);
} }
} }
@ -1911,7 +1911,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
src1->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 &&
dst->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)) { ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_TYPE_GPU)) {
return true; return true;
} }
@ -1993,7 +1993,7 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
CL_CHECK(clFinish(queue)); CL_CHECK(clFinish(queue));
tensor->extra = dst; tensor->extra = dst;
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
} }
// ggml-backend // ggml-backend
@ -2045,7 +2045,7 @@ static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer,
ctx->sub_buffers.push_back(sub_buffer); ctx->sub_buffers.push_back(sub_buffer);
tensor->extra = sub_buffer; tensor->extra = sub_buffer;
} }
tensor->backend = GGML_BACKEND_GPU; tensor->backend = GGML_BACKEND_TYPE_GPU;
} }
static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {

File diff suppressed because it is too large

View file

@ -191,6 +191,36 @@ typedef struct {
} block_iq3_xxs; } block_iq3_xxs;
static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding"); static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
// 3.4375 bpw
#if QK_K == 64
#define IQ3S_N_SCALE 2
#else
#define IQ3S_N_SCALE QK_K/64
#endif
typedef struct {
ggml_fp16_t d;
uint8_t qs[QK_K/4];
uint8_t qh[QK_K/32];
uint8_t signs[QK_K/8];
uint8_t scales[IQ3S_N_SCALE];
} block_iq3_s;
static_assert(sizeof(block_iq3_s) == sizeof(ggml_fp16_t) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding");
typedef struct {
ggml_fp16_t d;
uint8_t qs[QK_K/8];
uint8_t scales[QK_K/16];
} block_iq1_s;
static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
// Non-linear quants
#define QK4_NL 32
typedef struct {
ggml_fp16_t d;
uint8_t qs[QK4_NL/2];
} block_iq4_nl;
static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding");
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
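
A quick arithmetic check of the new block layouts above, assuming the default `QK_K == 256` build: `block_iq3_s` packs 256 weights into 2 + 64 + 8 + 32 + 4 = 110 bytes, which is exactly the advertised 3.4375 bpw; `block_iq1_s` works out to 1.5625 bpw and `block_iq4_nl` to 4.5 bpw. A standalone sketch that reproduces the numbers (not part of the commit):

```cpp
#include <cstdio>

// Bits-per-weight check for the new block layouts, assuming QK_K == 256.
#define QK_K    256
#define QK4_NL  32

int main() {
    const int iq3_s  = 2 + QK_K/4 + QK_K/32 + QK_K/8 + QK_K/64; // d, qs, qh, signs, scales
    const int iq1_s  = 2 + QK_K/8 + QK_K/16;                    // d, qs, scales
    const int iq4_nl = 2 + QK4_NL/2;                            // d, qs
    std::printf("iq3_s : %6.4f bpw\n", 8.0 * iq3_s  / QK_K);    // 3.4375
    std::printf("iq1_s : %6.4f bpw\n", 8.0 * iq1_s  / QK_K);    // 1.5625
    std::printf("iq4_nl: %6.4f bpw\n", 8.0 * iq4_nl / QK4_NL);  // 4.5000
    return 0;
}
```
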
@ -210,6 +240,8 @@ void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGM
void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k); void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k);
void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k); void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k);
void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k); void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int k);
void quantize_row_iq3_s_reference (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int k);
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
@ -225,6 +257,8 @@ void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
// Dequantization // Dequantization
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
@ -243,22 +277,28 @@ void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRI
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
// Dot product // Dot product
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
// //
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization") // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
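
Every `ggml_vec_dot_*` declaration above gains four parameters — `bs` (output stride), `bx`/`by` (input strides) and `nrc` (number of result values) — so one call can now produce several dot products. My reading of the single-result case is sketched below; the argument values are an assumption drawn from the new signature, not stated in this diff. With `nrc == 1` the strides are not consumed, so zero is a safe filler:

```cpp
// Hedged fragment: a caller updated for the widened signature, single-row case.
float sum = 0.0f;
ggml_vec_dot_q4_0_q8_0(/*n=*/n, /*s=*/&sum, /*bs=*/0,
                       /*vx=*/x_blocks, /*bx=*/0,
                       /*vy=*/y_blocks, /*by=*/0, /*nrc=*/1);
```
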
@ -266,6 +306,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML
size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
size_t quantize_iq1_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
size_t quantize_iq4_nl (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
size_t quantize_iq3_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
@ -276,8 +319,8 @@ size_t quantize_q4_1 (const float * src, void * dst, int nrows, int n_per_row,
size_t quantize_q5_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_q5_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
size_t quantize_q5_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_q5_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
void iq2xs_init_impl(int grid_size); void iq2xs_init_impl(enum ggml_type type);
void iq2xs_free_impl(int grid_size); void iq2xs_free_impl(enum ggml_type type);
void iq3xs_init_impl(int grid_size); void iq3xs_init_impl(int grid_size);
void iq3xs_free_impl(int grid_size); void iq3xs_free_impl(int grid_size);
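
`iq2xs_init_impl`/`iq2xs_free_impl` are now keyed by the quantization type instead of a raw lattice grid size, which keeps that internal detail out of the call sites. A sketch of the updated calling convention (the surrounding calls are illustrative, not taken from this diff):

```cpp
// Lookup tables are now requested per ggml_type rather than per grid size.
iq2xs_init_impl(GGML_TYPE_IQ2_XS);   // previously the argument was an int grid size
// ... quantize / dequantize iq2_xs tensors ...
iq2xs_free_impl(GGML_TYPE_IQ2_XS);
```
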

View file

@ -3338,7 +3338,7 @@ void print_ggml_tensor(const char*name, struct ggml_tensor *src){
size_t total_elements = ggml_nelements(src); size_t total_elements = ggml_nelements(src);
const bool src_on_device = src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT; const bool src_on_device = src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
float *src_data =NULL; float *src_data =NULL;
if(src_on_device) { if(src_on_device) {
ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra; ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra;
@ -8086,11 +8086,11 @@ static void k_argsort_f32_i32(const float * x, int * dst, const int ncols,
int ixj = col ^ j; int ixj = col ^ j;
if (ixj > col) { if (ixj > col) {
if ((col & k) == 0) { if ((col & k) == 0) {
if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) { if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
swap(dst_row[col], dst_row[ixj]); swap(dst_row[col], dst_row[ixj]);
} }
} else { } else {
if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) { if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
swap(dst_row[col], dst_row[ixj]); swap(dst_row[col], dst_row[ixj]);
} }
} }
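
The comparator above is the core of a bitonic argsort: the template parameter fixes the sort direction at compile time, and the renamed `GGML_SORT_ORDER_*` enumerators select which comparison flips. A CPU analogue of the same idea, as a self-contained sketch (the names are mine, not the kernel's):

```cpp
#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

enum sort_order { SORT_ORDER_ASC, SORT_ORDER_DESC };

// CPU analogue of k_argsort_f32_i32: sort indices by value, with the
// direction resolved at compile time like the kernel's template parameter.
template <sort_order order>
static std::vector<int> argsort(const std::vector<float> & x) {
    std::vector<int> idx(x.size());
    std::iota(idx.begin(), idx.end(), 0);
    std::sort(idx.begin(), idx.end(), [&](int a, int b) {
        return order == SORT_ORDER_ASC ? x[a] < x[b] : x[a] > x[b];
    });
    return idx;
}

int main() {
    for (int i : argsort<SORT_ORDER_ASC>({3.f, 1.f, 2.f})) std::printf("%d ", i); // prints: 1 2 0
    std::printf("\n");
    return 0;
}
```
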
@ -9188,174 +9188,22 @@ static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y,
} }
} }
static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy, template <int qk, int qi, typename block_q_t, int vdr,
float *dst, const int ncols, vec_dot_q_sycl_t vec_dot_q_sycl>
const int nrows, static void mul_mat_vec_q_sycl_submitter(const void *vx, const void *vy,
dpct::queue_ptr stream) { float *dst, const int ncols,
GGML_ASSERT(ncols % QK4_0 == 0); const int nrows,
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; dpct::queue_ptr stream) {
const sycl::range<3> block_nums(1, 1, block_num_y); GGML_ASSERT(ncols % QK4_0 == 0);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
stream->parallel_for( const sycl::range<3> block_nums(1, 1, block_num_y);
sycl::nd_range<3>(block_nums * block_dims, block_dims), const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { stream->parallel_for(
mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=
vec_dot_q4_0_q8_1>(vx, vy, dst, ncols, nrows, ](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
item_ct1); mul_mat_vec_q<qk, qi, block_q_t, vdr, vec_dot_q_sycl>(
}); vx, vy, dst, ncols, nrows, item_ct1);
} });
static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
float *dst, const int ncols,
const int nrows,
dpct::queue_ptr stream) {
GGML_ASSERT(ncols % QK4_1 == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ,
vec_dot_q4_1_q8_1>(vx, vy, dst, ncols, nrows,
item_ct1);
});
}
static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
float *dst, const int ncols,
const int nrows,
dpct::queue_ptr stream) {
GGML_ASSERT(ncols % QK5_0 == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ,
vec_dot_q5_0_q8_1>(vx, vy, dst, ncols, nrows,
item_ct1);
});
}
static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
float *dst, const int ncols,
const int nrows,
dpct::queue_ptr stream) {
GGML_ASSERT(ncols % QK5_1 == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ,
vec_dot_q5_1_q8_1>(vx, vy, dst, ncols, nrows,
item_ct1);
});
}
static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
float *dst, const int ncols,
const int nrows,
dpct::queue_ptr stream) {
GGML_ASSERT(ncols % QK8_0 == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ,
vec_dot_q8_0_q8_1>(vx, vy, dst, ncols, nrows,
item_ct1);
});
}
static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
float *dst, const int ncols,
const int nrows,
dpct::queue_ptr stream) {
GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ,
vec_dot_q2_K_q8_1>(vx, vy, dst, ncols, nrows,
item_ct1);
});
}
static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
float *dst, const int ncols,
const int nrows,
dpct::queue_ptr stream) {
GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ,
vec_dot_q3_K_q8_1>(vx, vy, dst, ncols, nrows,
item_ct1);
});
}
static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
float *dst, const int ncols,
const int nrows,
dpct::queue_ptr stream) {
GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ,
vec_dot_q4_K_q8_1>(vx, vy, dst, ncols, nrows,
item_ct1);
});
}
static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
float *dst, const int ncols,
const int nrows,
dpct::queue_ptr stream) {
GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ,
vec_dot_q5_K_q8_1>(vx, vy, dst, ncols, nrows,
item_ct1);
});
}
static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
float *dst, const int ncols,
const int nrows,
dpct::queue_ptr stream) {
GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ,
vec_dot_q6_K_q8_1>(vx, vy, dst, ncols, nrows,
item_ct1);
});
} }
int get_device_index_by_id(int id){ int get_device_index_by_id(int id){
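
The nine near-identical `mul_mat_vec_*_q8_1_sycl` launchers deleted above differed only in their compile-time constants; the new `mul_mat_vec_q_sycl_submitter` keeps the launch boilerplate in one template. The same pattern in a dependency-free sketch (hypothetical types and kernels, no SYCL required):

```cpp
#include <cstdio>

struct block_q_a { float d; };
struct block_q_b { float d; };

static float dot_a(const block_q_a & b, float y) { return b.d * y; }
static float dot_b(const block_q_b & b, float y) { return b.d + y; }

// One templated submitter replaces a family of copy-pasted wrappers that
// differed only in block size, block type and dot kernel.
template <int qk, typename block_t, float (*vec_dot)(const block_t &, float)>
static void submit(const block_t & blk, float y) {
    static_assert(qk % 8 == 0, "block size must be a multiple of 8");
    std::printf("result: %g\n", vec_dot(blk, y));
}

int main() {
    submit<32, block_q_a, dot_a>(block_q_a{2.0f}, 3.0f); // result: 6
    submit<32, block_q_b, dot_b>(block_q_b{2.0f}, 3.0f); // result: 5
    return 0;
}
```
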
@ -10977,7 +10825,7 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
const sycl::range<3> block_dims(1, 1, ncols); const sycl::range<3> block_dims(1, 1, ncols);
const sycl::range<3> block_nums(1, nrows, 1); const sycl::range<3> block_nums(1, nrows, 1);
if (order == GGML_SORT_ASC) { if (order == GGML_SORT_ORDER_ASC) {
/* /*
DPCT1049:44: The work-group size passed to the SYCL kernel may exceed DPCT1049:44: The work-group size passed to the SYCL kernel may exceed
the limit. To get the device limit, query the limit. To get the device limit, query
@ -10986,9 +10834,9 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
stream->parallel_for( stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) { [=](sycl::nd_item<3> item_ct1) {
k_argsort_f32_i32<GGML_SORT_ASC>(x, dst, ncols, item_ct1); k_argsort_f32_i32<GGML_SORT_ORDER_ASC>(x, dst, ncols, item_ct1);
}); });
} else if (order == GGML_SORT_DESC) { } else if (order == GGML_SORT_ORDER_DESC) {
/* /*
DPCT1049:45: The work-group size passed to the SYCL kernel may exceed DPCT1049:45: The work-group size passed to the SYCL kernel may exceed
the limit. To get the device limit, query the limit. To get the device limit, query
@ -10997,7 +10845,7 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
stream->parallel_for( stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) { [=](sycl::nd_item<3> item_ct1) {
k_argsort_f32_i32<GGML_SORT_DESC>(x, dst, ncols, item_ct1); k_argsort_f32_i32<GGML_SORT_ORDER_DESC>(x, dst, ncols, item_ct1);
}); });
} else { } else {
GGML_ASSERT(false); GGML_ASSERT(false);
@ -11559,12 +11407,12 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst,
dpct::memcpy_direction kind; dpct::memcpy_direction kind;
char * src_ptr; char * src_ptr;
if (src->backend == GGML_BACKEND_CPU) { if (src->backend == GGML_BACKEND_TYPE_CPU) {
kind = dpct::host_to_device; kind = dpct::host_to_device;
src_ptr = (char *) src->data; src_ptr = (char *) src->data;
// GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d GGML_BACKEND_CPU src_ptr %p\n", src_ptr); // GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d GGML_BACKEND_TYPE_CPU src_ptr %p\n", src_ptr);
} else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) { } else if (src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1])); GGML_ASSERT(src->backend != GGML_BACKEND_TYPE_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
kind = dpct::device_to_device; kind = dpct::device_to_device;
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra; ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
int id; int id;
@ -11578,11 +11426,8 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst,
} }
char * dst_ptr = (char *) dst; char * dst_ptr = (char *) dst;
const int64_t ne0 = src->ne[0]; GGML_TENSOR_LOCALS_1(int64_t, ne, src, ne);
const int64_t nb0 = src->nb[0]; GGML_TENSOR_LOCALS(int64_t, nb, src, nb);
const int64_t nb1 = src->nb[1];
const int64_t nb2 = src->nb[2];
const int64_t nb3 = src->nb[3];
const enum ggml_type type = src->type; const enum ggml_type type = src->type;
const int64_t ts = ggml_type_size(type); const int64_t ts = ggml_type_size(type);
const int64_t bs = ggml_blck_size(type); const int64_t bs = ggml_blck_size(type);
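
Several hunks in this file replace hand-written `ne00`/`nb01`-style locals with the `GGML_TENSOR_LOCALS` helpers. Paraphrased from ggml.h (approximate, not quoted from this diff), the macros expand to one `const` local per dimension, each marked used so unused-variable warnings stay quiet:

```cpp
// Approximate expansion of the helper macros, paraphrased from ggml.h:
#define GGML_UNUSED(x) (void)(x)

#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
    const type prefix##0 = (pointer)->array[0]; GGML_UNUSED(prefix##0);

#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
    const type prefix##0 = (pointer)->array[0]; GGML_UNUSED(prefix##0); \
    const type prefix##1 = (pointer)->array[1]; GGML_UNUSED(prefix##1); \
    const type prefix##2 = (pointer)->array[2]; GGML_UNUSED(prefix##2); \
    const type prefix##3 = (pointer)->array[3]; GGML_UNUSED(prefix##3);

// So GGML_TENSOR_LOCALS(int64_t, nb, src, nb) declares nb0..nb3 from src->nb,
// replacing four hand-written locals per tensor.
```
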
@ -12001,7 +11846,7 @@ inline void ggml_sycl_op_mul_mat_q(
// the main device has a larger memory buffer to hold the results from all GPUs // the main device has a larger memory buffer to hold the results from all GPUs
// nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && device_id == g_main_device ? ne0 : row_diff; const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && device_id == g_main_device ? ne0 : row_diff;
switch (src0->type) { switch (src0->type) {
case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_0:
@ -12098,37 +11943,63 @@ inline void ggml_sycl_op_mul_mat_vec_q(
const int64_t ne00 = src0->ne[0]; const int64_t ne00 = src0->ne[0];
const int64_t row_diff = row_high - row_low; const int64_t row_diff = row_high - row_low;
// TODO: support these quantization types
GGML_ASSERT(!(src0->type == GGML_TYPE_IQ2_XXS ||
src0->type == GGML_TYPE_IQ2_XS ||
src0->type == GGML_TYPE_IQ3_XXS ||
src0->type == GGML_TYPE_IQ1_S));
switch (src0->type) { switch (src0->type) {
case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_0:
mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); mul_mat_vec_q_sycl_submitter<QK4_0, QI4_0, block_q4_0,
break; VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
break;
case GGML_TYPE_Q4_1: case GGML_TYPE_Q4_1:
mul_mat_vec_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); mul_mat_vec_q_sycl_submitter<QK4_1, QI4_1, block_q4_1,
break; VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
break;
case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_0:
mul_mat_vec_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); mul_mat_vec_q_sycl_submitter<QK5_0, QI5_0, block_q5_0,
break; VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
break;
case GGML_TYPE_Q5_1: case GGML_TYPE_Q5_1:
mul_mat_vec_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); mul_mat_vec_q_sycl_submitter<QK5_1, QI5_1, block_q5_1,
break; VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
break;
case GGML_TYPE_Q8_0: case GGML_TYPE_Q8_0:
mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); mul_mat_vec_q_sycl_submitter<QK8_0, QI8_0, block_q8_0,
break; VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
break;
case GGML_TYPE_Q2_K: case GGML_TYPE_Q2_K:
mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); mul_mat_vec_q_sycl_submitter<QK_K, QI2_K, block_q2_K,
break; VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
break;
case GGML_TYPE_Q3_K: case GGML_TYPE_Q3_K:
mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); mul_mat_vec_q_sycl_submitter<QK_K, QI3_K, block_q3_K,
break; VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
break;
case GGML_TYPE_Q4_K: case GGML_TYPE_Q4_K:
mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); mul_mat_vec_q_sycl_submitter<QK_K, QI4_K, block_q4_K,
break; VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
break;
case GGML_TYPE_Q5_K: case GGML_TYPE_Q5_K:
mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); mul_mat_vec_q_sycl_submitter<QK_K, QI5_K, block_q5_K,
break; VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
break;
case GGML_TYPE_Q6_K: case GGML_TYPE_Q6_K:
mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); mul_mat_vec_q_sycl_submitter<QK_K, QI6_K, block_q6_K,
break; VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
break;
default: default:
GGML_ASSERT(false); GGML_ASSERT(false);
break; break;
@ -12148,7 +12019,7 @@ inline void ggml_sycl_op_dequantize_mul_mat_vec(
const int64_t src1_ncols, const int64_t src1_padded_row_size, const int64_t src1_ncols, const int64_t src1_padded_row_size,
const dpct::queue_ptr &stream) { const dpct::queue_ptr &stream) {
GGML_TENSOR_BINARY_OP_LOCALS GGML_TENSOR_BINARY_OP_LOCALS;
const int64_t row_diff = row_high - row_low; const int64_t row_diff = row_high - row_low;
@ -12248,7 +12119,7 @@ inline void ggml_sycl_op_mul_mat_sycl(
// the main device has a larger memory buffer to hold the results from all GPUs // the main device has a larger memory buffer to hold the results from all GPUs
// ldc == nrows of the matrix that cuBLAS writes into // ldc == nrows of the matrix that cuBLAS writes into
int ldc = dst->backend == GGML_BACKEND_GPU && device_id == g_main_device ? ne0 : row_diff; int ldc = dst->backend == GGML_BACKEND_TYPE_GPU && device_id == g_main_device ? ne0 : row_diff;
#ifdef GGML_SYCL_F16 #ifdef GGML_SYCL_F16
bool use_fp16 = true; // TODO(Yu) SYCL capability check bool use_fp16 = true; // TODO(Yu) SYCL capability check
@ -12426,9 +12297,7 @@ inline void ggml_sycl_op_alibi(const ggml_tensor *src0, const ggml_tensor *src1,
GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32);
const int64_t ne00 = src0->ne[0]; GGML_TENSOR_LOCALS_3(int64_t, ne0, src0, ne);
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t nrows = ggml_nrows(src0); const int64_t nrows = ggml_nrows(src0);
//const int n_past = ((int32_t *) dst->op_params)[0]; //const int n_past = ((int32_t *) dst->op_params)[0];
@ -12632,16 +12501,16 @@ static void ggml_sycl_op_flatten(const ggml_tensor *src0,
const bool use_src1 = src1 != nullptr; const bool use_src1 = src1 != nullptr;
const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1; const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT( dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr; ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT; const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU; const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU;
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU; const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU;
// dd = data device // dd = data device
float * src0_ddf = nullptr; float * src0_ddf = nullptr;
@ -12696,7 +12565,7 @@ static void ggml_sycl_op_flatten(const ggml_tensor *src0,
main_stream->memcpy(dst->data, dst_ddf, ggml_nbytes(dst)))); main_stream->memcpy(dst->data, dst_ddf, ggml_nbytes(dst))));
} }
if (dst->backend == GGML_BACKEND_CPU) { if (dst->backend == GGML_BACKEND_TYPE_CPU) {
SYCL_CHECK(CHECK_TRY_ERROR( SYCL_CHECK(CHECK_TRY_ERROR(
dpct::get_current_device().queues_wait_and_throw())); dpct::get_current_device().queues_wait_and_throw()));
} }
@ -12758,15 +12627,9 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
ggml_sycl_op_mul_mat_t op, ggml_sycl_op_mul_mat_t op,
const bool convert_src1_to_q8_1) try { const bool convert_src1_to_q8_1) try {
const int64_t ne00 = src0->ne[0]; GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[3];
const int64_t ne10 = src1->ne[0]; GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
const int64_t ne11 = src1->ne[1];
const int64_t ne12 = src1->ne[2];
const int64_t ne13 = src1->ne[3];
const int64_t nrows1 = ggml_nrows(src1); const int64_t nrows1 = ggml_nrows(src1);
GGML_ASSERT(ne03 == ne13); GGML_ASSERT(ne03 == ne13);
@ -12777,8 +12640,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
const int nb2 = dst->nb[2]; const int nb2 = dst->nb[2];
const int nb3 = dst->nb[3]; const int nb3 = dst->nb[3];
GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT(dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT(src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0); GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
@ -12793,13 +12656,13 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT; const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
const bool src0_is_contiguous = ggml_is_contiguous(src0); const bool src0_is_contiguous = ggml_is_contiguous(src0);
const bool src1_is_contiguous = ggml_is_contiguous(src1); const bool src1_is_contiguous = ggml_is_contiguous(src1);
int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING); int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT; const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
GGML_ASSERT(!(split && ne02 > 1)); GGML_ASSERT(!(split && ne02 > 1));
GGML_ASSERT(!(split && ne03 > 1)); GGML_ASSERT(!(split && ne03 > 1));
GGML_ASSERT(!(split && ne02 < ne12)); GGML_ASSERT(!(split && ne02 < ne12));
@ -12854,8 +12717,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
used_devices++; used_devices++;
const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device_index; const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device_index; const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
ggml_sycl_set_device(get_device_id_by_index(id)); ggml_sycl_set_device(get_device_id_by_index(id));
const dpct::queue_ptr stream = g_syclStreams[id][0]; const dpct::queue_ptr stream = g_syclStreams[id][0];
@ -12919,8 +12782,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
continue; continue;
} }
const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device_index; const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device_index; const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index;
const int64_t row_diff = row_high[id] - row_low[id]; const int64_t row_diff = row_high[id] - row_low[id];
ggml_sycl_set_device(get_device_id_by_index(id)); ggml_sycl_set_device(get_device_id_by_index(id));
@ -12946,12 +12809,12 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
// the main device memory buffer can be on VRAM scratch, with space for all partial results // the main device memory buffer can be on VRAM scratch, with space for all partial results
// in that case an offset on dst_ddf_i is needed // in that case an offset on dst_ddf_i is needed
if (dst->backend == GGML_BACKEND_GPU && id == g_main_device_index) { if (dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index) {
dst_dd_i += row_low[id]; // offset is 0 if no tensor split dst_dd_i += row_low[id]; // offset is 0 if no tensor split
} }
// copy src0, src1 to device if necessary // copy src0, src1 to device if necessary
if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) { if (src1->backend == GGML_BACKEND_TYPE_GPU && src1_is_contiguous) {
if (id != g_main_device_index) { if (id != g_main_device_index) {
if (convert_src1_to_q8_1) { if (convert_src1_to_q8_1) {
char * src1_ddq_i_source = src1_ddq[g_main_device_index] + src1_ddq_i_offset; char * src1_ddq_i_source = src1_ddq[g_main_device_index] + src1_ddq_i_offset;
@ -12967,14 +12830,14 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
src1_ncols * ne10 * sizeof(float)))); src1_ncols * ne10 * sizeof(float))));
} }
} }
} else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) { } else if (src1->backend == GGML_BACKEND_TYPE_CPU || (src1_on_device && !src1_is_contiguous)) {
SYCL_CHECK(ggml_sycl_cpy_tensor_2d( SYCL_CHECK(ggml_sycl_cpy_tensor_2d(
src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream)); src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
} else { } else {
GGML_ASSERT(false); GGML_ASSERT(false);
} }
if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) { if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_TYPE_CPU || !src1_is_contiguous)) {
quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream); quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
/* /*
DPCT1010:92: SYCL uses exceptions to report errors and does DPCT1010:92: SYCL uses exceptions to report errors and does
@ -13004,10 +12867,10 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
if (!dst_on_device) { if (!dst_on_device) {
void * dst_off_device; void * dst_off_device;
dpct::memcpy_direction kind; dpct::memcpy_direction kind;
if (dst->backend == GGML_BACKEND_CPU) { if (dst->backend == GGML_BACKEND_TYPE_CPU) {
dst_off_device = dst->data; dst_off_device = dst->data;
kind = dpct::device_to_host; kind = dpct::device_to_host;
} else if (dst->backend == GGML_BACKEND_GPU) { } else if (dst->backend == GGML_BACKEND_TYPE_GPU) {
dst_off_device = dst_extra->data_device[g_main_device_index]; dst_off_device = dst_extra->data_device[g_main_device_index];
kind = dpct::device_to_device; kind = dpct::device_to_device;
} else { } else {
@ -13091,7 +12954,7 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
} }
} }
if (dst->backend == GGML_BACKEND_CPU) { if (dst->backend == GGML_BACKEND_TYPE_CPU) {
SYCL_CHECK(ggml_sycl_set_device(g_main_device)); SYCL_CHECK(ggml_sycl_set_device(g_main_device));
SYCL_CHECK(CHECK_TRY_ERROR( SYCL_CHECK(CHECK_TRY_ERROR(
dpct::get_current_device().queues_wait_and_throw())); dpct::get_current_device().queues_wait_and_throw()));
@ -13228,7 +13091,7 @@ static void ggml_sycl_mul_mat_vec_p021(const ggml_tensor *src0,
const ggml_tensor *src1, const ggml_tensor *src1,
ggml_tensor *dst) try { ggml_tensor *dst) try {
GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1)); GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src0->type == GGML_TYPE_F16);
@ -13266,7 +13129,7 @@ static void ggml_sycl_mul_mat_vec_nc(const ggml_tensor *src0,
GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src0));
GGML_ASSERT(!ggml_is_transposed(src1)); GGML_ASSERT(!ggml_is_transposed(src1));
GGML_ASSERT(!ggml_is_permuted(src0)); GGML_ASSERT(!ggml_is_permuted(src0));
GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT(src1->type == GGML_TYPE_F32);
@ -13333,27 +13196,17 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src0));
GGML_ASSERT(!ggml_is_transposed(src1)); GGML_ASSERT(!ggml_is_transposed(src1));
GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT(src1->type == GGML_TYPE_F32);
const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00); GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[3];
const int64_t nb01 = src0->nb[1]; GGML_TENSOR_LOCALS(int64_t, nb0, src0, nb);
const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02);
const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03);
const int64_t ne10 = src1->ne[0]; GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
const int64_t ne11 = src1->ne[1];
const int64_t ne12 = src1->ne[2];
const int64_t ne13 = src1->ne[3];
const int64_t nb11 = src1->nb[1]; GGML_TENSOR_LOCALS(int64_t, nb1, src1, nb);
const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
const int64_t ne1 = ggml_nelements(src1); const int64_t ne1 = ggml_nelements(src1);
const int64_t ne = ggml_nelements(dst); const int64_t ne = ggml_nelements(dst);
@ -13519,11 +13372,11 @@ catch (sycl::exception const &exc) {
static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const bool all_on_device = const bool all_on_device =
(src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) && (src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT) &&
(src1->backend == GGML_BACKEND_GPU) && (src1->backend == GGML_BACKEND_TYPE_GPU) &&
( dst->backend == GGML_BACKEND_GPU); ( dst->backend == GGML_BACKEND_TYPE_GPU);
const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT; const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
int64_t min_compute_capability = INT_MAX; int64_t min_compute_capability = INT_MAX;
for (int64_t id = 0; id < g_device_count; ++id) { for (int64_t id = 0; id < g_device_count; ++id) {
@ -13652,26 +13505,18 @@ static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) {
GGML_ASSERT(!ggml_is_transposed(src00)); GGML_ASSERT(!ggml_is_transposed(src00));
GGML_ASSERT(!ggml_is_transposed(src1)); GGML_ASSERT(!ggml_is_transposed(src1));
GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT(src00->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT(src1->type == GGML_TYPE_F32);
const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00); GGML_TENSOR_LOCALS(int64_t, ne0, src00, ne);
const int64_t ne01 = src00->ne[1];
const int64_t ne02 = src00->ne[2];
const int64_t ne03 = src00->ne[3];
//const int64_t nb01 = src00->nb[1]; //const int64_t nb01 = src00->nb[1];
const int64_t nb02 = src00->nb[2]; GGML_UNUSED(nb02); GGML_TENSOR_LOCALS(int64_t, nb0, src00, nb);
const int64_t nb03 = src00->nb[3]; GGML_UNUSED(nb03);
const int64_t ne10 = src1->ne[0]; GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
const int64_t ne11 = src1->ne[1];
const int64_t ne12 = src1->ne[2];
const int64_t ne13 = src1->ne[3];
GGML_TENSOR_LOCALS(int64_t, nb1, src1, nb);
//const int64_t nb11 = src1->nb[1]; //const int64_t nb11 = src1->nb[1];
const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
const int64_t ne1 = ggml_nelements(src1); const int64_t ne1 = ggml_nelements(src1);
const int64_t ne = ggml_nelements(dst); const int64_t ne = ggml_nelements(dst);
@ -13798,7 +13643,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0]; const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0];
if (ids->backend == GGML_BACKEND_GPU) { if (ids->backend == GGML_BACKEND_TYPE_GPU) {
const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device_index]; const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device_index];
SYCL_CHECK(CHECK_TRY_ERROR( SYCL_CHECK(CHECK_TRY_ERROR(
stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids)))); stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids))));
@ -13816,20 +13661,20 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
ggml_tensor src1_row = *src1; ggml_tensor src1_row = *src1;
ggml_tensor dst_row = *dst; ggml_tensor dst_row = *dst;
src1_row.backend = GGML_BACKEND_GPU; src1_row.backend = GGML_BACKEND_TYPE_GPU;
dst_row.backend = GGML_BACKEND_GPU; dst_row.backend = GGML_BACKEND_TYPE_GPU;
src1_row.extra = &src1_row_extra; src1_row.extra = &src1_row_extra;
dst_row.extra = &dst_row_extra; dst_row.extra = &dst_row_extra;
char * src1_original = src1->backend == GGML_BACKEND_CPU ? char * src1_original = src1->backend == GGML_BACKEND_TYPE_CPU ?
(char *) src1->data : (char *) src1_extra->data_device[g_main_device_index]; (char *) src1->data : (char *) src1_extra->data_device[g_main_device_index];
char * dst_original = dst->backend == GGML_BACKEND_CPU ? char * dst_original = dst->backend == GGML_BACKEND_TYPE_CPU ?
(char *) dst->data : (char *) dst_extra->data_device[g_main_device_index]; (char *) dst->data : (char *) dst_extra->data_device[g_main_device_index];
if (src1->ne[1] == 1) { if (src1->ne[1] == 1) {
GGML_ASSERT(src1->backend == GGML_BACKEND_GPU); GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
GGML_ASSERT(dst->backend == GGML_BACKEND_GPU); GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
//int32_t row_id; //int32_t row_id;
@ -13911,7 +13756,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
} }
} }
if (dst->backend == GGML_BACKEND_CPU) { if (dst->backend == GGML_BACKEND_TYPE_CPU) {
SYCL_CHECK(CHECK_TRY_ERROR(stream->wait())); SYCL_CHECK(CHECK_TRY_ERROR(stream->wait()));
} }
} }
@ -13934,31 +13779,13 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
const int64_t ne = ggml_nelements(src0); const int64_t ne = ggml_nelements(src0);
GGML_ASSERT(ne == ggml_nelements(src1)); GGML_ASSERT(ne == ggml_nelements(src1));
GGML_ASSERT(src0->backend == GGML_BACKEND_GPU); GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
GGML_ASSERT(src1->backend == GGML_BACKEND_GPU); GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX); GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX); GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
const int64_t ne00 = src0->ne[0]; GGML_TENSOR_BINARY_OP_LOCALS;
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t nb00 = src0->nb[0];
const int64_t nb01 = src0->nb[1];
const int64_t nb02 = src0->nb[2];
const int64_t nb03 = src0->nb[3];
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
const int64_t ne12 = src1->ne[2];
const int64_t nb10 = src1->nb[0];
const int64_t nb11 = src1->nb[1];
const int64_t nb12 = src1->nb[2];
const int64_t nb13 = src1->nb[3];
SYCL_CHECK(ggml_sycl_set_device(g_main_device)); SYCL_CHECK(ggml_sycl_set_device(g_main_device));
dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0]; dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0];
@ -14060,17 +13887,17 @@ void ggml_sycl_transform_tensor(void *data, struct ggml_tensor *tensor) try {
memset(extra, 0, sizeof(*extra)); memset(extra, 0, sizeof(*extra));
for (int64_t id = 0; id < g_device_count; ++id) { for (int64_t id = 0; id < g_device_count; ++id) {
if (backend == GGML_BACKEND_GPU && id != g_main_device_index) { if (backend == GGML_BACKEND_TYPE_GPU && id != g_main_device_index) {
continue; continue;
} }
ggml_sycl_set_device(get_device_id_by_index(id)); ggml_sycl_set_device(get_device_id_by_index(id));
const dpct::queue_ptr stream = g_syclStreams[id][0]; const dpct::queue_ptr stream = g_syclStreams[id][0];
int64_t row_low, row_high; int64_t row_low, row_high;
if (backend == GGML_BACKEND_GPU) { if (backend == GGML_BACKEND_TYPE_GPU) {
row_low = 0; row_low = 0;
row_high = nrows; row_high = nrows;
} else if (backend == GGML_BACKEND_GPU_SPLIT) { } else if (backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
const int64_t rounding = get_row_rounding(tensor->type); const int64_t rounding = get_row_rounding(tensor->type);
row_low = id == 0 ? 0 : nrows*g_tensor_split[id]; row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
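
For `GGML_BACKEND_TYPE_GPU_SPLIT` tensors, each device owns a contiguous row range derived from the cumulative `g_tensor_split` fractions, with the boundary aligned to the type's row rounding. A host-only sketch of that bookkeeping — the alignment details are my approximation of the surrounding code, not an exact copy:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t nrows    = 1000;
    const int64_t rounding = 32;                  // stand-in for get_row_rounding(type)
    const float   split[]  = {0.0f, 0.6f, 1.0f};  // cumulative fractions, 2 devices

    for (int id = 0; id < 2; ++id) {
        int64_t row_low = id == 0 ? 0 : (int64_t)(nrows * split[id]);
        row_low -= row_low % rounding;            // align the lower boundary
        int64_t row_high = id == 1 ? nrows : (int64_t)(nrows * split[id + 1]);
        if (id != 1) row_high -= row_high % rounding;
        std::printf("device %d owns rows [%lld, %lld)\n",
                    id, (long long)row_low, (long long)row_high);
    }
    return 0;   // device 0: [0, 576), device 1: [576, 1000)
}
```
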
@ -14119,7 +13946,7 @@ void ggml_sycl_transform_tensor(void *data, struct ggml_tensor *tensor) try {
extra->data_device[id] = buf; extra->data_device[id] = buf;
if (backend == GGML_BACKEND_GPU_SPLIT) { if (backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
for (int64_t is = 0; is < MAX_STREAMS; ++is) { for (int64_t is = 0; is < MAX_STREAMS; ++is) {
SYCL_CHECK(CHECK_TRY_ERROR(extra->events[id][is] = SYCL_CHECK(CHECK_TRY_ERROR(extra->events[id][is] =
new sycl::event())); new sycl::event()));
@ -14136,7 +13963,7 @@ catch (sycl::exception const &exc) {
} }
void ggml_sycl_free_data(struct ggml_tensor *tensor) try { void ggml_sycl_free_data(struct ggml_tensor *tensor) try {
if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) { if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_TYPE_GPU && tensor->backend != GGML_BACKEND_TYPE_GPU_SPLIT) ) {
return; return;
} }
@ -14189,15 +14016,15 @@ static void ggml_sycl_assign_buffers_impl(struct ggml_tensor *tensor,
return; return;
} }
tensor->backend = GGML_BACKEND_GPU; tensor->backend = GGML_BACKEND_TYPE_GPU;
if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) { if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU) {
const ggml_op src0_op = tensor->src[0]->op; const ggml_op src0_op = tensor->src[0]->op;
if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) { if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
ggml_sycl_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc); ggml_sycl_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
} }
} }
if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) { if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU) {
ggml_sycl_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc); ggml_sycl_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
} }
@ -14215,7 +14042,7 @@ static void ggml_sycl_assign_buffers_impl(struct ggml_tensor *tensor,
SYCL_CHECK(ggml_sycl_set_device(g_main_device)); SYCL_CHECK(ggml_sycl_set_device(g_main_device));
const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0]; const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0];
if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) { if (inplace && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) {
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra; ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index]; char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index];
size_t offset = 0; size_t offset = 0;
@ -14284,7 +14111,7 @@ void ggml_sycl_assign_scratch_offset(struct ggml_tensor *tensor,
const bool inplace = tensor->view_src != nullptr; const bool inplace = tensor->view_src != nullptr;
if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) { if (inplace && (tensor->view_src->backend == GGML_BACKEND_TYPE_GPU || tensor->view_src->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) {
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra; ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra;
char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index]; char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index];
size_t view_offset = 0; size_t view_offset = 0;
@ -14305,7 +14132,7 @@ catch (sycl::exception const &exc) {
} }
void ggml_sycl_copy_to_device(struct ggml_tensor *tensor) try { void ggml_sycl_copy_to_device(struct ggml_tensor *tensor) try {
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
GGML_ASSERT(ggml_is_contiguous(tensor)); GGML_ASSERT(ggml_is_contiguous(tensor));
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
@ -14392,9 +14219,9 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
if (!g_sycl_loaded) return false; if (!g_sycl_loaded) return false;
ggml_sycl_func_t func; ggml_sycl_func_t func;
const bool any_on_device = tensor->backend == GGML_BACKEND_GPU const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU
|| (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
|| (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU); || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);
if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) { if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
return false; return false;
@ -14532,14 +14359,14 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
return false; return false;
} }
if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) { if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
ggml_sycl_set_peer_access(tensor->src[1]->ne[1]); ggml_sycl_set_peer_access(tensor->src[1]->ne[1]);
} }
if (params->ith != 0) { if (params->ith != 0) {
return true; return true;
} }
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return true; return true;
} }
func(tensor->src[0], tensor->src[1], tensor); func(tensor->src[0], tensor->src[1], tensor);
@ -14690,7 +14517,7 @@ static void ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
extra->data_device[ctx->device] = tensor->data; extra->data_device[ctx->device] = tensor->data;
tensor->backend = GGML_BACKEND_GPU; tensor->backend = GGML_BACKEND_TYPE_GPU;
tensor->extra = extra; tensor->extra = extra;
if (ggml_is_quantized(tensor->type)) { if (ggml_is_quantized(tensor->type)) {
@ -14721,7 +14548,7 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
ggml_tensor *tensor, ggml_tensor *tensor,
const void *data, size_t offset, const void *data, size_t offset,
size_t size) try { size_t size) try {
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
@ -14746,7 +14573,7 @@ static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer,
const ggml_tensor *tensor, const ggml_tensor *tensor,
void *data, size_t offset, void *data, size_t offset,
size_t size) try { size_t size) try {
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
@ -14815,7 +14642,8 @@ GGML_CALL static const char * ggml_backend_sycl_buffer_type_name(ggml_backend_bu
static ggml_backend_buffer_t static ggml_backend_buffer_t
ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
size_t size) try { size_t size) try {
int device = (int) (intptr_t) buft->context; ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
int device = (int) buft_ctx->device;
ggml_sycl_set_device(device); ggml_sycl_set_device(device);
int device_index = get_device_index_by_id(device); int device_index = get_device_index_by_id(device);
@ -14893,7 +14721,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) { for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) {
ggml_backend_sycl_buffer_types[i] = { ggml_backend_sycl_buffer_types[i] = {
/* .iface = */ ggml_backend_sycl_buffer_type_interface, /* .iface = */ ggml_backend_sycl_buffer_type_interface,
/* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i, /* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i)},
}; };
} }
ggml_backend_sycl_buffer_type_initialized = true; ggml_backend_sycl_buffer_type_initialized = true;
@ -14955,10 +14783,6 @@ ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
// backend // backend
struct ggml_backend_context_sycl {
int device;
};
static const char * ggml_backend_sycl_name(ggml_backend_t backend) { static const char * ggml_backend_sycl_name(ggml_backend_t backend) {
return GGML_SYCL_NAME; return GGML_SYCL_NAME;
@ -14966,14 +14790,14 @@ static const char * ggml_backend_sycl_name(ggml_backend_t backend) {
} }
static void ggml_backend_sycl_free(ggml_backend_t backend) { static void ggml_backend_sycl_free(ggml_backend_t backend) {
ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context; ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
delete sycl_ctx; delete sycl_ctx;
delete backend; delete backend;
} }
static ggml_backend_buffer_type_t ggml_backend_sycl_get_default_buffer_type(ggml_backend_t backend) { static ggml_backend_buffer_type_t ggml_backend_sycl_get_default_buffer_type(ggml_backend_t backend) {
ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context; ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
return ggml_backend_sycl_buffer_type(sycl_ctx->device); return ggml_backend_sycl_buffer_type(sycl_ctx->device);
} }
@ -14982,10 +14806,10 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
ggml_tensor *tensor, ggml_tensor *tensor,
const void *data, size_t offset, const void *data, size_t offset,
size_t size) try { size_t size) try {
ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context; ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type"); GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy( SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy(
(char *)tensor->data + offset, data, size))); (char *)tensor->data + offset, data, size)));
@ -15000,10 +14824,10 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
const ggml_tensor *tensor, const ggml_tensor *tensor,
void *data, size_t offset, void *data, size_t offset,
size_t size) try { size_t size) try {
ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context; ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type"); GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy( SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy(
data, (const char *)tensor->data + offset, size))); data, (const char *)tensor->data + offset, size)));
@ -15015,7 +14839,7 @@ catch (sycl::exception const &exc) {
} }
static void ggml_backend_sycl_synchronize(ggml_backend_t backend) try { static void ggml_backend_sycl_synchronize(ggml_backend_t backend) try {
ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context; ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->wait())); SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->wait()));
@ -15051,12 +14875,12 @@ static void ggml_backend_sycl_graph_plan_compute(ggml_backend_t backend, ggml_ba
} }
static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context; ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
ggml_sycl_set_main_device(sycl_ctx->device); ggml_sycl_set_main_device(sycl_ctx->device);
ggml_compute_params params = {}; ggml_compute_params params = {};
params.type = GGML_TASK_COMPUTE; params.type = GGML_TASK_TYPE_COMPUTE;
params.ith = 0; params.ith = 0;
for (int i = 0; i < cgraph->n_nodes; i++) { for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i]; ggml_tensor * node = cgraph->nodes[i];
@ -15064,13 +14888,13 @@ static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph
if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
continue; continue;
assert(node->backend == GGML_BACKEND_GPU); assert(node->backend == GGML_BACKEND_TYPE_GPU);
assert(node->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device)); assert(node->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device));
assert(node->extra != nullptr); assert(node->extra != nullptr);
for (int j = 0; j < GGML_MAX_SRC; j++) { for (int j = 0; j < GGML_MAX_SRC; j++) {
if (node->src[j] != nullptr) { if (node->src[j] != nullptr) {
assert(node->src[j]->backend == GGML_BACKEND_GPU); assert(node->src[j]->backend == GGML_BACKEND_TYPE_GPU);
assert(node->src[j]->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device)); assert(node->src[j]->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device));
assert(node->src[j]->extra != nullptr); assert(node->src[j]->extra != nullptr);
} }
@ -15140,6 +14964,12 @@ static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_ten
return false; return false;
} }
if (a->type == GGML_TYPE_IQ1_S) {
return false;
}
if (a->type == GGML_TYPE_IQ3_XXS) {
return false;
}
if (a->type == GGML_TYPE_IQ2_XXS) { if (a->type == GGML_TYPE_IQ2_XXS) {
return false; return false;
} }
@ -15259,8 +15089,9 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
// not strictly necessary, but it may reduce the overhead of the first graph_compute // not strictly necessary, but it may reduce the overhead of the first graph_compute
ggml_sycl_set_main_device(device); ggml_sycl_set_main_device(device);
ggml_backend_context_sycl * ctx = new ggml_backend_context_sycl { ggml_backend_sycl_context * ctx = new ggml_backend_sycl_context {
/* .device = */ device /* .device = */ device,
/* .name = */ GGML_SYCL_NAME + std::to_string(device),
}; };
ggml_backend_t sycl_backend = new ggml_backend { ggml_backend_t sycl_backend = new ggml_backend {
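Most of the SYCL changes above are mechanical enum renames (GGML_BACKEND_GPU becomes GGML_BACKEND_TYPE_GPU, GGML_TASK_INIT becomes GGML_TASK_TYPE_INIT, and so on), but one structural change is worth noting: the buffer-type and backend contexts stop smuggling the device index through the void* context pointer as an intptr_t cast and instead point at a heap-allocated context struct that also carries a printable name. A minimal standalone sketch of that pattern follows; the struct and function names here are illustrative only, not the ggml API.

#include <cstdio>
#include <string>

// Hypothetical stand-ins for the backend context types used above.
struct backend_context {
    int         device;
    std::string name;
};

struct backend {
    void * context;  // opaque pointer owned by the backend
};

static backend make_backend(int device) {
    // One small allocation per backend instead of an intptr_t cast;
    // must be paired with release_backend() to avoid a leak.
    return backend{ new backend_context{ device, "SYCL" + std::to_string(device) } };
}

static void release_backend(backend & b) {
    delete static_cast<backend_context *>(b.context);
    b.context = nullptr;
}

int main() {
    backend b = make_backend(0);
    const auto * ctx = static_cast<const backend_context *>(b.context);
    std::printf("%s -> device %d\n", ctx->name.c_str(), ctx->device);
    release_backend(b);
    return 0;
}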


@@ -27,6 +27,7 @@
 #define CEIL_DIV(M, N) (((M) + (N)-1) / (N))
 #define VK_VENDOR_ID_AMD 0x1002
+#define VK_VENDOR_ID_APPLE 0x106b
 #define VK_VENDOR_ID_INTEL 0x8086
 #define VK_VENDOR_ID_NVIDIA 0x10de
@@ -773,9 +774,21 @@ static void ggml_vk_queue_cleanup(ggml_backend_vk_context * ctx, vk_queue& q) {
 q.cmd_buffer_idx = 0;
 }
-static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags) {
+static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_props, vk::MemoryRequirements* mem_req, vk::MemoryPropertyFlags flags) {
+for (uint32_t i = 0; i < mem_props->memoryTypeCount; ++i) {
+vk::MemoryType memory_type = mem_props->memoryTypes[i];
+if ((mem_req->memoryTypeBits & ((uint64_t)1 << i)) &&
+(flags & memory_type.propertyFlags) == flags &&
+mem_props->memoryHeaps[memory_type.heapIndex].size >= mem_req->size) {
+return static_cast<int32_t>(i);
+}
+}
+return UINT32_MAX;
+}
+static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
 #ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ")" << std::endl;
+std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl;
 #endif
 vk_buffer buf = std::make_shared<vk_buffer_struct>();
@@ -802,15 +815,15 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
 uint32_t memory_type_index = UINT32_MAX;
-for (uint32_t i = 0; i < mem_props.memoryTypeCount; ++i) {
-vk::MemoryType memory_type = mem_props.memoryTypes[i];
-if ((mem_req.memoryTypeBits & ((uint64_t)1 << i)) && (req_flags & memory_type.propertyFlags) == req_flags && mem_props.memoryHeaps[memory_type.heapIndex].size >= mem_req.size) {
-memory_type_index = i;
-break;
-}
+memory_type_index = find_properties(&mem_props, &mem_req, req_flags);
+buf->memory_property_flags = req_flags;
+if (memory_type_index == UINT32_MAX && fallback_flags) {
+memory_type_index = find_properties(&mem_props, &mem_req, fallback_flags);
+buf->memory_property_flags = fallback_flags;
 }
-if (memory_type_index >= mem_props.memoryTypeCount) {
+if (memory_type_index == UINT32_MAX) {
 ctx->device->device.destroyBuffer(buf->buffer);
 buf->size = 0;
 throw vk::OutOfDeviceMemoryError("No suitable memory type found");
@@ -824,10 +837,9 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
 buf->size = 0;
 throw e;
 }
-buf->memory_property_flags = req_flags;
 buf->ptr = nullptr;
-if (req_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
+if (buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
 buf->ptr = ctx->device->device.mapMemory(buf->device_memory, 0, VK_WHOLE_SIZE);
 }
@@ -844,9 +856,9 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
 return buf;
 }
-static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags) {
+static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
 try {
-return ggml_vk_create_buffer(ctx, size, req_flags);
+return ggml_vk_create_buffer(ctx, size, req_flags, fallback_flags);
 } catch (const vk::SystemError& e) {
 std::cerr << "ggml_vulkan: Memory allocation of size " << size << " failed." << std::endl;
 std::cerr << "ggml_vulkan: " << e.what() << std::endl;
@@ -857,16 +869,16 @@ static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size
 static vk_buffer ggml_vk_create_buffer_device(ggml_backend_vk_context * ctx, size_t size) {
 vk_buffer buf;
 try {
-buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal);
-} catch (const vk::SystemError& e) {
 if (ctx->device->uma) {
 // Fall back to host memory type
-buf = ggml_vk_create_buffer_check(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
 } else {
-std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." << std::endl;
-std::cerr << "ggml_vulkan: " << e.what() << std::endl;
-throw e;
+buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal);
 }
+} catch (const vk::SystemError& e) {
+std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." << std::endl;
+std::cerr << "ggml_vulkan: " << e.what() << std::endl;
+throw e;
 }
 return buf;
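The allocation hunks above turn buffer creation into a two-pass search: try the required memory property flags first, and only if no memory type satisfies them retry with weaker fallback flags. This is what lets UMA devices fall back from device-local to host-visible memory, and pinned host buffers degrade from cached to merely coherent memory, without duplicating the selection loop. A standalone model of that search, with plain structs and flag bits standing in for the Vulkan-Hpp types (values chosen for illustration only):

#include <cstdint>
#include <cstdio>
#include <vector>

struct memory_type {
    uint32_t property_flags;  // bitmask of memory properties
    uint64_t heap_size;       // size of the heap backing this type
};

// Mirror of the find_properties() logic: the type must be allowed by the
// allocation's memory_type_bits, expose all requested flags, and sit on a
// heap large enough for the request.
static uint32_t find_properties(const std::vector<memory_type> & types,
                                uint32_t memory_type_bits, uint64_t size,
                                uint32_t flags) {
    for (uint32_t i = 0; i < types.size(); ++i) {
        if ((memory_type_bits & (1u << i)) &&
            (types[i].property_flags & flags) == flags &&
            types[i].heap_size >= size) {
            return i;
        }
    }
    return UINT32_MAX;
}

int main() {
    // Type 0: device-local (0x1); type 1: host-visible|host-coherent (0x6).
    const std::vector<memory_type> types = { { 0x1, 1ull << 30 }, { 0x6, 1ull << 28 } };

    const uint32_t required = 0x6 | 0x8;  // host-visible|coherent|cached
    const uint32_t fallback = 0x6;        // drop the cached requirement

    uint32_t idx = find_properties(types, 0x3, 4096, required);
    if (idx == UINT32_MAX) {
        idx = find_properties(types, 0x3, 4096, fallback);  // second pass
    }
    std::printf("selected memory type %u\n", idx);  // prints 1
    return 0;
}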
@@ -1155,6 +1167,9 @@ static void ggml_vk_print_gpu_info(size_t idx) {
 }
 }
+static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
+static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
 void ggml_vk_instance_init() {
 if (vk_instance_initialized) {
 return;
@@ -1164,28 +1179,42 @@ void ggml_vk_instance_init() {
 #endif
 vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, VK_API_VERSION };
-const std::vector<const char*> layers = {
-#ifdef GGML_VULKAN_VALIDATE
-"VK_LAYER_KHRONOS_validation",
-#endif
-};
-const std::vector<const char*> extensions = {
-#ifdef GGML_VULKAN_VALIDATE
-"VK_EXT_validation_features",
-#endif
-};
-vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags(), &app_info, layers, extensions);
-#ifdef GGML_VULKAN_VALIDATE
-const std::vector<vk::ValidationFeatureEnableEXT> features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices };
-vk::ValidationFeaturesEXT validation_features = {
-features_enable,
-{},
-};
-validation_features.setPNext(nullptr);
-instance_create_info.setPNext(&validation_features);
-std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
-#endif
+const std::vector<vk::ExtensionProperties> instance_extensions = vk::enumerateInstanceExtensionProperties();
+const bool validation_ext = ggml_vk_instance_validation_ext_available(instance_extensions);
+const bool portability_enumeration_ext = ggml_vk_instance_portability_enumeration_ext_available(instance_extensions);
+std::vector<const char*> layers;
+if (validation_ext) {
+layers.push_back("VK_LAYER_KHRONOS_validation");
+}
+std::vector<const char*> extensions;
+if (validation_ext) {
+extensions.push_back("VK_EXT_validation_features");
+}
+if (portability_enumeration_ext) {
+extensions.push_back("VK_KHR_portability_enumeration");
+}
+vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags{}, &app_info, layers, extensions);
+if (portability_enumeration_ext) {
+instance_create_info.flags |= vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR;
+}
+std::vector<vk::ValidationFeatureEnableEXT> features_enable;
+vk::ValidationFeaturesEXT validation_features;
+if (validation_ext) {
+features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices };
+validation_features = {
+features_enable,
+{},
+};
+validation_features.setPNext(nullptr);
+instance_create_info.setPNext(&validation_features);
+std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
+}
 vk_instance.instance = vk::createInstance(instance_create_info);
 memset(vk_instance.initialized, 0, sizeof(bool) * GGML_VK_MAX_DEVICES);
@@ -1214,7 +1243,7 @@ void ggml_vk_instance_init() {
 vk_instance_initialized = true;
 }
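The instance-init hunk replaces the compile-time GGML_VULKAN_VALIDATE blocks with a runtime probe: enumerate the instance extensions once, then enable the validation layer and the portability-enumeration extension only when they are actually present (the latter is what lets portability drivers such as MoltenVK show up during device enumeration). A sketch of that probe under hypothetical names; the real code does the lookup through vk::enumerateInstanceExtensionProperties():

#include <cstring>
#include <iostream>
#include <string>
#include <vector>

// Linear scan over the reported extension names, as the two
// ggml_vk_instance_*_ext_available() helpers declared above would do.
static bool extension_available(const std::vector<std::string> & exts, const char * name) {
    for (const auto & e : exts) {
        if (std::strcmp(e.c_str(), name) == 0) {
            return true;
        }
    }
    return false;
}

int main() {
    // Pretend this list came back from the driver.
    const std::vector<std::string> exts = { "VK_KHR_surface", "VK_EXT_validation_features" };

    std::vector<const char *> layers;
    std::vector<const char *> extensions;
    if (extension_available(exts, "VK_EXT_validation_features")) {
        layers.push_back("VK_LAYER_KHRONOS_validation");
        extensions.push_back("VK_EXT_validation_features");
    }
    if (extension_available(exts, "VK_KHR_portability_enumeration")) {
        extensions.push_back("VK_KHR_portability_enumeration");
    }

    for (const char * e : extensions) {
        std::cout << "enabling " << e << "\n";
    }
    std::cout << layers.size() << " validation layer(s) requested\n";
    return 0;
}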
-void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
+static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
 GGML_ASSERT(idx < vk_instance.device_indices.size());
 size_t dev_num = vk_instance.device_indices[idx];
 #ifdef GGML_VULKAN_DEBUG
@@ -1232,12 +1261,12 @@ void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
 ctx->device = ggml_vk_get_device(idx);
 if (!ctx->device->initialized) {
 ctx->device->physical_device = devices[dev_num];
-std::vector<vk::ExtensionProperties> ext_props = ctx->device->physical_device.enumerateDeviceExtensionProperties();
+const std::vector<vk::ExtensionProperties> ext_props = ctx->device->physical_device.enumerateDeviceExtensionProperties();
 bool maintenance4_support = false;
 // Check if maintenance4 is supported
-for (auto properties : ext_props) {
+for (const auto& properties : ext_props) {
 if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) {
 maintenance4_support = true;
 }
@@ -1268,6 +1297,26 @@ void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
 bool fp16_storage = false;
 bool fp16_compute = false;
+for (const auto& properties : ext_props) {
+if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) {
+fp16_storage = true;
+} else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) {
+fp16_compute = true;
+}
+}
+ctx->device->physical_device.getProperties2(&props2);
+ctx->device->properties = props2.properties;
+if (maintenance4_support) {
+ctx->device->max_memory_allocation_size = std::min(props3.maxMemoryAllocationSize, props4.maxBufferSize);
+} else {
+ctx->device->max_memory_allocation_size = props3.maxMemoryAllocationSize;
+}
+ctx->device->vendor_id = ctx->device->properties.vendorID;
+ctx->device->subgroup_size = subgroup_props.subgroupSize;
+ctx->device->uma = ctx->device->properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu;
 for (auto properties : ext_props) {
 if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) {
 fp16_storage = true;
@@ -1505,7 +1554,9 @@ static void * ggml_vk_host_malloc(ggml_backend_vk_context * ctx, size_t size) {
 #ifdef GGML_VULKAN_DEBUG
 std::cerr << "ggml_vk_host_malloc(" << size << ")" << std::endl;
 #endif
-vk_buffer buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
+vk_buffer buf = ggml_vk_create_buffer(ctx, size,
+vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
+vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
 if(!(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible)) {
 fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory\n",
@@ -1651,7 +1702,9 @@ static void deferred_memcpy(void * dst, const void * src, size_t size, std::vect
 static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, size_t size) {
 if (ctx->sync_staging == nullptr || ctx->sync_staging->size < size) {
 ggml_vk_destroy_buffer(ctx->sync_staging);
-ctx->sync_staging = ggml_vk_create_buffer_check(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
+ctx->sync_staging = ggml_vk_create_buffer_check(ctx, size,
+vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
+vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
 }
 }
@@ -1988,8 +2041,6 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr
 // Copy within the device
 ggml_backend_vk_context * ctx = src->ctx;
-VkBufferCopy bc{ src_offset, dst_offset, size };
 vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
 ggml_vk_ctx_begin(ctx, subctx);
 ggml_vk_buffer_copy_async(subctx, dst, dst_offset, src, src_offset, size);
@@ -2120,18 +2171,100 @@ static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ct
 return ctx->device->pipeline_matmul_f32_aligned_l->align;
 }
-static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) {
-#ifdef GGML_VULKAN_DEBUG
-std::cerr << "ggml_vk_guess_matmul_pipeline(" << bit16_x << ", " << bit16_y << ", " << m << ", " << n << ", " << aligned << ")";
-#endif
+static vk_pipeline ggml_vk_guess_matmul_pipeline_amd(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) {
 if (bit16_x && bit16_y) {
-if (ctx->device->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) {
+if (m <= 32 || n <= 32) {
 #ifdef GGML_VULKAN_DEBUG
 std::cerr << " S" << std::endl;
 #endif
 return aligned ? ctx->device->pipeline_matmul_f16_aligned_s : ctx->device->pipeline_matmul_f16_s;
 }
-if (ctx->device->subgroup_size == 64 || m <= 64 || n <= 64) {
+#ifdef GGML_VULKAN_DEBUG
+std::cerr << " M" << std::endl;
+#endif
+return aligned ? ctx->device->pipeline_matmul_f16_aligned_m : ctx->device->pipeline_matmul_f16_m;
+}
+if (bit16_x && !bit16_y) {
+if (m <= 32 || n <= 32) {
+#ifdef GGML_VULKAN_DEBUG
+std::cerr << " S" << std::endl;
+#endif
+return aligned ? ctx->device->pipeline_matmul_f16_f32_aligned_s : ctx->device->pipeline_matmul_f16_f32_s;
+}
+#ifdef GGML_VULKAN_DEBUG
+std::cerr << " M" << std::endl;
+#endif
+return aligned ? ctx->device->pipeline_matmul_f16_f32_aligned_m : ctx->device->pipeline_matmul_f16_f32_m;
+}
+if (!bit16_x && bit16_y) {
+GGML_ASSERT(false);
+}
+if (m <= 32 || n <= 32) {
+#ifdef GGML_VULKAN_DEBUG
+std::cerr << " S" << std::endl;
+#endif
+return aligned ? ctx->device->pipeline_matmul_f32_aligned_s : ctx->device->pipeline_matmul_f32_s;
+}
+#ifdef GGML_VULKAN_DEBUG
+std::cerr << " M" << std::endl;
+#endif
+return aligned ? ctx->device->pipeline_matmul_f32_aligned_m : ctx->device->pipeline_matmul_f32_m;
+}
+static vk_pipeline ggml_vk_guess_matmul_pipeline_apple(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, bool aligned) {
+#ifdef GGML_VULKAN_DEBUG
+std::cerr << " M" << std::endl;
+#endif
+if (bit16_x && bit16_y) {
+return aligned ? ctx->device->pipeline_matmul_f16_aligned_m : ctx->device->pipeline_matmul_f16_m;
+}
+if (bit16_x && !bit16_y) {
+return aligned ? ctx->device->pipeline_matmul_f16_f32_aligned_m : ctx->device->pipeline_matmul_f16_f32_m;
+}
+if (!bit16_x && bit16_y) {
+GGML_ASSERT(false);
+}
+return aligned ? ctx->device->pipeline_matmul_f32_aligned_m : ctx->device->pipeline_matmul_f32_m;
+}
+static vk_pipeline ggml_vk_guess_matmul_pipeline_intel(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, bool aligned) {
+#ifdef GGML_VULKAN_DEBUG
+std::cerr << " S" << std::endl;
+#endif
+if (bit16_x && bit16_y) {
+return aligned ? ctx->device->pipeline_matmul_f16_aligned_s : ctx->device->pipeline_matmul_f16_s;
+}
+if (bit16_x && !bit16_y) {
+return aligned ? ctx->device->pipeline_matmul_f16_f32_aligned_s : ctx->device->pipeline_matmul_f16_f32_s;
+}
+if (!bit16_x && bit16_y) {
+GGML_ASSERT(false);
+}
+return aligned ? ctx->device->pipeline_matmul_f32_aligned_s : ctx->device->pipeline_matmul_f32_s;
+}
+static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) {
+#ifdef GGML_VULKAN_DEBUG
+std::cerr << "ggml_vk_guess_matmul_pipeline(" << bit16_x << ", " << bit16_y << ", " << m << ", " << n << ", " << aligned << ")";
+#endif
+switch (ctx->device->vendor_id) {
+case VK_VENDOR_ID_AMD:
+return ggml_vk_guess_matmul_pipeline_amd(ctx, bit16_x, bit16_y, m, n, aligned);
+case VK_VENDOR_ID_APPLE:
+return ggml_vk_guess_matmul_pipeline_apple(ctx, bit16_x, bit16_y, aligned);
+case VK_VENDOR_ID_INTEL:
+return ggml_vk_guess_matmul_pipeline_intel(ctx, bit16_x, bit16_y, aligned);
+}
+if (bit16_x && bit16_y) {
+if (m <= 32 || n <= 32) {
+#ifdef GGML_VULKAN_DEBUG
+std::cerr << " S" << std::endl;
+#endif
+return aligned ? ctx->device->pipeline_matmul_f16_aligned_s : ctx->device->pipeline_matmul_f16_s;
+}
+if (m <= 64 || n <= 64) {
 #ifdef GGML_VULKAN_DEBUG
 std::cerr << " M" << std::endl;
 #endif
@@ -2143,13 +2276,13 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
 return aligned ? ctx->device->pipeline_matmul_f16_aligned_l : ctx->device->pipeline_matmul_f16_l;
 }
 if (bit16_x && !bit16_y) {
-if (ctx->device->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) {
+if (m <= 32 || n <= 32) {
 #ifdef GGML_VULKAN_DEBUG
 std::cerr << " S" << std::endl;
 #endif
 return aligned ? ctx->device->pipeline_matmul_f16_f32_aligned_s : ctx->device->pipeline_matmul_f16_f32_s;
 }
-if (ctx->device->subgroup_size == 64 || m <= 64 || n <= 64) {
+if (m <= 64 || n <= 64) {
 #ifdef GGML_VULKAN_DEBUG
 std::cerr << " M" << std::endl;
 #endif
@@ -2164,13 +2297,13 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
 GGML_ASSERT(false);
 }
-if (ctx->device->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) {
+if (m <= 32 || n <= 32) {
 #ifdef GGML_VULKAN_DEBUG
 std::cerr << " S" << std::endl;
 #endif
 return aligned ? ctx->device->pipeline_matmul_f32_aligned_s : ctx->device->pipeline_matmul_f32_s;
 }
-if (ctx->device->subgroup_size == 64 || m <= 64 || n <= 64) {
+if (m <= 64 || n <= 64) {
 #ifdef GGML_VULKAN_DEBUG
 std::cerr << " M" << std::endl;
 #endif
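The rewrite above replaces per-branch vendor checks (the old VK_VENDOR_ID_INTEL and subgroup_size == 64 conditions scattered through one function) with a vendor dispatch: AMD keeps the size-dependent small/medium choice, Apple is pinned to the medium tile, Intel to the small one, and everything else falls through to the generic small/medium/large logic. A compressed sketch of that shape, reducing the pipeline choice to a tile-size enum and ignoring the f16/f32 operand variants:

#include <cstdint>
#include <cstdio>

enum class tile { S, M, L };

static tile pick_amd(int m, int n)   { return (m <= 32 || n <= 32) ? tile::S : tile::M; }
static tile pick_apple()             { return tile::M; }   // always medium
static tile pick_intel()             { return tile::S; }   // always small

static tile pick_default(int m, int n) {
    if (m <= 32 || n <= 32) return tile::S;
    if (m <= 64 || n <= 64) return tile::M;
    return tile::L;
}

static tile pick(uint32_t vendor_id, int m, int n) {
    switch (vendor_id) {
        case 0x1002: return pick_amd(m, n);    // VK_VENDOR_ID_AMD
        case 0x106b: return pick_apple();      // VK_VENDOR_ID_APPLE
        case 0x8086: return pick_intel();      // VK_VENDOR_ID_INTEL
        default:     return pick_default(m, n);
    }
}

int main() {
    // AMD small matrix, Apple anything, NVIDIA large matrix.
    std::printf("%d %d %d\n", (int)pick(0x1002, 16, 512),
                              (int)pick(0x106b, 512, 512),
                              (int)pick(0x10de, 512, 512));
    return 0;
}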
@ -2225,13 +2358,12 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_
GGML_ASSERT(false); GGML_ASSERT(false);
} }
static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out, ggml_type buffer_type) { static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) {
#ifdef GGML_VULKAN_DEBUG #ifdef GGML_VULKAN_DEBUG
std::cerr << "ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", backend=" << tensor->backend << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), "; std::cerr << "ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", backend=" << tensor->backend << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")" << std::endl; std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")" << std::endl;
#endif #endif
const int tensor_type_size = ggml_type_size(tensor->type); const int tensor_type_size = ggml_type_size(tensor->type);
const int dst_type_size = ggml_type_size(buffer_type);
const uint32_t ne = ggml_nelements(tensor); const uint32_t ne = ggml_nelements(tensor);
@ -2290,8 +2422,8 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
src1_uma = d_Qy != nullptr; src1_uma = d_Qy != nullptr;
} }
const bool load_x = src0->backend != GGML_BACKEND_GPU && !src0_uma; const bool load_x = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma; const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0); const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1); const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
@ -2389,7 +2521,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
} }
if (x_non_contig) { if (x_non_contig) {
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE }, dst->type); ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
} else if (load_x || qx_needs_dequant) { } else if (load_x || qx_needs_dequant) {
if (load_x) { if (load_x) {
// copy data to device // copy data to device
@ -2404,7 +2536,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
} }
} }
if (y_non_contig) { if (y_non_contig) {
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, dst->type); ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
} else if (load_y) { } else if (load_y) {
ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1)); ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1));
} }
@ -2423,7 +2555,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
// compute // compute
ggml_vk_matmul(ctx, subctx, pipeline, { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne12 * ne13 }, { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, ne12*ne13, ne02, ne12, r2, r3, stride_batch_x, stride_batch_y, ne20*ne21); // NOLINT ggml_vk_matmul(ctx, subctx, pipeline, { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne12 * ne13 }, { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, ne12*ne13, ne02, ne12, r2, r3, stride_batch_x, stride_batch_y, ne20*ne21); // NOLINT
if (dst->backend == GGML_BACKEND_CPU) { if (dst->backend == GGML_BACKEND_TYPE_CPU) {
// copy dst to host // copy dst to host
float * d = (float *) ((char *) dst->data); float * d = (float *) ((char *) dst->data);
ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, sizeof(float) * d_ne * ne12 * ne13); ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, sizeof(float) * d_ne * ne12 * ne13);
@ -2476,8 +2608,8 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
src1_uma = d_Qy != nullptr; src1_uma = d_Qy != nullptr;
} }
const bool load_x = src0->backend != GGML_BACKEND_GPU && !src0_uma; const bool load_x = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma; const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0); const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1); const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
@ -2559,14 +2691,14 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
if (x_non_contig) { if (x_non_contig) {
GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment)); GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment));
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE }, src0->type); ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
} else if (load_x) { } else if (load_x) {
// copy data to device // copy data to device
ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qx, 0, src0, 0, 0, ggml_nrows(src0)); ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qx, 0, src0, 0, 0, ggml_nrows(src0));
} }
if (y_non_contig) { if (y_non_contig) {
GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne); GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, src1->type); ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
} else if (load_y) { } else if (load_y) {
ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1)); ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1));
} }
@ -2600,7 +2732,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
ggml_vk_sync_buffers(subctx); ggml_vk_sync_buffers(subctx);
ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, { { d_X, x_offset, x_sz }, { d_Y, y_buffer_offset, y_sz + y_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 3 * sizeof(int), &pc, { (uint32_t)ne01, 1, 1}); ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, { { d_X, x_offset, x_sz }, { d_Y, y_buffer_offset, y_sz + y_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 3 * sizeof(int), &pc, { (uint32_t)ne01, 1, 1});
if (dst->backend == GGML_BACKEND_CPU) { if (dst->backend == GGML_BACKEND_TYPE_CPU) {
// copy dst to host // copy dst to host
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
ggml_vk_sync_buffers(subctx); ggml_vk_sync_buffers(subctx);
@ -2617,7 +2749,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl; std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
#endif #endif
GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1)); GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
GGML_ASSERT(src0->backend == GGML_BACKEND_GPU); GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT
GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT
GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src0->type == GGML_TYPE_F16);
@ -2649,7 +2781,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
src1_uma = d_Qy != nullptr; src1_uma = d_Qy != nullptr;
} }
const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma; const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
const uint64_t x_ne = ne00 * ne01 * ne02; const uint64_t x_ne = ne00 * ne01 * ne02;
const uint64_t y_ne = ne10 * ne11 * ne12; const uint64_t y_ne = ne10 * ne11 * ne12;
@ -2691,7 +2823,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
ggml_vk_sync_buffers(subctx); ggml_vk_sync_buffers(subctx);
ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
if (dst->backend == GGML_BACKEND_CPU) { if (dst->backend == GGML_BACKEND_TYPE_CPU) {
// copy dst to host // copy dst to host
float * d = (float *) dst->data; float * d = (float *) dst->data;
ggml_vk_sync_buffers(subctx); ggml_vk_sync_buffers(subctx);
@ -2708,7 +2840,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src0));
GGML_ASSERT(!ggml_is_transposed(src1)); GGML_ASSERT(!ggml_is_transposed(src1));
GGML_ASSERT(!ggml_is_permuted(src0)); GGML_ASSERT(!ggml_is_permuted(src0));
GGML_ASSERT(src0->backend == GGML_BACKEND_GPU); GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT(src1->type == GGML_TYPE_F32);
@ -2741,7 +2873,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
src1_uma = d_Qy != nullptr; src1_uma = d_Qy != nullptr;
} }
const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma; const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
const uint64_t d_ne = ne01 * ne11 * ne12; const uint64_t d_ne = ne01 * ne11 * ne12;
@ -2784,7 +2916,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
ggml_vk_sync_buffers(subctx); ggml_vk_sync_buffers(subctx);
ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
if (dst->backend == GGML_BACKEND_CPU) { if (dst->backend == GGML_BACKEND_TYPE_CPU) {
// copy dst to host // copy dst to host
float * d = (float *) dst->data; float * d = (float *) dst->data;
ggml_vk_sync_buffers(subctx); ggml_vk_sync_buffers(subctx);
@ -2802,7 +2934,7 @@ static bool ggml_vk_can_mul_mat(const ggml_tensor * src0, const ggml_tensor * sr
return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
(src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16 || ggml_is_quantized(src1->type)) && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16 || ggml_is_quantized(src1->type)) &&
dst->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU); ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_TYPE_GPU);
} }
static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
@ -2820,9 +2952,9 @@ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx,
} }
} }
static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context * subctx, const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { // static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context * subctx, const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
//
} // }
static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
// guaranteed to be an integer due to the check in ggml_can_repeat // guaranteed to be an integer due to the check in ggml_can_repeat
@ -2854,8 +2986,8 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
// TODO: support for transposed / permuted tensors // TODO: support for transposed / permuted tensors
GGML_ASSERT(nb0 == sizeof(float)); GGML_ASSERT(nb0 == sizeof(float));
GGML_ASSERT(nb00 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float));
GGML_ASSERT(src0->backend == GGML_BACKEND_GPU); GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
GGML_ASSERT(dst->backend == GGML_BACKEND_GPU); GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
@ -3088,8 +3220,8 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
} }
} }
const bool transfer_src0 = src0->backend != GGML_BACKEND_GPU && !src0_uma; const bool transfer_src0 = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma;
const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_GPU && !src1_uma; const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type) * ne0, ctx->device->properties.limits.minStorageBufferOffsetAlignment); uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type) * ne0, ctx->device->properties.limits.minStorageBufferOffsetAlignment);
uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0; uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0;
@ -3098,7 +3230,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
vk_buffer d_D = extra->buffer_gpu.lock(); vk_buffer d_D = extra->buffer_gpu.lock();
// Workaround for tiny tensor inputs on ROPE // Workaround for tiny tensor inputs on ROPE
if (use_src1 && src1->backend == GGML_BACKEND_GPU && y_sz > d_D->size) { if (use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU && y_sz > d_D->size) {
y_sz = VK_WHOLE_SIZE; y_sz = VK_WHOLE_SIZE;
} }
@ -3187,9 +3319,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
ggml_vk_sync_buffers(subctx); ggml_vk_sync_buffers(subctx);
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
} }
if (dst->backend == GGML_BACKEND_CPU && op == GGML_OP_CPY) { if (dst->backend == GGML_BACKEND_TYPE_CPU && op == GGML_OP_CPY) {
ggml_vk_d2h_tensor_2d(ctx, subctx, d_D, 0, dst); ggml_vk_d2h_tensor_2d(ctx, subctx, d_D, 0, dst);
} else if(dst->backend == GGML_BACKEND_CPU) { } else if(dst->backend == GGML_BACKEND_TYPE_CPU) {
// copy dst to host // copy dst to host
float * d = (float *) dst->data; float * d = (float *) dst->data;
ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, d_sz); ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, d_sz);
@ -3232,7 +3364,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
ggml_vk_sync_buffers(subctx); ggml_vk_sync_buffers(subctx);
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements); ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
} }
if (dst->backend == GGML_BACKEND_CPU) { if (dst->backend == GGML_BACKEND_TYPE_CPU) {
// copy dst to host // copy dst to host
ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset + d_offset, (char *) dst->data + i02*nb2 + i03*nb3, d_sz); ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset + d_offset, (char *) dst->data + i02*nb2 + i03*nb3, d_sz);
} }
@ -3259,7 +3391,8 @@ static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context * subctx, cons
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0, 0, 0,
0.0f, 0.0f,
}); });
} }
@ -3273,7 +3406,8 @@ static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context * subctx, cons
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0, 0, 0,
0.0f, 0.0f,
}); });
} }
@ -3387,12 +3521,12 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
int32_t * op_params = (int32_t *)dst->op_params;
-ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_ARGSORT, { (uint32_t)src0->ne[0], ((ggml_sort_order) op_params[0]) == GGML_SORT_ASC });
+ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_ARGSORT, { (uint32_t)src0->ne[0], ((ggml_sort_order) op_params[0]) == GGML_SORT_ORDER_ASC });
}
static void ggml_vk_nop(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
// If backend is CPU, data from src0 has to be copied off the device
-if (dst->backend == GGML_BACKEND_CPU) {
+if (dst->backend == GGML_BACKEND_TYPE_CPU) {
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
vk_buffer d_D = extra_src0->buffer_gpu.lock();
ggml_vk_sync_buffers(subctx);
@ -4198,9 +4332,9 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
#ifdef GGML_VULKAN_DEBUG
std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
#endif
-const bool any_on_device = node->backend == GGML_BACKEND_GPU
-    || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_GPU || node->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
-    || (node->src[1] != nullptr && (node->src[1]->backend == GGML_BACKEND_GPU));
+const bool any_on_device = node->backend == GGML_BACKEND_TYPE_GPU
+    || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
+    || (node->src[1] != nullptr && (node->src[1]->backend == GGML_BACKEND_TYPE_GPU));
if (ctx->disable || (!any_on_device && !ggml_vk_cpu_assist_op(node))) {
return;
@ -4320,7 +4454,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
std::cerr << "ggml_vk_preallocate_buffers(qx_size: " << ctx->prealloc_size_qx << " qy_size: " << ctx->prealloc_size_qy << " x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl; std::cerr << "ggml_vk_preallocate_buffers(qx_size: " << ctx->prealloc_size_qx << " qy_size: " << ctx->prealloc_size_qy << " x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
#endif #endif
#if defined(GGML_VULKAN_RUN_TESTS) #if defined(GGML_VULKAN_RUN_TESTS)
ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached); ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul,
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
ggml_vk_test_transfer(ctx, 8192 * 1000, false); ggml_vk_test_transfer(ctx, 8192 * 1000, false);
ggml_vk_test_transfer(ctx, 8192 * 1000, true); ggml_vk_test_transfer(ctx, 8192 * 1000, true);
@ -4417,14 +4553,16 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
if (ctx->staging != nullptr) {
ggml_vk_destroy_buffer(ctx->staging);
}
-ctx->staging = ggml_vk_create_buffer_check(ctx, ctx->staging_size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
+ctx->staging = ggml_vk_create_buffer_check(ctx, ctx->staging_size,
+    vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
+    vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
}
}
static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){
-const bool any_on_device = node->backend == GGML_BACKEND_GPU
-    || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_GPU || node->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
-    || (node->src[1] != nullptr && node->src[1]->backend == GGML_BACKEND_GPU);
+const bool any_on_device = node->backend == GGML_BACKEND_TYPE_GPU
+    || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
+    || (node->src[1] != nullptr && node->src[1]->backend == GGML_BACKEND_TYPE_GPU);
if (ctx->disable || (!any_on_device && !ggml_vk_cpu_assist_op(node)) || (ggml_vk_cpu_assist_op(node) && !any_on_device && !ggml_vk_can_mul_mat(node->src[0], node->src[1], node))) {
return;
@ -4571,7 +4709,9 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
break;
case GGML_OP_MUL_MAT_ID:
-ggml_vk_mul_mat_id(ctx, ctx->compute_ctx, src0, src1, node);
+//ggml_vk_mul_mat_id(ctx, ctx->compute_ctx, src0, src1, node);
+std::cerr << "ggml_vulkan: GGML_OP_MUL_MAT_ID not implemented yet." << std::endl;
+GGML_ASSERT(false);
break;
default:
@ -4587,7 +4727,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
last_node = true;
#endif
-if (node->backend == GGML_BACKEND_CPU || last_node) {
+if (node->backend == GGML_BACKEND_TYPE_CPU || last_node) {
ggml_vk_ctx_end(ctx->compute_ctx);
ctx->compute_ctx->exit_tensor = node;
ctx->compute_ctx = nullptr;
@ -4595,9 +4735,9 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
}
static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){
-const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
-    || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
-    || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
+const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU
+    || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
+    || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);
if (ctx->disable || (!any_on_device && !ggml_vk_cpu_assist_op(tensor))) {
return false;
@ -4660,7 +4800,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
if (params->ith != 0) {
return true;
}
-if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return true;
}
@ -4786,13 +4926,13 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
ctx->device->device.destroyFence(ctx->fence);
}
-GGML_CALL int ggml_vk_get_device_count() {
+GGML_CALL static int ggml_vk_get_device_count() {
ggml_vk_instance_init();
return vk_instance.device_indices.size();
}
-GGML_CALL void ggml_vk_get_device_description(int device, char * description, size_t description_size) {
+GGML_CALL static void ggml_vk_get_device_description(int device, char * description, size_t description_size) {
ggml_vk_instance_init();
std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
@ -4810,7 +4950,7 @@ void ggml_vk_init_cpu_assist() {
std::cerr << "ggml_vulkan: Found " << ggml_vk_get_device_count() << " Vulkan devices:" << std::endl; std::cerr << "ggml_vulkan: Found " << ggml_vk_get_device_count() << " Vulkan devices:" << std::endl;
for (size_t i = 0; i < ggml_vk_get_device_count(); i++) { for (int i = 0; i < ggml_vk_get_device_count(); i++) {
ggml_vk_print_gpu_info(i); ggml_vk_print_gpu_info(i);
} }
// Initialize the first backend to make sure CPU matrix multiplications can be offloaded. // Initialize the first backend to make sure CPU matrix multiplications can be offloaded.
@ -4958,7 +5098,7 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b
extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
}
-tensor->backend = GGML_BACKEND_GPU;
+tensor->backend = GGML_BACKEND_TYPE_GPU;
tensor->extra = extra;
}
@ -4966,7 +5106,7 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu
#ifdef GGML_VULKAN_DEBUG
std::cerr << "ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
#endif
-GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
@ -4981,7 +5121,7 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu
#ifdef GGML_VULKAN_DEBUG
std::cerr << "ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
#endif
-GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
@ -4994,7 +5134,6 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu
GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
if (ggml_backend_buffer_is_vk(src->buffer)) {
-ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra;
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
@ -5211,7 +5350,7 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
#endif
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
-GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
@ -5232,7 +5371,7 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
#endif
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
-GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
@ -5309,7 +5448,7 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml
int last_node = cgraph->n_nodes - 1;
// If the last op in the cgraph isn't backend GPU, the command buffer doesn't get closed properly
-while (last_node > 0 && cgraph->nodes[last_node]->backend != GGML_BACKEND_GPU) {
+while (last_node > 0 && cgraph->nodes[last_node]->backend != GGML_BACKEND_TYPE_GPU) {
last_node -= 1;
}
@ -5318,7 +5457,7 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml
}
ggml_compute_params params = {};
-params.type = GGML_TASK_COMPUTE;
+params.type = GGML_TASK_TYPE_COMPUTE;
params.ith = 0;
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
@ -5498,7 +5637,7 @@ GGML_CALL void ggml_backend_vk_get_device_description(int device, char * descrip
}
GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
-GGML_ASSERT(device < vk_instance.device_indices.size());
+GGML_ASSERT(device < (int) vk_instance.device_indices.size());
vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
@ -5532,6 +5671,42 @@ GGML_CALL int ggml_backend_vk_reg_devices() {
return vk_instance.device_indices.size();
}
// Extension availability
static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions) {
#ifdef GGML_VULKAN_VALIDATE
// Check for the validation features extension
for (const auto& properties : instance_extensions) {
if (strcmp("VK_EXT_validation_features", properties.extensionName) == 0) {
return true;
}
}
std::cerr << "ggml_vulkan: WARNING: Instance extension VK_EXT_validation_features not found." << std::endl;
#endif
return false;
UNUSED(instance_extensions);
}
static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions) {
#ifdef __APPLE__
// Check for the portability enumeration extension, needed for MoltenVK support
for (const auto& properties : instance_extensions) {
if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) {
return true;
}
}
std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." << std::endl;
#endif
return false;
UNUSED(instance_extensions);
}
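The two helpers above only report availability; a minimal sketch of how such checks are typically consumed when building the instance (Vulkan-Hpp names; this is a hedged illustration, not this file's actual instance-creation code):

std::vector<vk::ExtensionProperties> instance_extensions = vk::enumerateInstanceExtensionProperties();

std::vector<const char *> extensions;
vk::InstanceCreateFlags flags{};
if (ggml_vk_instance_portability_enumeration_ext_available(instance_extensions)) {
    // MoltenVK only enumerates non-conformant (portability) devices when asked to
    extensions.push_back("VK_KHR_portability_enumeration");
    flags |= vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR;
}

vk::ApplicationInfo app_info("ggml-vulkan", 1, nullptr, 0, VK_API_VERSION_1_2);
vk::InstanceCreateInfo instance_create_info(flags, &app_info, {}, extensions);
vk::Instance instance = vk::createInstance(instance_create_info);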
// checks
#ifdef GGML_VULKAN_CHECK_RESULTS
@ -5588,7 +5763,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tensor * tensor, const char * name) {
void * tensor_data = tensor->data;
-if (tensor->backend == GGML_BACKEND_GPU) {
+if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
const size_t tensor_size = ggml_nbytes(tensor);
tensor_data = malloc(tensor_size);
@ -5615,14 +5790,14 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
std::vector<const ggml_tensor *> done;
ggml_vk_print_graph_origin(tensor, done);
-if (tensor->backend == GGML_BACKEND_GPU) {
+if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
free(tensor_data);
}
}
static void ggml_vk_check_tensor(const std::string& name, const ggml_tensor * tensor) {
return;
-GGML_ASSERT(tensor->backend == GGML_BACKEND_CPU);
+GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_CPU);
if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
return;
}
@ -5660,7 +5835,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
if (params->ith != 0) {
return;
}
-if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
+if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
return;
}
@ -5697,10 +5872,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
src0_buffer = malloc(src0_size);
src0_clone->data = src0_buffer;
-if (src0->backend == GGML_BACKEND_CPU) {
+if (src0->backend == GGML_BACKEND_TYPE_CPU) {
memcpy(src0_clone->data, src0->data, src0_size);
memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
-} else if (src0->backend == GGML_BACKEND_GPU) {
+} else if (src0->backend == GGML_BACKEND_TYPE_GPU) {
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
vk_buffer buf = extra->buffer_gpu.lock();
uint64_t offset = extra->offset;
@ -5741,10 +5916,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
src1_buffer = malloc(src1_size);
src1_clone->data = src1_buffer;
-if (src1->backend == GGML_BACKEND_CPU) {
+if (src1->backend == GGML_BACKEND_TYPE_CPU) {
memcpy(src1_clone->data, src1->data, src1_size);
memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
-} else if (src1->backend == GGML_BACKEND_GPU) {
+} else if (src1->backend == GGML_BACKEND_TYPE_GPU) {
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
vk_buffer buf = extra->buffer_gpu.lock();
uint64_t offset = extra->offset;
@ -5904,7 +6079,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
if (params->ith != 0) {
return;
}
-if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
+if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) {
return;
}
if (!(vk_output_tensor > 0 && vk_output_tensor == check_counter) && check_counter <= vk_skip_checks) {
@ -5916,7 +6091,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
void * tensor_data = tensor->data;
-if (tensor->backend == GGML_BACKEND_GPU) {
+if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
size_t tensor_size = ggml_nbytes(tensor);
tensor_data = malloc(tensor_size);
@ -6050,7 +6225,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
comp_result = nullptr;
comp_size = 0;
-if (tensor->backend == GGML_BACKEND_GPU) {
+if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
free(tensor_data);
}
}


@ -10,6 +10,7 @@ extern "C" {
#define GGML_VK_NAME "Vulkan" #define GGML_VK_NAME "Vulkan"
#define GGML_VK_MAX_DEVICES 16 #define GGML_VK_MAX_DEVICES 16
GGML_API void ggml_vk_instance_init(void);
GGML_API void ggml_vk_init_cpu_assist(void); GGML_API void ggml_vk_init_cpu_assist(void);
GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node); GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node);

ggml.c (2027 changes): file diff suppressed because it is too large.

ggml.h (107 changes):

@ -315,13 +315,7 @@
extern "C" { extern "C" {
#endif #endif
#if defined(__ARM_NEON) && defined(__CUDACC__)
typedef half ggml_fp16_t;
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
typedef __fp16 ggml_fp16_t;
#else
typedef uint16_t ggml_fp16_t; typedef uint16_t ggml_fp16_t;
#endif
// convert FP16 <-> FP32 // convert FP16 <-> FP32
GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x); GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
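With the typedef now unconditional, conversions always go through this API. A minimal round-trip sketch (assumes the companion ggml_fp32_to_fp16() declared alongside it in this header):

#include <stdio.h>
#include "ggml.h"

int main(void) {
    ggml_fp16_t h = ggml_fp32_to_fp16(3.14159f);
    float f = ggml_fp16_to_fp32(h);
    printf("%f\n", f); // ~3.140625: fp16 keeps only 11 significant bits
    return 0;
}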
@ -354,6 +348,9 @@ extern "C" {
GGML_TYPE_IQ2_XXS = 16,
GGML_TYPE_IQ2_XS = 17,
GGML_TYPE_IQ3_XXS = 18,
GGML_TYPE_IQ1_S = 19,
GGML_TYPE_IQ4_NL = 20,
GGML_TYPE_IQ3_S = 21,
GGML_TYPE_I8,
GGML_TYPE_I16,
GGML_TYPE_I32,
@ -367,9 +364,9 @@ extern "C" {
};
enum ggml_backend_type {
-GGML_BACKEND_CPU = 0,
-GGML_BACKEND_GPU = 10,
-GGML_BACKEND_GPU_SPLIT = 20,
+GGML_BACKEND_TYPE_CPU = 0,
+GGML_BACKEND_TYPE_GPU = 10,
+GGML_BACKEND_TYPE_GPU_SPLIT = 20,
};
// model file types
@ -391,6 +388,9 @@ extern "C" {
GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ1_S = 18, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ4_NL = 19, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ3_S = 20, // except 1d tensors
};
// available tensor operations:
@ -498,18 +498,24 @@ extern "C" {
};
enum ggml_object_type {
-GGML_OBJECT_TENSOR,
-GGML_OBJECT_GRAPH,
-GGML_OBJECT_WORK_BUFFER
+GGML_OBJECT_TYPE_TENSOR,
+GGML_OBJECT_TYPE_GRAPH,
+GGML_OBJECT_TYPE_WORK_BUFFER
};
enum ggml_log_level {
GGML_LOG_LEVEL_ERROR = 2,
GGML_LOG_LEVEL_WARN = 3,
GGML_LOG_LEVEL_INFO = 4,
GGML_LOG_LEVEL_DEBUG = 5
};
enum ggml_tensor_flag {
GGML_TENSOR_FLAG_INPUT = 1,
GGML_TENSOR_FLAG_OUTPUT = 2,
GGML_TENSOR_FLAG_PARAM = 4,
};
// ggml object
struct ggml_object {
size_t offs;
@ -543,7 +549,7 @@ extern "C" {
// op params - allocated as int32_t for alignment
int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
-bool is_param;
+int32_t flags;
struct ggml_tensor * grad;
struct ggml_tensor * src[GGML_MAX_SRC];
@ -567,6 +573,11 @@ extern "C" {
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
// Abort callback
// If not NULL, called before ggml computation
// If it returns true, the computation is aborted
typedef bool (*ggml_abort_callback)(void * data);
// the compute plan that needs to be prepared for ggml_graph_compute()
// since https://github.com/ggerganov/ggml/issues/287
struct ggml_cplan {
@ -576,8 +587,8 @@ extern "C" {
int n_threads;
// abort ggml_graph_compute when true
-bool (*abort_callback)(void * data);
+ggml_abort_callback abort_callback;
void * abort_callback_data;
};
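A hedged sketch of the named typedef in use; ggml_graph_plan()/ggml_graph_compute() are the usual pairing from this header, and `graph` plus the stop flag are illustrative names:

static bool my_abort_cb(void * data) {
    return *(bool *) data; // true => abort the current graph computation
}

bool stop = false;
struct ggml_cplan plan = ggml_graph_plan(graph, /*n_threads=*/4);
if (plan.work_size > 0) {
    plan.work_data = (uint8_t *) malloc(plan.work_size);
}
plan.abort_callback      = my_abort_cb;
plan.abort_callback_data = &stop;
ggml_graph_compute(graph, &plan);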
enum ggml_cgraph_eval_order {
@ -631,9 +642,9 @@ extern "C" {
// NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
// This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
enum ggml_task_type {
-GGML_TASK_INIT = 0,
-GGML_TASK_COMPUTE,
-GGML_TASK_FINALIZE,
+GGML_TASK_TYPE_INIT = 0,
+GGML_TASK_TYPE_COMPUTE,
+GGML_TASK_TYPE_FINALIZE,
};
struct ggml_compute_params {
@ -647,6 +658,16 @@ extern "C" {
void * wdata;
};
// numa strategies
enum ggml_numa_strategy {
GGML_NUMA_STRATEGY_DISABLED = 0,
GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
GGML_NUMA_STRATEGY_ISOLATE = 2,
GGML_NUMA_STRATEGY_NUMACTL = 3,
GGML_NUMA_STRATEGY_MIRROR = 4,
GGML_NUMA_STRATEGY_COUNT
};
// misc
GGML_API void ggml_time_init(void); // call this once at the beginning of the program
@ -657,7 +678,7 @@ extern "C" {
GGML_API void ggml_print_backtrace(void);
-GGML_API void ggml_numa_init(void); // call once for better performance on NUMA systems
+GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
GGML_API void ggml_print_object (const struct ggml_object * obj);
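The init call now selects one of the strategies from the enum above instead of taking no argument; for example:

ggml_numa_init(GGML_NUMA_STRATEGY_DISTRIBUTE); // spread threads evenly across NUMA nodes
if (ggml_is_numa()) {
    // more than one NUMA node was detected
}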
@ -1362,13 +1383,17 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a);
-// fused soft_max(a*scale + mask)
+// fused soft_max(a*scale + mask + pos[i]*(ALiBi slope))
// mask is optional
+// pos is required when max_bias > 0.0f
+// max_bias = 0.0f for no ALiBi
GGML_API struct ggml_tensor * ggml_soft_max_ext(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * mask,
-float scale);
+struct ggml_tensor * pos,
+float scale,
+float max_bias);
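A hedged sketch of the widened call; kq, kq_mask, kq_pos and n_embd_head are illustrative names, and passing pos = NULL with max_bias = 0.0f reproduces the old three-argument behavior:

struct ggml_tensor * kq_soft = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos,
        1.0f / sqrtf((float) n_embd_head), /*max_bias=*/8.0f);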
GGML_API struct ggml_tensor * ggml_soft_max_back(
struct ggml_context * ctx,
@ -1470,12 +1495,13 @@ extern "C" {
// alibi position embedding
// in-place, returns view(a)
-GGML_API struct ggml_tensor * ggml_alibi(
+GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_alibi(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past,
int n_head,
-float bias_max);
+float bias_max),
+"use ggml_soft_max_ext instead (will be removed in Mar 2024)");
// clamp
// in-place, returns view(a)
@ -1623,8 +1649,8 @@ extern "C" {
// sort rows
enum ggml_sort_order {
-GGML_SORT_ASC,
-GGML_SORT_DESC,
+GGML_SORT_ORDER_ASC,
+GGML_SORT_ORDER_DESC,
};
GGML_API struct ggml_tensor * ggml_argsort(
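With the prefixed names, a call reads as follows (argument list assumed from the declaration this hunk truncates; `logits` is illustrative):

struct ggml_tensor * idx = ggml_argsort(ctx, logits, GGML_SORT_ORDER_DESC);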
@ -1917,8 +1943,8 @@ extern "C" {
// optimization methods
enum ggml_opt_type {
-GGML_OPT_ADAM,
-GGML_OPT_LBFGS,
+GGML_OPT_TYPE_ADAM,
+GGML_OPT_TYPE_LBFGS,
};
// linesearch methods
@ -1932,12 +1958,12 @@ extern "C" {
// optimization return values
enum ggml_opt_result {
-GGML_OPT_OK = 0,
-GGML_OPT_DID_NOT_CONVERGE,
-GGML_OPT_NO_CONTEXT,
-GGML_OPT_INVALID_WOLFE,
-GGML_OPT_FAIL,
-GGML_OPT_CANCEL,
+GGML_OPT_RESULT_OK = 0,
+GGML_OPT_RESULT_DID_NOT_CONVERGE,
+GGML_OPT_RESULT_NO_CONTEXT,
+GGML_OPT_RESULT_INVALID_WOLFE,
+GGML_OPT_RESULT_FAIL,
+GGML_OPT_RESULT_CANCEL,
GGML_LINESEARCH_FAIL = -128,
GGML_LINESEARCH_MINIMUM_STEP,
@ -2087,6 +2113,12 @@ extern "C" {
ggml_opt_callback callback,
void * callback_data);
//
// tensor flags
//
GGML_API void ggml_set_input(struct ggml_tensor * tensor);
GGML_API void ggml_set_output(struct ggml_tensor * tensor);
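A hedged sketch of the new setters together with the flags field introduced above; the tensor names are illustrative:

struct ggml_tensor * inp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
ggml_set_input(inp);   // records GGML_TENSOR_FLAG_INPUT in inp->flags

struct ggml_tensor * out = ggml_mul_mat(ctx, w, inp);
ggml_set_output(out);  // records GGML_TENSOR_FLAG_OUTPUT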
//
// quantization
//
@ -2273,6 +2305,7 @@ extern "C" {
GGML_API int ggml_cpu_has_ssse3 (void);
GGML_API int ggml_cpu_has_sycl (void);
GGML_API int ggml_cpu_has_vsx (void);
GGML_API int ggml_cpu_has_matmul_int8(void);
//
// Internal types and functions exposed for tests and benchmarks
@ -2286,7 +2319,8 @@ extern "C" {
#endif
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
+typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
+    const void * GGML_RESTRICT y, size_t by, int nrc);
typedef struct {
const char * type_name;
@ -2298,6 +2332,7 @@ extern "C" {
ggml_from_float_t from_float_reference;
ggml_vec_dot_t vec_dot;
enum ggml_type vec_dot_type;
int64_t nrows; // number of rows to process simultaneously;
} ggml_type_traits_t;
GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
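A hedged sketch of a kernel matching the widened ggml_vec_dot_t: nrc asks for several result rows per call (support is advertised via the nrows field of the traits struct); the exact units of the bs/bx/by strides are an assumption here:

static void vec_dot_f32_rows(int n, float * s, size_t bs, const void * vx, size_t bx,
                             const void * vy, size_t by, int nrc) {
    for (int r = 0; r < nrc; ++r) {
        const float * x = (const float *) ((const char *) vx + r*bx); // bx bytes between x rows (assumed)
        const float * y = (const float *) ((const char *) vy + r*by);
        float sum = 0.0f;
        for (int i = 0; i < n; ++i) {
            sum += x[i]*y[i];
        }
        s[r*bs] = sum; // bs taken as an element stride for s (assumed)
    }
}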

reader.py (new file):

@ -0,0 +1,45 @@
#!/usr/bin/env python3
import sys
from pathlib import Path

# Make the in-tree gguf package importable before importing from it.
sys.path.insert(0, str(Path(__file__).parent.parent))

from gguf.gguf_reader import GGUFReader


def read_gguf_file(gguf_file_path):
    """
    Reads and prints key-value pairs and tensor information from a GGUF file in an improved format.

    Parameters:
    - gguf_file_path: Path to the GGUF file.
    """
    reader = GGUFReader(gguf_file_path)

    # List all key-value pairs in a columnized format
    print("Key-Value Pairs:")
    max_key_length = max(len(key) for key in reader.fields.keys())
    for key, field in reader.fields.items():
        value = field.parts[field.data[0]]
        print(f"{key:{max_key_length}} : {value}")
    print("----")

    # List all tensors
    print("Tensors:")
    tensor_info_format = "{:<30} | Shape: {:<15} | Size: {:<12} | Quantization: {}"
    print(tensor_info_format.format("Tensor Name", "Shape", "Size", "Quantization"))
    print("-" * 80)
    for tensor in reader.tensors:
        shape_str = "x".join(map(str, tensor.shape))
        size_str = str(tensor.n_elements)
        quantization_str = tensor.tensor_type.name
        print(tensor_info_format.format(tensor.name, shape_str, size_str, quantization_str))


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print("Usage: reader.py <path_to_gguf_file>")
        sys.exit(1)
    gguf_file_path = sys.argv[1]
    read_gguf_file(gguf_file_path)


@ -40,6 +40,7 @@ class Keys:
TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
EXPERT_COUNT = "{arch}.expert_count"
EXPERT_USED_COUNT = "{arch}.expert_used_count"
POOLING_TYPE = "{arch}.pooling_type"
class Attention:
HEAD_COUNT = "{arch}.attention.head_count"
@ -50,6 +51,7 @@ class Keys:
VALUE_LENGTH = "{arch}.attention.value_length"
LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon"
LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
CAUSAL = "{arch}.attention.causal"
class Rope:
DIMENSION_COUNT = "{arch}.rope.dimension_count"
@ -60,22 +62,25 @@ class Keys:
SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"
class Tokenizer:
MODEL = "tokenizer.ggml.model"
LIST = "tokenizer.ggml.tokens"
TOKEN_TYPE = "tokenizer.ggml.token_type"
+TOKEN_TYPE_COUNT = "tokenizer.ggml.token_type_count"  # for BERT-style token types
SCORES = "tokenizer.ggml.scores"
MERGES = "tokenizer.ggml.merges"
BOS_ID = "tokenizer.ggml.bos_token_id"
EOS_ID = "tokenizer.ggml.eos_token_id"
UNK_ID = "tokenizer.ggml.unknown_token_id"
SEP_ID = "tokenizer.ggml.seperator_token_id"
PAD_ID = "tokenizer.ggml.padding_token_id"
+CLS_ID = "tokenizer.ggml.cls_token_id"
+MASK_ID = "tokenizer.ggml.mask_token_id"
ADD_BOS = "tokenizer.ggml.add_bos_token"
ADD_EOS = "tokenizer.ggml.add_eos_token"
ADD_PREFIX = "tokenizer.ggml.add_space_prefix"
HF_JSON = "tokenizer.huggingface.json"
RWKV = "tokenizer.rwkv.world"
CHAT_TEMPLATE = "tokenizer.chat_template"
#
@ -84,27 +89,29 @@ class Keys:
class MODEL_ARCH(IntEnum):
LLAMA = auto()
FALCON = auto()
BAICHUAN = auto()
GPT2 = auto()
GPTJ = auto()
GPTNEOX = auto()
MPT = auto()
STARCODER = auto()
PERSIMMON = auto()
REFACT = auto()
BERT = auto()
+NOMIC_BERT = auto()
BLOOM = auto()
STABLELM = auto()
QWEN = auto()
QWEN2 = auto()
PHI2 = auto()
PLAMO = auto()
CODESHELL = auto()
ORION = auto()
INTERNLM2 = auto()
MINICPM = auto()
+GEMMA = auto()
class MODEL_TENSOR(IntEnum):
@ -122,6 +129,7 @@ class MODEL_TENSOR(IntEnum):
ATTN_OUT = auto()
ATTN_NORM = auto()
ATTN_NORM_2 = auto()
ATTN_OUT_NORM = auto()
ATTN_ROT_EMBD = auto()
FFN_GATE_INP = auto()
FFN_NORM = auto()
@ -134,6 +142,7 @@ class MODEL_TENSOR(IntEnum):
FFN_UP_EXP = auto()
ATTN_Q_NORM = auto()
ATTN_K_NORM = auto()
LAYER_OUT_NORM = auto()
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@ -148,6 +157,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.PERSIMMON: "persimmon", MODEL_ARCH.PERSIMMON: "persimmon",
MODEL_ARCH.REFACT: "refact", MODEL_ARCH.REFACT: "refact",
MODEL_ARCH.BERT: "bert", MODEL_ARCH.BERT: "bert",
MODEL_ARCH.NOMIC_BERT: "nomic-bert",
MODEL_ARCH.BLOOM: "bloom", MODEL_ARCH.BLOOM: "bloom",
MODEL_ARCH.STABLELM: "stablelm", MODEL_ARCH.STABLELM: "stablelm",
MODEL_ARCH.QWEN: "qwen", MODEL_ARCH.QWEN: "qwen",
@ -158,6 +168,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.ORION: "orion", MODEL_ARCH.ORION: "orion",
MODEL_ARCH.INTERNLM2: "internlm2", MODEL_ARCH.INTERNLM2: "internlm2",
MODEL_ARCH.MINICPM: "minicpm", MODEL_ARCH.MINICPM: "minicpm",
MODEL_ARCH.GEMMA: "gemma",
} }
TENSOR_NAMES: dict[MODEL_TENSOR, str] = { TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@ -178,6 +189,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm",
MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
@ -187,6 +199,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate.{xid}",
MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down.{xid}",
MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up.{xid}",
MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
}
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@ -262,17 +275,32 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
],
MODEL_ARCH.BERT: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.TOKEN_EMBD_NORM,
MODEL_TENSOR.TOKEN_TYPES,
MODEL_TENSOR.POS_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
-MODEL_TENSOR.ATTN_NORM,
+MODEL_TENSOR.ATTN_OUT_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
-MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.LAYER_OUT_NORM,
],
MODEL_ARCH.NOMIC_BERT: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.TOKEN_EMBD_NORM,
MODEL_TENSOR.TOKEN_TYPES,
MODEL_TENSOR.POS_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.ATTN_OUT_NORM,
MODEL_TENSOR.ATTN_QKV,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.LAYER_OUT_NORM,
],
MODEL_ARCH.MPT: [
MODEL_TENSOR.TOKEN_EMBD,
@ -485,6 +513,19 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
],
MODEL_ARCH.GEMMA: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.FFN_NORM,
],
# TODO
}
@ -535,6 +576,12 @@ class RopeScalingType(Enum):
YARN = 'yarn'
class PoolingType(IntEnum):
NONE = 0
MEAN = 1
CLS = 2
class GGMLQuantizationType(IntEnum):
F32 = 0
F16 = 1
@ -661,5 +708,7 @@ KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID
KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID
KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID
KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID
KEY_TOKENIZER_CLS_ID = Keys.Tokenizer.CLS_ID
KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID
KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON
KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV

Some files were not shown because too many files have changed in this diff.