Merge branch 'gg/flash-attn' of https://github.com/ggerganov/llama.cpp into flash-attn-cuda

This commit is contained in:
FSSRepo 2024-01-29 13:17:39 -05:00
commit 7980178a17
94 changed files with 90340 additions and 2283 deletions

View file

@ -0,0 +1,26 @@
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
ARG UBUNTU_VERSION=22.04
FROM intel/hpckit:$ONEAPI_VERSION as build
RUN apt-get update && \
apt-get install -y git
WORKDIR /app
COPY . .
# for some reason, "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DLLAMA_NATIVE=ON" gives worse performance
RUN mkdir build && \
cd build && \
cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx && \
cmake --build . --config Release --target main server
FROM ubuntu:$UBUNTU_VERSION as runtime
COPY --from=build /app/build/bin/main /main
COPY --from=build /app/build/bin/server /server
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/main" ]

View file

@ -7,6 +7,18 @@
{ system, ... }: { system, ... }:
{ {
_module.args = { _module.args = {
# Note: bringing up https://zimbatm.com/notes/1000-instances-of-nixpkgs
# again, the below creates several nixpkgs instances which the
# flake-centric CLI will be forced to evaluate e.g. on `nix flake show`.
#
# This is currently "slow" and "expensive", on a certain scale.
# This also isn't "right" in that this hinders dependency injection at
# the level of flake inputs. This might get removed in the foreseeable
# future.
#
# Note that you can use these expressions without Nix
# (`pkgs.callPackage ./devops/nix/scope.nix { }` is the entry point).
pkgsCuda = import inputs.nixpkgs { pkgsCuda = import inputs.nixpkgs {
inherit system; inherit system;
# Ensure dependencies use CUDA consistently (e.g. that openmpi, ucc, # Ensure dependencies use CUDA consistently (e.g. that openmpi, ucc,

View file

@ -73,6 +73,7 @@ let
ps: [ ps: [
ps.numpy ps.numpy
ps.sentencepiece ps.sentencepiece
ps.tiktoken
ps.torchWithoutCuda ps.torchWithoutCuda
ps.transformers ps.transformers
] ]
@ -114,14 +115,22 @@ effectiveStdenv.mkDerivation (
pname = "llama-cpp${pnameSuffix}"; pname = "llama-cpp${pnameSuffix}";
version = llamaVersion; version = llamaVersion;
# Note: none of the files discarded here are visible in the sandbox or
# affect the output hash. This also means they can be modified without
# triggering a rebuild.
src = lib.cleanSourceWith { src = lib.cleanSourceWith {
filter = filter =
name: type: name: type:
!(builtins.any (_: _) [ let
noneOf = builtins.all (x: !x);
baseName = baseNameOf name;
in
noneOf [
(lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths (lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
(name == "README.md") # Ignore *.md changes when computing outPaths (lib.hasSuffix ".md" name) # Ignore *.md changes when computing outPaths
(lib.hasPrefix "." name) # Skip hidden files and directories (lib.hasPrefix "." baseName) # Skip hidden files and directories
]); (baseName == "flake.lock")
];
src = lib.cleanSource ../../.; src = lib.cleanSource ../../.;
}; };
@ -159,7 +168,7 @@ effectiveStdenv.mkDerivation (
cmakeFlags = cmakeFlags =
[ [
(cmakeBool "LLAMA_NATIVE" true) (cmakeBool "LLAMA_NATIVE" false)
(cmakeBool "LLAMA_BUILD_SERVER" true) (cmakeBool "LLAMA_BUILD_SERVER" true)
(cmakeBool "BUILD_SHARED_LIBS" true) (cmakeBool "BUILD_SHARED_LIBS" true)
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true) (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
@ -216,6 +225,9 @@ effectiveStdenv.mkDerivation (
description = "contains numpy and sentencepiece"; description = "contains numpy and sentencepiece";
buildInputs = [ llama-python ]; buildInputs = [ llama-python ];
inputsFrom = [ finalAttrs.finalPackage ]; inputsFrom = [ finalAttrs.finalPackage ];
shellHook = ''
addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib effectiveStdenv.cc.cc}/lib"
'';
}; };
shell-extra = mkShell { shell-extra = mkShell {

View file

@ -4,6 +4,10 @@
llamaVersion ? "0.0.0", llamaVersion ? "0.0.0",
}: }:
# We're using `makeScope` instead of just writing out an attrset
# because it allows users to apply overlays later using `overrideScope'`.
# Cf. https://noogle.dev/f/lib/makeScope
lib.makeScope newScope ( lib.makeScope newScope (
self: { self: {
inherit llamaVersion; inherit llamaVersion;

View file

@ -0,0 +1,32 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the CUDA runtime image
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
FROM ${BASE_CUDA_DEV_CONTAINER} as build
# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all
RUN apt-get update && \
apt-get install -y build-essential git
WORKDIR /app
COPY . .
# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS
ENV LLAMA_CUBLAS=1
RUN make
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
COPY --from=build /app/server /server
ENTRYPOINT [ "/server" ]

View file

@ -0,0 +1,25 @@
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
ARG UBUNTU_VERSION=22.04
FROM intel/hpckit:$ONEAPI_VERSION as build
RUN apt-get update && \
apt-get install -y git
WORKDIR /app
COPY . .
# for some reason, "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DLLAMA_NATIVE=ON" gives worse performance
RUN mkdir build && \
cd build && \
cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx && \
cmake --build . --config Release --target main server
FROM ubuntu:$UBUNTU_VERSION as runtime
COPY --from=build /app/build/bin/server /server
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/server" ]

View file

@ -0,0 +1,45 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6
# Target the ROCm build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
FROM ${BASE_ROCM_DEV_CONTAINER} as build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH=\
gfx803 \
gfx900 \
gfx906 \
gfx908 \
gfx90a \
gfx1010 \
gfx1030 \
gfx1100 \
gfx1101 \
gfx1102
COPY requirements.txt requirements.txt
COPY requirements requirements
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
WORKDIR /app
COPY . .
# Set ROCm GPU architecture targets
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV LLAMA_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++
RUN make
ENTRYPOINT [ "/app/server" ]

20
.devops/server.Dockerfile Normal file
View file

@ -0,0 +1,20 @@
ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION as build
RUN apt-get update && \
apt-get install -y build-essential git
WORKDIR /app
COPY . .
RUN make
FROM ubuntu:$UBUNTU_VERSION as runtime
COPY --from=build /app/server /server
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/server" ]

View file

@ -72,7 +72,7 @@ jobs:
id: cmake_test id: cmake_test
run: | run: |
cd build cd build
ctest --verbose --timeout 900 ctest -L main --verbose --timeout 900
ubuntu-latest-cmake-sanitizer: ubuntu-latest-cmake-sanitizer:
runs-on: ubuntu-latest runs-on: ubuntu-latest
@ -107,7 +107,7 @@ jobs:
id: cmake_test id: cmake_test
run: | run: |
cd build cd build
ctest --verbose --timeout 900 ctest -L main --verbose --timeout 900
ubuntu-latest-cmake-mpi: ubuntu-latest-cmake-mpi:
runs-on: ubuntu-latest runs-on: ubuntu-latest
@ -141,7 +141,48 @@ jobs:
id: cmake_test id: cmake_test
run: | run: |
cd build cd build
ctest --verbose ctest -L main --verbose
ubuntu-22-cmake-sycl:
runs-on: ubuntu-22.04
continue-on-error: true
steps:
- uses: actions/checkout@v2
- name: add oneAPI to apt
shell: bash
run: |
cd /tmp
wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
- name: install oneAPI dpcpp compiler
shell: bash
run: |
sudo apt update
sudo apt install intel-oneapi-compiler-dpcpp-cpp
- name: install oneAPI MKL library
shell: bash
run: |
sudo apt install intel-oneapi-mkl-devel
- name: Clone
id: checkout
uses: actions/checkout@v3
- name: Build
id: cmake_build
run: |
source /opt/intel/oneapi/setvars.sh
mkdir build
cd build
cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
cmake --build . --config Release -j $(nproc)
# TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know # TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
# how to debug it. # how to debug it.
@ -202,7 +243,7 @@ jobs:
id: cmake_test id: cmake_test
run: | run: |
cd build cd build
ctest --verbose --timeout 900 ctest -L main --verbose --timeout 900
macOS-latest-cmake-ios: macOS-latest-cmake-ios:
runs-on: macos-latest runs-on: macos-latest
@ -295,7 +336,7 @@ jobs:
OPENBLAS_VERSION: 0.3.23 OPENBLAS_VERSION: 0.3.23
OPENCL_VERSION: 2023.04.17 OPENCL_VERSION: 2023.04.17
CLBLAST_VERSION: 1.6.0 CLBLAST_VERSION: 1.6.0
SDE_VERSION: 9.21.1-2023-04-24 SDE_VERSION: 9.33.0-2024-01-07
strategy: strategy:
matrix: matrix:
@ -394,19 +435,19 @@ jobs:
if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # not all machines have native AVX-512 if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # not all machines have native AVX-512
run: | run: |
cd build cd build
ctest -C Release --verbose --timeout 900 ctest -L main -C Release --verbose --timeout 900
- name: Test (Intel SDE) - name: Test (Intel SDE)
id: cmake_test_sde id: cmake_test_sde
if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
run: | run: |
curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/777395/sde-external-${env:SDE_VERSION}-win.tar.xz" curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
# for some weird reason windows tar doesn't like sde tar.xz # for some weird reason windows tar doesn't like sde tar.xz
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
$sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe) $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
cd build cd build
& $sde -future -- ctest -C Release --verbose --timeout 900 & $sde -future -- ctest -L main -C Release --verbose --timeout 900
- name: Determine tag name - name: Determine tag name
id: tag id: tag

View file

@ -28,13 +28,18 @@ jobs:
config: config:
- { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" } - { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" } - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "server", dockerfile: ".devops/server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
# NOTE(canardletter): The CUDA builds on arm64 are very slow, so I # NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
# have disabled them for now until the reason why # have disabled them for now until the reason why
# is understood. # is understood.
- { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" } - { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" } - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "server-cuda", dockerfile: ".devops/server-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } - { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "server-rocm", dockerfile: ".devops/server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
- { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
steps: steps:
- name: Check out the repo - name: Check out the repo
uses: actions/checkout@v3 uses: actions/checkout@v3

View file

@ -2,13 +2,20 @@ name: Nix aarch64 builds
on: on:
workflow_dispatch: # allows manual triggering workflow_dispatch: # allows manual triggering
schedule:
# Rebuild daily rather than on every push because QEMU is expensive (e.g.
# 1.5h instead of minutes with the cold cache).
#
# randint(0, 59), randint(0, 23)
- cron: '26 12 * * *'
# But also rebuild if we touched any of the Nix expressions:
push: push:
branches: branches:
- master - master
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix'] paths: ['**/*.nix', 'flake.lock']
pull_request: pull_request:
types: [opened, synchronize, reopened] types: [opened, synchronize, reopened]
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix'] paths: ['**/*.nix', 'flake.lock']
jobs: jobs:
nix-build-aarch64: nix-build-aarch64:

View file

@ -5,10 +5,8 @@ on:
push: push:
branches: branches:
- master - master
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
pull_request: pull_request:
types: [opened, synchronize, reopened] types: [opened, synchronize, reopened]
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
jobs: jobs:
nix-eval: nix-eval:

19
.gitignore vendored
View file

@ -27,7 +27,7 @@
lcov-report/ lcov-report/
gcovr-report/ gcovr-report/
build*/ build*
out/ out/
tmp/ tmp/
@ -89,20 +89,3 @@ examples/jeopardy/results.txt
poetry.lock poetry.lock
poetry.toml poetry.toml
# Test binaries
/tests/test-grammar-parser
/tests/test-llama-grammar
/tests/test-double-float
/tests/test-grad0
/tests/test-opt
/tests/test-quantize-fns
/tests/test-quantize-perf
/tests/test-sampling
/tests/test-tokenizer-0-llama
/tests/test-tokenizer-0-falcon
/tests/test-tokenizer-1-llama
/tests/test-tokenizer-1-bpe
/tests/test-rope
/tests/test-backend-ops
/tests/test-autorelease

View file

@ -1,5 +1,6 @@
cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories. cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
project("llama.cpp" C CXX) project("llama.cpp" C CXX)
include(CheckIncludeFileCXX)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@ -47,6 +48,7 @@ option(BUILD_SHARED_LIBS "build shared libraries"
option(LLAMA_STATIC "llama: static link libraries" OFF) option(LLAMA_STATIC "llama: static link libraries" OFF)
option(LLAMA_NATIVE "llama: enable -march=native flag" ON) option(LLAMA_NATIVE "llama: enable -march=native flag" ON)
option(LLAMA_LTO "llama: enable link time optimization" OFF) option(LLAMA_LTO "llama: enable link time optimization" OFF)
option(LLAMA_CCACHE "llama: use ccache if available" ON)
# debug # debug
option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON) option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON)
@ -97,24 +99,38 @@ set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF) option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF) option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
option(LLAMA_CLBLAST "llama: use CLBlast" OFF) option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
option(LLAMA_VULKAN "llama: use Vulkan" OFF)
option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT}) option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF) option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF) option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF)
option(LLAMA_MPI "llama: use MPI" OFF) option(LLAMA_MPI "llama: use MPI" OFF)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF) option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
option(LLAMA_SYCL "llama: use SYCL" OFF)
option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_SERVER "llama: build server example" ON) option(LLAMA_BUILD_SERVER "llama: build server example" ON)
# add perf arguments
option(LLAMA_PERF "llama: enable perf" OFF)
if (LLAMA_PERF)
add_definitions(-DGGML_PERF)
endif()
# Required for relocatable CMake package # Required for relocatable CMake package
include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake) include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
# #
# Compile flags # Compile flags
# #
if (LLAMA_SYCL)
set(CMAKE_CXX_STANDARD 17)
else()
set(CMAKE_CXX_STANDARD 11) set(CMAKE_CXX_STANDARD 11)
endif()
set(CMAKE_CXX_STANDARD_REQUIRED true) set(CMAKE_CXX_STANDARD_REQUIRED true)
set(CMAKE_C_STANDARD 11) set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED true) set(CMAKE_C_STANDARD_REQUIRED true)
@ -401,6 +417,22 @@ if (LLAMA_CLBLAST)
endif() endif()
endif() endif()
if (LLAMA_VULKAN)
find_package(Vulkan)
if (Vulkan_FOUND)
message(STATUS "Vulkan found")
add_library(ggml-vulkan STATIC ggml-vulkan.cpp ggml-vulkan.h)
target_link_libraries(ggml-vulkan PRIVATE Vulkan::Vulkan)
add_compile_definitions(GGML_USE_VULKAN)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ggml-vulkan)
else()
message(WARNING "Vulkan not found")
endif()
endif()
if (LLAMA_HIPBLAS) if (LLAMA_HIPBLAS)
list(APPEND CMAKE_PREFIX_PATH /opt/rocm) list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
@ -446,6 +478,32 @@ if (LLAMA_HIPBLAS)
endif() endif()
endif() endif()
if (LLAMA_SYCL)
if ( NOT DEFINED ENV{ONEAPI_ROOT})
message(FATAL_ERROR "ONEAPI_ROOT is not set, please install oneAPI and source it, e.g.: source /opt/intel/oneapi/setvars.sh")
endif()
#todo: AOT
find_package(IntelSYCL REQUIRED)
if (LLAMA_SYCL_F16)
add_compile_definitions(GGML_SYCL_F16)
endif()
add_compile_definitions(GGML_USE_SYCL)
add_compile_options(-I./) #include DPCT
add_compile_options(-I/${SYCL_INCLUDE_DIR})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
set(GGML_HEADERS_SYCL ggml.h ggml-sycl.h)
set(GGML_SOURCES_SYCL ggml-sycl.cpp)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
endif()
function(get_flags CCID CCVER) function(get_flags CCID CCVER)
set(C_FLAGS "") set(C_FLAGS "")
set(CXX_FLAGS "") set(CXX_FLAGS "")
@ -458,17 +516,24 @@ function(get_flags CCID CCVER)
(CCID STREQUAL "Clang" AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR (CCID STREQUAL "Clang" AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
(CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0) (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
) )
set(C_FLAGS ${C_FLAGS} -Wdouble-promotion) list(APPEND C_FLAGS -Wdouble-promotion)
endif() endif()
elseif (CCID STREQUAL "GNU") elseif (CCID STREQUAL "GNU")
set(C_FLAGS -Wdouble-promotion) set(C_FLAGS -Wdouble-promotion)
set(CXX_FLAGS -Wno-array-bounds) set(CXX_FLAGS -Wno-array-bounds)
if (CCVER VERSION_GREATER_EQUAL 7.1.0) if (CCVER VERSION_GREATER_EQUAL 7.1.0)
set(CXX_FLAGS ${CXX_FLAGS} -Wno-format-truncation) list(APPEND CXX_FLAGS -Wno-format-truncation)
endif() endif()
if (CCVER VERSION_GREATER_EQUAL 8.1.0) if (CCVER VERSION_GREATER_EQUAL 8.1.0)
set(CXX_FLAGS ${CXX_FLAGS} -Wextra-semi) list(APPEND CXX_FLAGS -Wextra-semi)
endif()
elseif (CCID MATCHES "Intel")
if (NOT LLAMA_SYCL)
# enable max optimization level when using Intel compiler
set(C_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
set(CXX_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
add_link_options(-fuse-ld=lld -static-intel)
endif() endif()
endif() endif()
@ -497,16 +562,18 @@ if (LLAMA_ALL_WARNINGS)
endif() endif()
endif() endif()
set(CUDA_CXX_FLAGS "")
if (LLAMA_CUBLAS) if (LLAMA_CUBLAS)
set(CUDA_FLAGS ${CXX_FLAGS} -use_fast_math) set(CUDA_FLAGS ${CXX_FLAGS} -use_fast_math)
if (NOT MSVC) if (NOT MSVC)
set(CUDA_FLAGS ${CUDA_FLAGS} -Wno-pedantic) list(APPEND CUDA_FLAGS -Wno-pedantic)
endif() endif()
if (LLAMA_ALL_WARNINGS AND NOT MSVC) if (LLAMA_ALL_WARNINGS AND NOT MSVC)
set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c) set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c)
if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "") if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "")
set(NVCC_CMD ${NVCC_CMD} -ccbin ${CMAKE_CUDA_HOST_COMPILER}) list(APPEND NVCC_CMD -ccbin ${CMAKE_CUDA_HOST_COMPILER})
endif() endif()
execute_process( execute_process(
@ -534,15 +601,10 @@ if (LLAMA_CUBLAS)
message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}") message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
get_flags(${CUDA_CCID} ${CUDA_CCVER}) get_flags(${CUDA_CCID} ${CUDA_CCVER})
list(JOIN GF_CXX_FLAGS " " CUDA_CXX_FLAGS) # pass host compiler flags as a single argument list(APPEND CUDA_CXX_FLAGS ${GF_CXX_FLAGS}) # This is passed to -Xcompiler later
if (NOT CUDA_CXX_FLAGS STREQUAL "")
set(CUDA_FLAGS ${CUDA_FLAGS} -Xcompiler ${CUDA_CXX_FLAGS})
endif() endif()
endif() endif()
add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
endif()
if (WIN32) if (WIN32)
add_compile_definitions(_CRT_SECURE_NO_WARNINGS) add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
@ -561,6 +623,17 @@ if (LLAMA_LTO)
endif() endif()
endif() endif()
if (LLAMA_CCACHE)
find_program(LLAMA_CCACHE_FOUND ccache)
if (LLAMA_CCACHE_FOUND)
set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache)
set(ENV{CCACHE_SLOPPINESS} time_macros)
message(STATUS "Using ccache")
else()
message(STATUS "Warning: ccache not found - consider installing it or use LLAMA_CCACHE=OFF")
endif ()
endif()
# this version of Apple ld64 is buggy # this version of Apple ld64 is buggy
execute_process( execute_process(
COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v
@ -594,12 +667,7 @@ if (NOT MSVC)
endif() endif()
endif() endif()
function(add_compile_option_cpp ARG) set(ARCH_FLAGS "")
# Adds a compile option to C/C++ only, but not for Cuda.
# Use, e.g., for CPU-architecture flags.
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:${ARG}>)
add_compile_options($<$<COMPILE_LANGUAGE:C>:${ARG}>)
endfunction()
if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64")) if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64"))
message(STATUS "ARM detected") message(STATUS "ARM detected")
@ -612,19 +680,19 @@ if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATC
else() else()
check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E) check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "") if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
add_compile_options(-mfp16-format=ieee) list(APPEND ARCH_FLAGS -mfp16-format=ieee)
endif() endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6") if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
# Raspberry Pi 1, Zero # Raspberry Pi 1, Zero
add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access) list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
endif() endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7") if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
# Raspberry Pi 2 # Raspberry Pi 2
add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations) list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
endif() endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8") if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
# Raspberry Pi 3, 4, Zero 2 (32-bit) # Raspberry Pi 3, 4, Zero 2 (32-bit)
add_compile_options(-mno-unaligned-access) list(APPEND ARCH_FLAGS -mno-unaligned-access)
endif() endif()
endif() endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "^(x86_64|i686|amd64|x64)$" ) elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "^(x86_64|i686|amd64|x64)$" )
@ -635,7 +703,7 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE
include(cmake/FindSIMD.cmake) include(cmake/FindSIMD.cmake)
endif () endif ()
if (LLAMA_AVX512) if (LLAMA_AVX512)
add_compile_option_cpp(/arch:AVX512) list(APPEND ARCH_FLAGS /arch:AVX512)
# MSVC has no compile-time flags enabling specific # MSVC has no compile-time flags enabling specific
# AVX512 extensions, neither it defines the # AVX512 extensions, neither it defines the
# macros corresponding to the extensions. # macros corresponding to the extensions.
@ -649,49 +717,61 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>) add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
endif() endif()
elseif (LLAMA_AVX2) elseif (LLAMA_AVX2)
add_compile_option_cpp(/arch:AVX2) list(APPEND ARCH_FLAGS /arch:AVX2)
elseif (LLAMA_AVX) elseif (LLAMA_AVX)
add_compile_option_cpp(/arch:AVX) list(APPEND ARCH_FLAGS /arch:AVX)
endif() endif()
else() else()
if (LLAMA_NATIVE) if (LLAMA_NATIVE)
add_compile_option_cpp(-march=native) list(APPEND ARCH_FLAGS -march=native)
endif() endif()
if (LLAMA_F16C) if (LLAMA_F16C)
add_compile_option_cpp(-mf16c) list(APPEND ARCH_FLAGS -mf16c)
endif() endif()
if (LLAMA_FMA) if (LLAMA_FMA)
add_compile_option_cpp(-mfma) list(APPEND ARCH_FLAGS -mfma)
endif() endif()
if (LLAMA_AVX) if (LLAMA_AVX)
add_compile_option_cpp(-mavx) list(APPEND ARCH_FLAGS -mavx)
endif() endif()
if (LLAMA_AVX2) if (LLAMA_AVX2)
add_compile_option_cpp(-mavx2) list(APPEND ARCH_FLAGS -mavx2)
endif() endif()
if (LLAMA_AVX512) if (LLAMA_AVX512)
add_compile_option_cpp(-mavx512f) list(APPEND ARCH_FLAGS -mavx512f)
add_compile_option_cpp(-mavx512bw) list(APPEND ARCH_FLAGS -mavx512bw)
endif() endif()
if (LLAMA_AVX512_VBMI) if (LLAMA_AVX512_VBMI)
add_compile_option_cpp(-mavx512vbmi) list(APPEND ARCH_FLAGS -mavx512vbmi)
endif() endif()
if (LLAMA_AVX512_VNNI) if (LLAMA_AVX512_VNNI)
add_compile_option_cpp(-mavx512vnni) list(APPEND ARCH_FLAGS -mavx512vnni)
endif() endif()
endif() endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64") elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
message(STATUS "PowerPC detected") message(STATUS "PowerPC detected")
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le") if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
add_compile_options(-mcpu=powerpc64le) list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
else() else()
add_compile_options(-mcpu=native -mtune=native) list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
#TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be) #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
endif() endif()
else() else()
message(STATUS "Unknown architecture") message(STATUS "Unknown architecture")
endif() endif()
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
add_compile_options("$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
if (LLAMA_CUBLAS)
list(APPEND CUDA_CXX_FLAGS ${ARCH_FLAGS})
list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED) # pass host compiler flags as a single argument
if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
list(APPEND CUDA_FLAGS -Xcompiler ${CUDA_CXX_FLAGS_JOINED})
endif()
add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
endif()
if (MINGW) if (MINGW)
# Target Windows 8 for PrefetchVirtualMemory # Target Windows 8 for PrefetchVirtualMemory
add_compile_definitions(_WIN32_WINNT=${LLAMA_WIN_VER}) add_compile_definitions(_WIN32_WINNT=${LLAMA_WIN_VER})
@ -771,6 +851,7 @@ add_library(ggml OBJECT
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL} ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI} ${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA} ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
) )
target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES}) target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})

View file

@ -9,7 +9,7 @@ TEST_TARGETS = \
tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \ tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \ tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \ tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
tests/test-backend-ops tests/test-autorelease tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease
# Code coverage output files # Code coverage output files
COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@ -448,6 +448,19 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
$(CXX) $(CXXFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
endif # LLAMA_CLBLAST endif # LLAMA_CLBLAST
ifdef LLAMA_VULKAN
MK_CPPFLAGS += -DGGML_USE_VULKAN
MK_LDFLAGS += -lvulkan
OBJS += ggml-vulkan.o
ifdef LLAMA_VULKAN_CHECK_RESULTS
MK_CPPFLAGS += -DGGML_VULKAN_CHECK_RESULTS
endif
ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
$(CXX) $(CXXFLAGS) -c $< -o $@
endif # LLAMA_VULKAN
ifdef LLAMA_HIPBLAS ifdef LLAMA_HIPBLAS
ifeq ($(wildcard /opt/rocm),) ifeq ($(wildcard /opt/rocm),)
@ -619,7 +632,7 @@ embedding: examples/embedding/embedding.cpp ggml.o llama.o $(C
save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual
gguf: examples/gguf/gguf.cpp ggml.o $(OBJS) gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
@ -748,5 +761,8 @@ tests/test-c.o: tests/test-c.c llama.h
tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS) tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

View file

@ -10,11 +10,11 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
### Hot topics ### Hot topics
- ⚠️ Incoming backends: https://github.com/ggerganov/llama.cpp/discussions/5138
- New SOTA quantized models, including pure 2-bits: https://huggingface.co/ikawrakow - New SOTA quantized models, including pure 2-bits: https://huggingface.co/ikawrakow
- Collecting Apple Silicon performance stats: - Collecting Apple Silicon performance stats:
- M-series: https://github.com/ggerganov/llama.cpp/discussions/4167 - M-series: https://github.com/ggerganov/llama.cpp/discussions/4167
- A-series: https://github.com/ggerganov/llama.cpp/discussions/4508 - A-series: https://github.com/ggerganov/llama.cpp/discussions/4508
- Added Mixtral support: https://github.com/ggerganov/llama.cpp/pull/4406
- Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216 - Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216
---- ----
@ -63,7 +63,7 @@ The main goal of `llama.cpp` is to run the LLaMA model using 4-bit integer quant
- AVX, AVX2 and AVX512 support for x86 architectures - AVX, AVX2 and AVX512 support for x86 architectures
- Mixed F16 / F32 precision - Mixed F16 / F32 precision
- 2-bit, 3-bit, 4-bit, 5-bit, 6-bit and 8-bit integer quantization support - 2-bit, 3-bit, 4-bit, 5-bit, 6-bit and 8-bit integer quantization support
- CUDA, Metal and OpenCL GPU backend support - CUDA, Metal, OpenCL, SYCL GPU backend support
The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022). The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022).
Since then, the project has improved significantly thanks to many contributions. This project is mainly for educational purposes and serves Since then, the project has improved significantly thanks to many contributions. This project is mainly for educational purposes and serves
@ -112,6 +112,7 @@ as the main playground for developing new features for the [ggml](https://github
- [x] [Bakllava](https://huggingface.co/models?search=SkunkworksAI/Bakllava) - [x] [Bakllava](https://huggingface.co/models?search=SkunkworksAI/Bakllava)
- [x] [Obsidian](https://huggingface.co/NousResearch/Obsidian-3B-V0.5) - [x] [Obsidian](https://huggingface.co/NousResearch/Obsidian-3B-V0.5)
- [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V) - [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V)
- [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
**Bindings:** **Bindings:**
@ -121,13 +122,15 @@ as the main playground for developing new features for the [ggml](https://github
- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp) - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
- JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp) - JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp)
- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb) - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
- Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp) - Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
- Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
- C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp) - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
- Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s) - Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
- Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj) - Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
- React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn) - React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
- Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp) - Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
- Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig) - Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
- Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
**UI:** **UI:**
@ -596,6 +599,15 @@ Building the program with BLAS support may lead to some performance improvements
You can get a list of platforms and devices from the `clinfo -l` command, etc. You can get a list of platforms and devices from the `clinfo -l` command, etc.
- #### SYCL
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
The SYCL backend in llama.cpp supports Intel GPUs (Data Center Max series, Flex series, Arc series, built-in GPUs and iGPUs).
For detailed info, please refer to [llama.cpp for SYCL](README_sycl.md).
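As a quick orientation, a minimal configure-and-build sketch (mirroring the commands in the linked guide; it assumes oneAPI is installed under `/opt/intel/oneapi`):
```bash
# enable the oneAPI environment, then configure the SYCL backend with icx/icpx
source /opt/intel/oneapi/setvars.sh
mkdir -p build && cd build
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
cmake --build . --config Release
```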
### Prepare Data & Run ### Prepare Data & Run
```bash ```bash
@ -929,17 +941,20 @@ Place your desired model into the `~/llama.cpp/models/` directory and execute th
* Create a folder to store big models & intermediate files (ex. /llama/models) * Create a folder to store big models & intermediate files (ex. /llama/models)
#### Images #### Images
We have two Docker images available for this project: We have three Docker images available for this project:
1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`) 1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`) 2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`)
Additionally, there are the following images, similar to the above: Additionally, there are the following images, similar to the above:
- `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`) - `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`) - `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) - `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) - `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
The GPU-enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library), you'll need to build the images locally for now. The GPU-enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library), you'll need to build the images locally for now.
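For reference, a minimal sketch of building the plain CPU `server` image locally from the Dockerfile added in this change (the `local/llama.cpp:server` tag is just an example name):
```bash
# build the CPU-only server image from the new Dockerfile
docker build -t local/llama.cpp:server -f .devops/server.Dockerfile .
```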
@ -965,6 +980,12 @@ or with a light image:
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
``` ```
or with a server image:
```bash
docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggerganov/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512
```
### Docker With CUDA ### Docker With CUDA
Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU enabled cloud, `cuBLAS` should be accessible inside the container. Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU enabled cloud, `cuBLAS` should be accessible inside the container.
@ -974,6 +995,7 @@ Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia
```bash ```bash
docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile . docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
docker build -t local/llama.cpp:light-cuda -f .devops/main-cuda.Dockerfile . docker build -t local/llama.cpp:light-cuda -f .devops/main-cuda.Dockerfile .
docker build -t local/llama.cpp:server-cuda -f .devops/server-cuda.Dockerfile .
``` ```
You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture. You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.
@ -987,6 +1009,7 @@ The resulting images, are essentially the same as the non-CUDA images:
1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. 1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
2. `local/llama.cpp:light-cuda`: This image only includes the main executable file. 2. `local/llama.cpp:light-cuda`: This image only includes the main executable file.
3. `local/llama.cpp:server-cuda`: This image only includes the server executable file.
#### Usage #### Usage
@ -995,6 +1018,7 @@ After building locally, Usage is similar to the non-CUDA examples, but you'll ne
```bash ```bash
docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
``` ```
### Contributing ### Contributing

252
README_sycl.md Normal file
View file

@ -0,0 +1,252 @@
# llama.cpp for SYCL
[Background](#background)
[OS](#os)
[Intel GPU](#intel-gpu)
[Linux](#linux)
[Environment Variable](#environment-variable)
[Known Issue](#known-issue)
[Todo](#todo)
## Background
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators—such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.
oneAPI is a specification that is open and standards-based, supporting multiple architecture types including but not limited to GPU, CPU, and FPGA. The spec has both direct programming and API-based programming paradigms.
Intel uses SYCL as the direct programming language to support CPUs, GPUs and FPGAs.
To avoid reinventing the wheel, this code follows the other backend code paths in llama.cpp (OpenBLAS, cuBLAS, CLBlast). The open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (commercial release: [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used to migrate the code to SYCL.
The llama.cpp SYCL backend supports Intel GPUs.
For Intel CPUs, we recommend the regular x86 build of llama.cpp (with Intel MKL).
## OS
|OS|Status|Verified|
|-|-|-|
|Linux|Support|Ubuntu 22.04|
|Windows|Ongoing| |
## Intel GPU
|Intel GPU| Status | Verified Model|
|-|-|-|
|Intel Data Center Max Series| Support| Max 1550|
|Intel Data Center Flex Series| Support| Flex 170|
|Intel Arc Series| Support| Arc 770|
|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
|Intel iGPU| Support| iGPU in i5-1250P, i7-1165G7|
## Linux
### Setup Environment
1. Install Intel GPU driver.
a. Please install the Intel GPU driver following the official guide: [Install GPU Drivers](https://dgpu-docs.intel.com/driver/installation.html).
Note: for iGPU, please install the client GPU driver.
b. Add your user to the video and render groups:
```
sudo usermod -aG render username
sudo usermod -aG video username
```
Note: log out and log back in for the group changes to take effect.
c. Check
```
sudo apt install clinfo
sudo clinfo -l
```
Output (example):
```
Platform #0: Intel(R) OpenCL Graphics
`-- Device #0: Intel(R) Arc(TM) A770 Graphics
Platform #0: Intel(R) OpenCL HD Graphics
`-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
```
2. Install Intel® oneAPI Base toolkit.
a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
We recommend installing to the default folder: **/opt/intel/oneapi**.
The guide below uses the default folder as an example. If you installed to a different folder, adjust the paths accordingly.
b. Check
```
source /opt/intel/oneapi/setvars.sh
sycl-ls
```
There should be one or more Level-Zero devices listed, e.g. **[ext_oneapi_level_zero:gpu:0]**.
Output (example):
```
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50]
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
```
3. Build locally:
```
mkdir -p build
cd build
source /opt/intel/oneapi/setvars.sh
#for FP16
#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference
#for FP32
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
#build example/main only
#cmake --build . --config Release --target main
#build all binary
cmake --build . --config Release -v
```
or
```
./examples/sycl/build.sh
```
Note:
- By default, all binaries are built, which takes more time. To reduce build time, we recommend building only **example/main**.
### Run
1. Put the model file into the **models** folder
2. Enable the oneAPI runtime environment
```
source /opt/intel/oneapi/setvars.sh
```
3. List the device IDs
Run without parameters:
```
./build/bin/ls-sycl-device
or
./build/bin/main
```
Check the device IDs in the startup log, for example:
```
found 4 SYCL devices:
Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
```
|Attribute|Note|
|-|-|
|compute capability 1.3|Level-Zero runtime, recommended|
|compute capability 3.0|OpenCL runtime, slower than Level-Zero in most cases|
4. Set the device ID and run llama.cpp
Set device ID = 0 with **GGML_SYCL_DEVICE=0**:
```
GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
```
or run via the script:
```
./examples/sycl/run_llama2.sh
```
Note:
- By default, mmap is used to read the model file. On some systems this can cause a hang; use the **--no-mmap** parameter to disable mmap() and work around it, as shown below.
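For example, a minimal sketch of the earlier run command with mmap disabled (same model path as above):
```
GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 --no-mmap
```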
5. Check the device ID in the output
For example:
```
Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
```
## Environment Variable
#### Build
|Name|Value|Function|
|-|-|-|
|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path. <br>For FP32/FP16, LLAMA_SYCL=ON is mandatory.|
|LLAMA_SYCL_F16|ON (optional)|Enable FP16 build with SYCL code path. Faster for long-prompt inference. <br>For FP32, do not set it.|
|CMAKE_C_COMPILER|icx|Use icx as the C compiler for the SYCL code path|
|CMAKE_CXX_COMPILER|icpx|Use icpx as the C++ compiler for the SYCL code path|
#### Running
|Name|Value|Function|
|-|-|-|
|GGML_SYCL_DEVICE|0 (default) or 1|Set the device ID to use. Check the available device IDs in the default startup output|
|GGML_SYCL_DEBUG|0 (default) or 1|Enable debug logging via the GGML_SYCL_DEBUG macro|
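For illustration, a minimal sketch that combines the build-time and run-time variables above (paths and model file follow the earlier examples):
```
# build with FP16 SYCL kernels enabled
source /opt/intel/oneapi/setvars.sh
mkdir -p build && cd build
cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_F16=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
cmake --build . --config Release
cd ..

# run on device 0 with SYCL debug logging enabled
GGML_SYCL_DEVICE=0 GGML_SYCL_DEBUG=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
```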
## Known Issue
- Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
The oneAPI runtime environment is not enabled.
Install the oneAPI base toolkit and enable it with: `source /opt/intel/oneapi/setvars.sh`.
- Hang during startup
llama.cpp uses mmap by default to read the model file and copy it to the GPU. On some systems the memcpy can misbehave and block.
Solution: add **--no-mmap**.
## Todo
- Support building on Windows.
- Support multiple cards.

View file

@ -22,4 +22,8 @@ bash ./ci/run.sh ./tmp/results ./tmp/mnt
# with CUDA support # with CUDA support
GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
# with SYCL support
source /opt/intel/oneapi/setvars.sh
GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
``` ```

View file

@ -10,6 +10,9 @@
# # with CUDA support # # with CUDA support
# GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt # GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
# #
# # with SYCL support
# GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
if [ -z "$2" ]; then if [ -z "$2" ]; then
echo "usage: $0 <output-dir> <mnt-dir>" echo "usage: $0 <output-dir> <mnt-dir>"
@ -22,9 +25,9 @@ mkdir -p "$2"
OUT=$(realpath "$1") OUT=$(realpath "$1")
MNT=$(realpath "$2") MNT=$(realpath "$2")
rm -v $OUT/*.log rm -f "$OUT/*.log"
rm -v $OUT/*.exit rm -f "$OUT/*.exit"
rm -v $OUT/*.md rm -f "$OUT/*.md"
sd=`dirname $0` sd=`dirname $0`
cd $sd/../ cd $sd/../
@ -40,6 +43,14 @@ if [ ! -z ${GG_BUILD_CUDA} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1" CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1"
fi fi
if [ ! -z ${GG_BUILD_SYCL} ]; then
if [ -z ${ONEAPI_ROOT} ]; then
echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:\n source /opt/intel/oneapi/setvars.sh"
exit 1
fi
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON"
fi
## helpers ## helpers
# download a file if it does not exist or if it is outdated # download a file if it does not exist or if it is outdated
@ -94,7 +105,7 @@ function gg_run_ctest_debug {
(time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
(time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
set +e set +e
} }
@ -123,9 +134,9 @@ function gg_run_ctest_release {
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
if [ -z ${GG_BUILD_LOW_PERF} ]; then if [ -z ${GG_BUILD_LOW_PERF} ]; then
(time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log (time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log
else else
(time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
fi fi
set +e set +e
@ -141,6 +152,61 @@ function gg_sum_ctest_release {
gg_printf '```\n' gg_printf '```\n'
} }
function gg_get_model {
local gguf_3b="$MNT/models/open-llama/3B-v2/ggml-model-f16.gguf"
local gguf_7b="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
if [[ -s $gguf_3b ]]; then
echo -n "$gguf_3b"
elif [[ -s $gguf_7b ]]; then
echo -n "$gguf_7b"
else
echo >&2 "No model found. Can't run gg_run_ctest_with_model."
exit 1
fi
}
function gg_run_ctest_with_model_debug {
cd ${SRC}
local model; model=$(gg_get_model)
cd build-ci-debug
set -e
(LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
set +e
cd ..
}
function gg_run_ctest_with_model_release {
cd ${SRC}
local model; model=$(gg_get_model)
cd build-ci-release
set -e
(LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
set +e
cd ..
}
function gg_sum_ctest_with_model_debug {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'Runs ctest with model files in debug mode\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '```\n'
gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
gg_printf '```\n'
}
function gg_sum_ctest_with_model_release {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'Runs ctest with model files in release mode\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '```\n'
gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
gg_printf '```\n'
}
# open_llama_3b_v2 # open_llama_3b_v2
function gg_run_open_llama_3b_v2 { function gg_run_open_llama_3b_v2 {
@ -183,8 +249,6 @@ function gg_run_open_llama_3b_v2 {
wiki_test_60="${path_wiki}/wiki.test-60.raw" wiki_test_60="${path_wiki}/wiki.test-60.raw"
./bin/test-autorelease ${model_f16}
./bin/quantize ${model_f16} ${model_q8_0} q8_0 ./bin/quantize ${model_f16} ${model_q8_0} q8_0
./bin/quantize ${model_f16} ${model_q4_0} q4_0 ./bin/quantize ${model_f16} ${model_q4_0} q4_0
./bin/quantize ${model_f16} ${model_q4_1} q4_1 ./bin/quantize ${model_f16} ${model_q4_1} q4_1
@ -507,14 +571,18 @@ function gg_sum_open_llama_7b_v2 {
## main ## main
if [ -z ${GG_BUILD_LOW_PERF} ]; then if [ -z ${GG_BUILD_LOW_PERF} ]; then
# Create symlink: ./llama.cpp/models-mnt -> $MNT/models
rm -rf ${SRC}/models-mnt rm -rf ${SRC}/models-mnt
mnt_models=${MNT}/models mnt_models=${MNT}/models
mkdir -p ${mnt_models} mkdir -p ${mnt_models}
ln -sfn ${mnt_models} ${SRC}/models-mnt ln -sfn ${mnt_models} ${SRC}/models-mnt
python3 -m pip install -r ${SRC}/requirements.txt # Create a fresh python3 venv and enter it
python3 -m pip install --editable gguf-py python3 -m venv "$MNT/venv"
source "$MNT/venv/bin/activate"
pip install -r ${SRC}/requirements.txt --disable-pip-version-check
pip install --editable gguf-py --disable-pip-version-check
fi fi
ret=0 ret=0
@ -529,6 +597,8 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
else else
test $ret -eq 0 && gg_run open_llama_7b_v2 test $ret -eq 0 && gg_run open_llama_7b_v2
fi fi
test $ret -eq 0 && gg_run ctest_with_model_debug
test $ret -eq 0 && gg_run ctest_with_model_release
fi fi
fi fi

View file

@ -42,6 +42,10 @@
#pragma warning(disable: 4244 4267) // possible loss of data #pragma warning(disable: 4244 4267) // possible loss of data
#endif #endif
#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL))
#define GGML_USE_CUBLAS_SYCL
#endif
int32_t get_num_physical_cores() { int32_t get_num_physical_cores() {
#ifdef __linux__ #ifdef __linux__
// enumerate the set of thread siblings, num entries is num cores // enumerate the set of thread siblings, num entries is num cores
@ -203,6 +207,23 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
params.prompt_cache_all = true; params.prompt_cache_all = true;
} else if (arg == "--prompt-cache-ro") { } else if (arg == "--prompt-cache-ro") {
params.prompt_cache_ro = true; params.prompt_cache_ro = true;
} else if (arg == "-bf" || arg == "--binary-file") {
if (++i >= argc) {
invalid_param = true;
break;
}
std::ifstream file(argv[i], std::ios::binary);
if (!file) {
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
invalid_param = true;
break;
}
// store the external file name in params
params.prompt_file = argv[i];
std::ostringstream ss;
ss << file.rdbuf();
params.prompt = ss.str();
fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), argv[i]);
} else if (arg == "-f" || arg == "--file") { } else if (arg == "-f" || arg == "--file") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -582,9 +603,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break; break;
} }
params.main_gpu = std::stoi(argv[i]); params.main_gpu = std::stoi(argv[i]);
#ifndef GGML_USE_CUBLAS #ifndef GGML_USE_CUBLAS_SYCL
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the main GPU has no effect.\n"); fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n");
#endif // GGML_USE_CUBLAS #endif // GGML_USE_CUBLAS_SYCL
} else if (arg == "--split-mode" || arg == "-sm") { } else if (arg == "--split-mode" || arg == "-sm") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -601,9 +622,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
invalid_param = true; invalid_param = true;
break; break;
} }
#ifndef GGML_USE_CUBLAS #ifndef GGML_USE_CUBLAS_SYCL
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n"); fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n");
#endif // GGML_USE_CUBLAS #endif // GGML_USE_CUBLAS_SYCL
} else if (arg == "--tensor-split" || arg == "-ts") { } else if (arg == "--tensor-split" || arg == "-ts") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -626,9 +648,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
params.tensor_split[i] = 0.0f; params.tensor_split[i] = 0.0f;
} }
} }
#ifndef GGML_USE_CUBLAS #ifndef GGML_USE_CUBLAS_SYCL
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting a tensor split has no effect.\n"); fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting a tensor split has no effect.\n");
#endif // GGML_USE_CUBLAS #endif // GGML_USE_CUBLAS_SYCL
} else if (arg == "--no-mmap") { } else if (arg == "--no-mmap") {
params.use_mmap = false; params.use_mmap = false;
} else if (arg == "--numa") { } else if (arg == "--numa") {
@ -653,6 +675,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
if (params.logdir.back() != DIRECTORY_SEPARATOR) { if (params.logdir.back() != DIRECTORY_SEPARATOR) {
params.logdir += DIRECTORY_SEPARATOR; params.logdir += DIRECTORY_SEPARATOR;
} }
} else if (arg == "--save-all-logits" || arg == "--kl-divergence-base") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.logits_file = argv[i];
} else if (arg == "--perplexity" || arg == "--all-logits") { } else if (arg == "--perplexity" || arg == "--all-logits") {
params.logits_all = true; params.logits_all = true;
} else if (arg == "--ppl-stride") { } else if (arg == "--ppl-stride") {
@ -689,6 +717,16 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break; break;
} }
params.winogrande_tasks = std::stoi(argv[i]); params.winogrande_tasks = std::stoi(argv[i]);
} else if (arg == "--multiple-choice") {
params.multiple_choice = true;
} else if (arg == "--multiple-choice-tasks") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.multiple_choice_tasks = std::stoi(argv[i]);
} else if (arg == "--kl-divergence") {
params.kl_divergence = true;
} else if (arg == "--ignore-eos") { } else if (arg == "--ignore-eos") {
params.ignore_eos = true; params.ignore_eos = true;
} else if (arg == "--no-penalize-nl") { } else if (arg == "--no-penalize-nl") {
@ -888,6 +926,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" --in-suffix STRING string to suffix after user inputs with (default: empty)\n"); printf(" --in-suffix STRING string to suffix after user inputs with (default: empty)\n");
printf(" -f FNAME, --file FNAME\n"); printf(" -f FNAME, --file FNAME\n");
printf(" prompt file to start generation.\n"); printf(" prompt file to start generation.\n");
printf(" -bf FNAME, --binary-file FNAME\n");
printf(" binary file containing multiple choice tasks.\n");
printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict); printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx); printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
@ -936,6 +976,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks); printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
printf(" --winogrande compute Winogrande score over random tasks from datafile supplied with -f\n"); printf(" --winogrande compute Winogrande score over random tasks from datafile supplied with -f\n");
printf(" --winogrande-tasks N number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks); printf(" --winogrande-tasks N number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks);
printf(" --multiple-choice compute multiple choice score over random tasks from datafile supplied with -f\n");
printf(" --multiple-choice-tasks N number of tasks to use when computing the multiple choice score (default: %zu)\n", params.winogrande_tasks);
printf(" --kl-divergence computes KL-divergence to logits provided via --kl-divergence-base");
printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft); printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks); printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
@ -969,7 +1012,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n"); printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n"); printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu); printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
#endif #endif // LLAMA_SUPPORTS_GPU_OFFLOAD
printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false"); printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false"); printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
printf(" -gan N, --grp-attn-n N\n"); printf(" -gan N, --grp-attn-n N\n");
@ -1476,7 +1519,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false"); fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false"); fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false"); fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false"); fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false"); fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false"); fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");

View file

@ -91,6 +91,7 @@ struct gpt_params {
std::string input_suffix = ""; // string to suffix user inputs with std::string input_suffix = ""; // string to suffix user inputs with
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
std::string logdir = ""; // directory in which to save YAML log files std::string logdir = ""; // directory in which to save YAML log files
std::string logits_file = ""; // file for saving *all* logits
std::vector<llama_model_kv_override> kv_overrides; std::vector<llama_model_kv_override> kv_overrides;
@ -108,6 +109,11 @@ struct gpt_params {
bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
size_t winogrande_tasks= 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed size_t winogrande_tasks= 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
bool kl_divergence = false; // compute KL-divergence
bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
bool random_prompt = false; // do not randomize prompt if none provided bool random_prompt = false; // do not randomize prompt if none provided
bool use_color = false; // use color to distinguish generations and inputs bool use_color = false; // use color to distinguish generations and inputs

View file

@ -13,6 +13,7 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
// will be empty (default) if there are parse errors // will be empty (default) if there are parse errors
if (result->parsed_grammar.rules.empty()) { if (result->parsed_grammar.rules.empty()) {
fprintf(stderr, "%s: failed to parse grammar\n", __func__); fprintf(stderr, "%s: failed to parse grammar\n", __func__);
delete result;
return nullptr; return nullptr;
} }
@ -129,6 +130,8 @@ static void sampler_queue(
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main)); const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
const float temp = params.temp; const float temp = params.temp;
const float dynatemp_range = params.dynatemp_range;
const float dynatemp_exponent = params.dynatemp_exponent;
const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k; const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
const float top_p = params.top_p; const float top_p = params.top_p;
const float min_p = params.min_p; const float min_p = params.min_p;
@ -143,7 +146,15 @@ static void sampler_queue(
case 'y': llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break; case 'y': llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break;
case 'p': llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break; case 'p': llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break;
case 'm': llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break; case 'm': llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break;
case 't': llama_sample_temp (ctx_main, &cur_p, temp); break; case 't':
if (dynatemp_range > 0) {
float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
float dynatemp_max = std::max(0.0f, temp + dynatemp_range);
llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent);
} else {
llama_sample_temp(ctx_main, &cur_p, temp);
}
break;
default : break; default : break;
} }
} }
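For intuition about the dynamic-temperature branch above, a minimal Python sketch follows. The clamping of dynatemp_min/dynatemp_max mirrors the C++ exactly; the entropy-to-temperature mapping is an assumption about what llama_sample_entropy does (normalized Shannon entropy raised to dynatemp_exponent, scaled into the min/max range), so treat it as an illustration rather than the reference kernel.

    # Illustration only: the mapping inside llama_sample_entropy is assumed, not copied.
    import math

    def dynamic_temperature(probs, temp, dynatemp_range, dynatemp_exponent):
        dyn_min = max(0.0, temp - dynatemp_range)   # same clamping as sampler_queue above
        dyn_max = max(0.0, temp + dynatemp_range)
        entropy = -sum(p * math.log(p) for p in probs if p > 0.0)
        max_entropy = math.log(len(probs))          # entropy of a uniform distribution
        norm = entropy / max_entropy if max_entropy > 0.0 else 0.0
        return dyn_min + (dyn_max - dyn_min) * norm ** dynatemp_exponent

    # temp=0.8, range=0.5 -> bounds [0.3, 1.3]: a peaked distribution samples "colder",
    # a near-uniform one samples "hotter".
    print(dynamic_temperature([0.97, 0.01, 0.01, 0.01], 0.8, 0.5, 1.0))  # ~0.42
    print(dynamic_temperature([0.25, 0.25, 0.25, 0.25], 0.8, 0.5, 1.0))  # 1.3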

View file

@ -18,6 +18,8 @@ typedef struct llama_sampling_params {
float tfs_z = 1.00f; // 1.0 = disabled float tfs_z = 1.00f; // 1.0 = disabled
float typical_p = 1.00f; // 1.0 = disabled float typical_p = 1.00f; // 1.0 = disabled
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
float dynatemp_range = 0.00f; // 0.0 = disabled
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
float penalty_repeat = 1.10f; // 1.0 = disabled float penalty_repeat = 1.10f; // 1.0 = disabled
float penalty_freq = 0.00f; // 0.0 = disabled float penalty_freq = 0.00f; // 0.0 = disabled

View file

@ -10,7 +10,7 @@ import re
import sys import sys
from enum import IntEnum from enum import IntEnum
from pathlib import Path from pathlib import Path
from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast, Optional from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast
import numpy as np import numpy as np
import torch import torch
@ -201,6 +201,8 @@ class Model:
return PlamoModel return PlamoModel
if model_architecture == "CodeShellForCausalLM": if model_architecture == "CodeShellForCausalLM":
return CodeShellModel return CodeShellModel
if model_architecture == "OrionForCausalLM":
return OrionModel
return Model return Model
def _is_model_safetensors(self) -> bool: def _is_model_safetensors(self) -> bool:
@ -250,6 +252,8 @@ class Model:
return gguf.MODEL_ARCH.PLAMO return gguf.MODEL_ARCH.PLAMO
if arch == "CodeShellForCausalLM": if arch == "CodeShellForCausalLM":
return gguf.MODEL_ARCH.CODESHELL return gguf.MODEL_ARCH.CODESHELL
if arch == "OrionForCausalLM":
return gguf.MODEL_ARCH.ORION
raise NotImplementedError(f'Architecture "{arch}" not supported!') raise NotImplementedError(f'Architecture "{arch}" not supported!')
@ -289,6 +293,58 @@ class Model:
special_vocab = gguf.SpecialVocab(dir_model, load_merges=True) special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
special_vocab.add_to_gguf(self.gguf_writer) special_vocab.add_to_gguf(self.gguf_writer)
def _set_vocab_qwen(self):
dir_model = self.dir_model
hparams = self.hparams
tokens: list[bytearray] = []
toktypes: list[int] = []
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
vocab_size = hparams["vocab_size"]
assert max(tokenizer.get_vocab().values()) < vocab_size
merges = []
vocab = {}
mergeable_ranks = tokenizer.mergeable_ranks
for token, rank in mergeable_ranks.items():
vocab[QwenModel.token_bytes_to_string(token)] = rank
if len(token) == 1:
continue
merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
assert len(merged) == 2
merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
# for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
added_vocab = tokenizer.special_tokens
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items()}
for i in range(vocab_size):
if i not in reverse_vocab:
pad_token = f"[PAD{i}]".encode("utf-8")
tokens.append(bytearray(pad_token))
toktypes.append(gguf.TokenType.USER_DEFINED)
elif reverse_vocab[i] in added_vocab:
tokens.append(reverse_vocab[i])
toktypes.append(gguf.TokenType.CONTROL)
else:
tokens.append(reverse_vocab[i])
toktypes.append(gguf.TokenType.NORMAL)
self.gguf_writer.add_tokenizer_model("gpt2")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
special_vocab.merges = merges
# only add special tokens when they were not already loaded from config.json
if len(special_vocab.special_token_ids) == 0:
special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
# this one is usually not in config.json anyway
special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
special_vocab.add_to_gguf(self.gguf_writer)
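A toy walk-through of the merge reconstruction in _set_vocab_qwen above, using a hypothetical three-byte vocabulary rather than a real tiktoken table: splitting each multi-byte token with only lower-ranked merges (QwenModel.bpe with max_rank, shown further below) recovers exactly the two pieces that were joined to create it.

    # Hypothetical mergeable_ranks table; real ones come from a tiktoken tokenizer.
    mergeable_ranks = {b"a": 0, b"b": 1, b"c": 2, b"ab": 3, b"abc": 4}

    def bpe(ranks, token, max_rank=None):
        # same greedy pairwise merging as QwenModel.bpe below
        parts = [bytes([b]) for b in token]
        while True:
            min_idx = min_rank = None
            for i, pair in enumerate(zip(parts[:-1], parts[1:])):
                rank = ranks.get(pair[0] + pair[1])
                if rank is not None and (min_rank is None or rank < min_rank):
                    min_idx, min_rank = i, rank
            if min_rank is None or (max_rank is not None and min_rank >= max_rank):
                break
            parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
        return parts

    merges = []
    for token, rank in mergeable_ranks.items():
        if len(token) == 1:
            continue
        left, right = bpe(mergeable_ranks, token, max_rank=rank)
        merges.append((left, right))

    print(merges)  # [(b'a', b'b'), (b'ab', b'c')]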
def _set_vocab_sentencepiece(self): def _set_vocab_sentencepiece(self):
from sentencepiece import SentencePieceProcessor from sentencepiece import SentencePieceProcessor
@ -487,6 +543,7 @@ class MPTModel(Model):
# map tensor names # map tensor names
if "scales" in name: if "scales" in name:
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales")) new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales"))
if new_name is not None:
new_name = new_name.replace("scales", "act.scales") new_name = new_name.replace("scales", "act.scales")
else: else:
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
@ -519,6 +576,83 @@ class MPTModel(Model):
self.gguf_writer.add_tensor("output.weight", data) self.gguf_writer.add_tensor("output.weight", data)
class OrionModel(Model):
def set_vocab(self):
self._set_vocab_sentencepiece()
def set_gguf_parameters(self):
block_count = self.hparams["num_hidden_layers"]
head_count = self.hparams["num_attention_heads"]
head_count_kv = self.hparams.get("num_key_value_heads", head_count)
hf_repo = self.hparams.get("_name_or_path", "")
ctx_length = 0
if "max_sequence_length" in self.hparams:
ctx_length = self.hparams["max_sequence_length"]
elif "max_position_embeddings" in self.hparams:
ctx_length = self.hparams["max_position_embeddings"]
elif "model_max_length" in self.hparams:
ctx_length = self.hparams["model_max_length"]
else:
print("gguf: can not find ctx length parameter.")
sys.exit()
self.gguf_writer.add_file_type(self.ftype)
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_source_hf_repo(hf_repo)
self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
self.gguf_writer.add_context_length(ctx_length)
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
self.gguf_writer.add_head_count(head_count)
self.gguf_writer.add_head_count_kv(head_count_kv)
self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"])
def write_tensors(self):
# Collect tensors from generator object
model_kv = dict(self.get_tensors())
block_count = self.hparams["num_hidden_layers"]
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
for name, data_torch in model_kv.items():
# we don't need these
if name.endswith(".rotary_emb.inv_freq"):
continue
old_dtype = data_torch.dtype
# convert any unsupported data types to float32
if data_torch.dtype not in (torch.float16, torch.float32):
data_torch = data_torch.to(torch.float32)
data = data_torch.squeeze().numpy()
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None:
print(f"Can not map tensor {name!r}")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if self.ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data)
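The dtype handling in write_tensors above follows the same policy used by the other converters in this script; here is a compact restatement as a standalone helper, provided only as a sketch for readability (it is not code from the repository).

    import numpy as np

    def target_dtype(ftype: int, data: np.ndarray, name: str) -> np.dtype:
        # ftype == 0 -> mostly f32 output, ftype == 1 -> mostly f16 output
        n_dims = data.ndim
        if ftype == 0 and data.dtype == np.float16:
            return np.dtype(np.float32)    # f32 output: upcast stored f16 tensors
        if ftype == 1 and data.dtype == np.float16 and n_dims == 1:
            return np.dtype(np.float32)    # 1-D tensors are kept in f32 even for f16 output
        if ftype == 1 and data.dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
            return np.dtype(np.float16)    # 2-D weight matrices are downcast to f16
        return data.dtype                  # everything else keeps its dtype

    # e.g. a 2-D float32 weight becomes float16 when converting with --outtype f16
    print(target_dtype(1, np.zeros((4, 4), dtype=np.float32), "blk.0.attn_q.weight"))  # float16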
class BaichuanModel(Model): class BaichuanModel(Model):
def set_vocab(self): def set_vocab(self):
self._set_vocab_sentencepiece() self._set_vocab_sentencepiece()
@ -876,6 +1010,13 @@ class PersimmonModel(Model):
class StableLMModel(Model): class StableLMModel(Model):
def set_vocab(self):
if (self.dir_model / "tokenizer.json").is_file():
self._set_vocab_gpt2()
else:
# StableLM 2 1.6B uses a vocab in a similar format to Qwen's vocab
self._set_vocab_qwen()
def set_gguf_parameters(self): def set_gguf_parameters(self):
hparams = self.hparams hparams = self.hparams
block_count = hparams["num_hidden_layers"] block_count = hparams["num_hidden_layers"]
@ -904,7 +1045,7 @@ class QwenModel(Model):
return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')]) return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
@staticmethod @staticmethod
def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: Optional[int] = None) -> list[bytes]: def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
parts = [bytes([b]) for b in token] parts = [bytes([b]) for b in token]
while True: while True:
min_idx = None min_idx = None
@ -921,52 +1062,7 @@ class QwenModel(Model):
return parts return parts
def set_vocab(self): def set_vocab(self):
dir_model = self.dir_model self._set_vocab_qwen()
hparams = self.hparams
tokens: list[bytearray] = []
toktypes: list[int] = []
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
vocab_size = hparams["vocab_size"]
assert max(tokenizer.get_vocab().values()) < vocab_size
merges = []
vocab = {}
mergeable_ranks = tokenizer.mergeable_ranks
for token, rank in mergeable_ranks.items():
vocab[self.token_bytes_to_string(token)] = rank
if len(token) == 1:
continue
merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
assert len(merged) == 2
merges.append(' '.join(map(self.token_bytes_to_string, merged)))
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in vocab.items()}
added_vocab = tokenizer.special_tokens
for i in range(vocab_size):
if i not in reverse_vocab:
pad_token = f"[PAD{i}]".encode("utf-8")
tokens.append(bytearray(pad_token))
toktypes.append(gguf.TokenType.USER_DEFINED)
elif reverse_vocab[i] in added_vocab:
tokens.append(reverse_vocab[i])
toktypes.append(gguf.TokenType.CONTROL)
else:
tokens.append(reverse_vocab[i])
toktypes.append(gguf.TokenType.NORMAL)
self.gguf_writer.add_tokenizer_model("gpt2")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
special_vocab.merges = merges
special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
special_vocab.add_to_gguf(self.gguf_writer)
def set_gguf_parameters(self): def set_gguf_parameters(self):
self.gguf_writer.add_name("Qwen") self.gguf_writer.add_name("Qwen")
@ -1285,7 +1381,7 @@ def main() -> None:
if args.awq_path: if args.awq_path:
sys.path.insert(1, str(Path(__file__).parent / 'awq-py')) sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
from awq.apply_awq import add_scale_weights from awq.apply_awq import add_scale_weights # type: ignore[import-not-found]
tmp_model_path = args.model / "weighted_model" tmp_model_path = args.model / "weighted_model"
dir_model = tmp_model_path dir_model = tmp_model_path
if tmp_model_path.is_dir(): if tmp_model_path.is_dir():

View file

@ -2,6 +2,7 @@
from __future__ import annotations from __future__ import annotations
import argparse import argparse
import os
import struct import struct
import sys import sys
from enum import IntEnum from enum import IntEnum
@ -9,7 +10,6 @@ from pathlib import Path
import numpy as np import numpy as np
import os
if 'NO_LOCAL_GGUF' not in os.environ: if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf import gguf
@ -371,15 +371,11 @@ def handle_metadata(cfg, hp):
params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path) params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
else: else:
raise ValueError('Unable to load metadata') raise ValueError('Unable to load metadata')
vocab = convert.load_vocab( vocab_path = Path(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir)
cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir, vocab_factory = convert.VocabFactory(vocab_path)
cfg.vocabtype) vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype, cfg.model_metadata_dir)
# FIXME: Respect cfg.vocab_dir?
svocab = gguf.SpecialVocab(cfg.model_metadata_dir,
load_merges = cfg.vocabtype == 'bpe',
n_vocab = vocab.vocab_size)
convert.check_vocab_size(params, vocab) convert.check_vocab_size(params, vocab)
return (params, vocab, svocab) return params, vocab, special_vocab
def handle_args(): def handle_args():

View file

@ -5,17 +5,16 @@ import json
import os import os
import struct import struct
import sys import sys
from pathlib import Path
from typing import Any, BinaryIO, Sequence from typing import Any, BinaryIO, Sequence
import numpy as np import numpy as np
import torch import torch
from pathlib import Path
if 'NO_LOCAL_GGUF' not in os.environ: if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf import gguf
NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1} NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}
@ -60,7 +59,14 @@ if __name__ == '__main__':
input_model = os.path.join(sys.argv[1], "adapter_model.bin") input_model = os.path.join(sys.argv[1], "adapter_model.bin")
output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin") output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")
if os.path.exists(input_model):
model = torch.load(input_model, map_location="cpu") model = torch.load(input_model, map_location="cpu")
else:
input_model = os.path.join(sys.argv[1], "adapter_model.safetensors")
# lazy import load_file only if lora is in safetensors format.
from safetensors.torch import load_file
model = load_file(input_model, device="cpu")
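The .bin/.safetensors fallback above can be read as a tiny helper; a sketch of the same logic factored into a function (file names follow the PEFT convention already used in the script):

    import os
    import torch
    from safetensors.torch import load_file

    def load_adapter(lora_dir: str) -> dict[str, torch.Tensor]:
        # prefer the classic pickle checkpoint, fall back to safetensors
        bin_path = os.path.join(lora_dir, "adapter_model.bin")
        if os.path.exists(bin_path):
            return torch.load(bin_path, map_location="cpu")
        return load_file(os.path.join(lora_dir, "adapter_model.safetensors"), device="cpu")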
arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama" arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"
if arch_name not in gguf.MODEL_ARCH_NAMES.values(): if arch_name not in gguf.MODEL_ARCH_NAMES.values():

View file

@ -1,11 +1,13 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import torch
import os
from pprint import pprint
import sys
import argparse import argparse
import os
import sys
from pathlib import Path from pathlib import Path
from pprint import pprint
import torch
from sentencepiece import SentencePieceProcessor from sentencepiece import SentencePieceProcessor
if 'NO_LOCAL_GGUF' not in os.environ: if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf import gguf
@ -69,7 +71,7 @@ def main():
persimmon_model = torch.load(args.ckpt_path) persimmon_model = torch.load(args.ckpt_path)
hparams = persimmon_model['args'] hparams = persimmon_model['args']
pprint(hparams) pprint(hparams)
tensors = {} tensors: dict[str, torch.Tensor] = {}
_flatten_dict(persimmon_model['model'], tensors, None) _flatten_dict(persimmon_model['model'], tensors, None)
arch = gguf.MODEL_ARCH.PERSIMMON arch = gguf.MODEL_ARCH.PERSIMMON

View file

@ -17,58 +17,28 @@ import signal
import struct import struct
import sys import sys
import time import time
import warnings
import zipfile import zipfile
from abc import ABCMeta, abstractmethod from abc import ABCMeta, abstractmethod
from argparse import ArgumentParser
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from dataclasses import dataclass from dataclasses import dataclass
from pathlib import Path from pathlib import Path
from typing import ( from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar
IO,
TYPE_CHECKING,
Any,
Callable,
Iterable,
Literal,
Optional,
Tuple,
TypeVar,
)
import numpy as np import numpy as np
from sentencepiece import SentencePieceProcessor from sentencepiece import SentencePieceProcessor
try: if 'NO_LOCAL_GGUF' not in os.environ:
from transformers import AutoTokenizer sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
except ModuleNotFoundError as e:
warnings.warn(f"Could not import AutoTokenizer from transformers: {e}")
# If NO_LOCAL_GGUF is not set, try to import gguf from the local gguf-py directory
if "NO_LOCAL_GGUF" not in os.environ:
# Use absolute path to the gguf-py directory
gguf_py_dir = str(Path(__file__).resolve().parent / "gguf-py")
print(gguf_py_dir) # NOTE: Remove this once path is verified after changes are completed
if gguf_py_dir not in sys.path:
sys.path.insert(1, gguf_py_dir)
# Import gguf module
try:
import gguf import gguf
except ModuleNotFoundError as e:
print(f"Could not import gguf: {e}")
sys.exit(1)
if TYPE_CHECKING: # NOTE: This isn't necessary. if TYPE_CHECKING:
from typing import TypeAlias # This can technically be omitted. from typing import TypeAlias
if hasattr(faulthandler, "register") and hasattr(signal, "SIGUSR1"): if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
faulthandler.register(signal.SIGUSR1) faulthandler.register(signal.SIGUSR1)
# NOTE: n-dimensional arrays should be directly referenced NDArray: TypeAlias = 'np.ndarray[Any, Any]'
NDArray: TypeAlias = "np.ndarray[Any, Any]"
# Why is this here? LLAMA and GPT are technically the only compatible ARCHs.
ARCH = gguf.MODEL_ARCH.LLAMA ARCH = gguf.MODEL_ARCH.LLAMA
DEFAULT_CONCURRENCY = 8 DEFAULT_CONCURRENCY = 8
@ -78,7 +48,6 @@ DEFAULT_CONCURRENCY = 8
# #
# TODO: Clean up and refactor data types
@dataclass(frozen=True) @dataclass(frozen=True)
class DataType: class DataType:
name: str name: str
@ -190,57 +159,37 @@ class Params:
n_ff: int n_ff: int
n_head: int n_head: int
n_head_kv: int n_head_kv: int
f_norm_eps: Optional[float] = None n_experts: int | None = None
n_experts: Optional[int] = None n_experts_used: int | None = None
n_experts_used: Optional[int] = None f_norm_eps: float | None = None
rope_scaling_type: Optional[gguf.RopeScalingType] = None rope_scaling_type: gguf.RopeScalingType | None = None
f_rope_freq_base: Optional[float] = None f_rope_freq_base: float | None = None
f_rope_scale: Optional[float] = None f_rope_scale: float | None = None
n_orig_ctx: Optional[int] = None n_orig_ctx: int | None = None
rope_finetuned: Optional[bool] = None rope_finetuned: bool | None = None
ftype: Optional[GGMLFileType] = None ftype: GGMLFileType | None = None
# path to the directory containing the model files # path to the directory containing the model files
path_model: Optional[Path] = None path_model: Path | None = None
@staticmethod @staticmethod
def guessed(model: LazyModel) -> "Params": def guessed(model: LazyModel) -> Params:
# try transformer naming first # try transformer naming first
n_vocab, n_embd = ( n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape
model["model.embed_tokens.weight"].shape
if "model.embed_tokens.weight" in model
else model["tok_embeddings.weight"].shape
)
# try transformer naming first # try transformer naming first
if "model.layers.0.self_attn.q_proj.weight" in model: if "model.layers.0.self_attn.q_proj.weight" in model:
n_layer = next( n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
i elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming
for i in itertools.count() n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
if f"model.layers.{i}.self_attn.q_proj.weight" not in model
)
elif (
"model.layers.0.self_attn.W_pack.weight" in model
): # next: try baichuan naming
n_layer = next(
i
for i in itertools.count()
if f"model.layers.{i}.self_attn.W_pack.weight" not in model
)
else: else:
n_layer = next( n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
i
for i in itertools.count()
if f"layers.{i}.attention.wq.weight" not in model
)
if n_layer < 1: if n_layer < 1:
raise Exception( raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
"failed to guess 'n_layer'. This model is unknown or unsupported.\n" "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
"Suggestion: provide 'config.json' of the model in the same directory containing model files."
)
n_head = n_embd // 128 # guessed n_head = n_embd // 128 # guessed
n_mult = 256 # guessed n_mult = 256 # guessed
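The n_layer probe a few lines above is simply "find the first block index whose attention tensors are missing"; a toy demonstration with a hypothetical, trimmed-down state dict:

    import itertools

    # hypothetical state dict, reduced to the keys the probe inspects
    model = {
        "model.embed_tokens.weight": None,
        "model.layers.0.self_attn.q_proj.weight": None,
        "model.layers.1.self_attn.q_proj.weight": None,
    }

    n_layer = next(i for i in itertools.count()
                   if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
    print(n_layer)  # 2 -> the model has two transformer blocks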
@ -261,7 +210,7 @@ class Params:
) )
@staticmethod @staticmethod
def load_transformers_config(model: LazyModel, config_path: Path) -> "Params": def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
config = json.load(open(config_path)) config = json.load(open(config_path))
rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
@ -274,20 +223,18 @@ class Params:
rope_scaling_type = gguf.RopeScalingType.LINEAR rope_scaling_type = gguf.RopeScalingType.LINEAR
elif typ == "yarn": elif typ == "yarn":
rope_scaling_type = gguf.RopeScalingType.YARN rope_scaling_type = gguf.RopeScalingType.YARN
n_orig_ctx = rope_scaling["original_max_position_embeddings"] n_orig_ctx = rope_scaling['original_max_position_embeddings']
rope_finetuned = rope_scaling["finetuned"] rope_finetuned = rope_scaling['finetuned']
else: else:
raise NotImplementedError(f"Unknown rope scaling type: {typ}") raise NotImplementedError(f'Unknown rope scaling type: {typ}')
if "max_sequence_length" in config: if "max_sequence_length" in config:
n_ctx = config["max_sequence_length"] n_ctx = config["max_sequence_length"]
elif "max_position_embeddings" in config: elif "max_position_embeddings" in config:
n_ctx = config["max_position_embeddings"] n_ctx = config["max_position_embeddings"]
else: else:
raise Exception( raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
"failed to guess 'n_ctx'. This model is unknown or unsupported.\n" "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
"Suggestion: provide 'config.json' of the model in the same directory containing model files."
)
n_experts = None n_experts = None
n_experts_used = None n_experts_used = None
@ -317,7 +264,7 @@ class Params:
# LLaMA v2 70B params.json # LLaMA v2 70B params.json
# {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1} # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}
@staticmethod @staticmethod
def load_torch_params(model: LazyModel, config_path: Path) -> "Params": def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
config = json.load(open(config_path)) config = json.load(open(config_path))
n_experts = None n_experts = None
@ -362,31 +309,31 @@ class Params:
) )
@staticmethod @staticmethod
def load(model_plus: ModelPlus) -> "Params": def load(model_plus: ModelPlus) -> Params:
hf_config_path = model_plus.paths[0].parent / "config.json" hf_config_path = model_plus.paths[0].parent / "config.json"
orig_config_path = model_plus.paths[0].parent / "params.json" orig_config_path = model_plus.paths[0].parent / "params.json"
if hf_config_path.exists(): if hf_config_path.exists():
params = Params.load_transformers_config(model_plus.model, hf_config_path) params = Params.loadHFTransformerJson(model_plus.model, hf_config_path)
elif orig_config_path.exists(): elif orig_config_path.exists():
params = Params.load_torch_params(model_plus.model, orig_config_path) params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path)
elif model_plus.format != "none": elif model_plus.format != 'none':
params = Params.guessed(model_plus.model) params = Params.guessed(model_plus.model)
else: else:
raise ValueError("Cannot guess params when model format is none") raise ValueError('Cannot guess params when model format is none')
params.path_model = model_plus.paths[0].parent params.path_model = model_plus.paths[0].parent
return params return params
class BpeVocab: # GPT #
def __init__( # vocab
self, fname_tokenizer: Path, fname_added_tokens: Optional[Path] #
) -> None:
self.bpe_tokenizer = json.loads( class BpeVocab:
open(str(fname_tokenizer), encoding="utf-8").read() def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
) self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
self.vocab = self.bpe_tokenizer["model"]["vocab"] self.vocab = self.bpe_tokenizer["model"]["vocab"]
added_tokens: dict[str, int] added_tokens: dict[str, int]
if fname_added_tokens is not None: if fname_added_tokens is not None:
@ -394,26 +341,23 @@ class BpeVocab: # GPT
added_tokens = json.load(open(fname_added_tokens, encoding="utf-8")) added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
else: else:
# Fall back to trying to find the added tokens in tokenizer.json # Fall back to trying to find the added tokens in tokenizer.json
tokenizer_json_file = fname_tokenizer.parent / "tokenizer.json" tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json'
if not tokenizer_json_file.is_file(): if not tokenizer_json_file.is_file():
added_tokens = {} added_tokens = {}
else: else:
tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8")) tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8"))
added_tokens = dict( added_tokens = dict(
(item["content"], item["id"]) (item['content'], item['id'])
for item in tokenizer_json.get("added_tokens", []) for item in tokenizer_json.get('added_tokens', [])
# Added tokens here can be duplicates of the main vocabulary. # Added tokens here can be duplicates of the main vocabulary.
if item["content"] not in self.bpe_tokenizer if item['content'] not in self.bpe_tokenizer)
)
vocab_size: int = len(self.vocab) vocab_size: int = len(self.vocab)
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens))) expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
actual_ids = sorted(added_tokens.values()) actual_ids = sorted(added_tokens.values())
if expected_ids != actual_ids: if expected_ids != actual_ids:
expected_end_id = vocab_size + len(actual_ids) - 1 expected_end_id = vocab_size + len(actual_ids) - 1
raise Exception( raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}")
f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}"
)
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1]) items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
self.added_tokens_dict = added_tokens self.added_tokens_dict = added_tokens
@ -442,10 +386,8 @@ class BpeVocab: # GPT
return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>" return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
class SentencePieceVocab: # LlaMa class SentencePieceVocab:
def __init__( def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]
) -> None:
self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer)) self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
added_tokens: dict[str, int] added_tokens: dict[str, int]
if fname_added_tokens is not None: if fname_added_tokens is not None:
@ -455,16 +397,12 @@ class SentencePieceVocab: # LlaMa
vocab_size: int = self.sentencepiece_tokenizer.vocab_size() vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
new_tokens = { new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
id: piece for piece, id in added_tokens.items() if id >= vocab_size
}
expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens))) expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
actual_new_ids = sorted(new_tokens.keys()) actual_new_ids = sorted(new_tokens.keys())
if expected_new_ids != actual_new_ids: if expected_new_ids != actual_new_ids:
raise ValueError( raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}"
)
# Token pieces that were added to the base vocabulary. # Token pieces that were added to the base vocabulary.
self.added_tokens_dict = added_tokens self.added_tokens_dict = added_tokens
@ -512,11 +450,15 @@ class SentencePieceVocab: # LlaMa
class HfVocab: class HfVocab:
def __init__( def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None) -> None:
self, try:
fname_tokenizer: Path, from transformers import AutoTokenizer
fname_added_tokens: Optional[Path] = None, except ImportError as e:
) -> None: raise ImportError(
"To use HfVocab, please install the `transformers` package. "
"You can install it with `pip install transformers`."
) from e
print("fname_tokenizer:", fname_tokenizer) print("fname_tokenizer:", fname_tokenizer)
# Allow the tokenizer to default to slow or fast versions. # Allow the tokenizer to default to slow or fast versions.
# Explicitly set tokenizer to use local paths. # Explicitly set tokenizer to use local paths.
@ -555,7 +497,7 @@ class HfVocab:
self.fname_tokenizer = fname_tokenizer self.fname_tokenizer = fname_tokenizer
self.fname_added_tokens = fname_added_tokens self.fname_added_tokens = fname_added_tokens
def hf_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]: def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
reverse_vocab = { reverse_vocab = {
id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items() id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
} }
@ -573,11 +515,9 @@ class HfVocab:
token_id, self.special_ids # Reuse already stored special IDs token_id, self.special_ids # Reuse already stored special IDs
) )
def get_token_type(self, token_id: int, special_ids: set) -> gguf.TokenType: def get_token_type(self, token_id: int, special_ids: set[int]) -> gguf.TokenType:
# Determine token type based on whether it's a special token # Determine token type based on whether it's a special token
return ( return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
)
def get_token_score(self, token_id: int) -> float: def get_token_score(self, token_id: int) -> float:
# Placeholder for actual logic to determine the token's score # Placeholder for actual logic to determine the token's score
@ -589,7 +529,6 @@ class HfVocab:
if text in self.specials: if text in self.specials:
toktype = self.get_token_type(self.specials[text], self.special_ids) toktype = self.get_token_type(self.specials[text], self.special_ids)
score = self.get_token_score(self.specials[text]) score = self.get_token_score(self.specials[text])
else: else:
toktype = gguf.TokenType.USER_DEFINED toktype = gguf.TokenType.USER_DEFINED
score = -1000.0 score = -1000.0
@ -783,7 +722,7 @@ def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
else: else:
model = merge_sharded([mp.model for mp in models_plus]) model = merge_sharded([mp.model for mp in models_plus])
return ModelPlus(model, paths, format, vocab) return ModelPlus(model, paths, format, vocab) # pytype: disable=wrong-arg-types
def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor: def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor:
@ -871,17 +810,13 @@ class LazyUnpickler(pickle.Unpickler):
CLASSES: dict[tuple[str, str], Any] = { CLASSES: dict[tuple[str, str], Any] = {
# getattr used here as a workaround for mypy not being smart enough to determine # getattr used here as a workaround for mypy not being smart enough to determine
# the staticmethods have a __func__ attribute. # the staticmethods have a __func__ attribute.
("torch._tensor", "_rebuild_from_type_v2"): getattr( ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
rebuild_from_type_v2, "__func__" ('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'),
), ('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16),
("torch._utils", "_rebuild_tensor_v2"): getattr( ('torch', 'HalfStorage'): LazyStorageKind(DT_F16),
lazy_rebuild_tensor_v2, "__func__" ('torch', 'FloatStorage'): LazyStorageKind(DT_F32),
), ('torch', 'IntStorage'): LazyStorageKind(DT_I32),
("torch", "BFloat16Storage"): LazyStorageKind(DT_BF16), ('torch', 'Tensor'): LazyTensor,
("torch", "HalfStorage"): LazyStorageKind(DT_F16),
("torch", "FloatStorage"): LazyStorageKind(DT_F32),
("torch", "IntStorage"): LazyStorageKind(DT_I32),
("torch", "Tensor"): LazyTensor,
} }
def find_class(self, module: str, name: str) -> Any: def find_class(self, module: str, name: str) -> Any:
@ -1022,12 +957,8 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N
class OutputFile: class OutputFile:
def __init__( def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
self, fname_out: Path, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
) -> None:
self.gguf = gguf.GGUFWriter(
fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess
)
def add_meta_arch(self, params: Params) -> None: def add_meta_arch(self, params: Params) -> None:
name = "LLaMA" name = "LLaMA"
@ -1036,7 +967,7 @@ class OutputFile:
if params.n_ctx == 4096: if params.n_ctx == 4096:
name = "LLaMA v2" name = "LLaMA v2"
elif params.path_model is not None: elif params.path_model is not None:
name = str(params.path_model.parent).split("/")[-1] name = str(params.path_model.parent).split('/')[-1]
self.gguf.add_name (name) self.gguf.add_name (name)
self.gguf.add_context_length (params.n_ctx) self.gguf.add_context_length (params.n_ctx)
@ -1047,17 +978,17 @@ class OutputFile:
self.gguf.add_head_count (params.n_head) self.gguf.add_head_count (params.n_head)
self.gguf.add_head_count_kv (params.n_head_kv) self.gguf.add_head_count_kv (params.n_head_kv)
if params.f_norm_eps is None:
raise ValueError("f_norm_eps is None")
self.gguf.add_layer_norm_rms_eps(params.f_norm_eps)
if params.n_experts: if params.n_experts:
self.gguf.add_expert_count(params.n_experts) self.gguf.add_expert_count(params.n_experts)
if params.n_experts_used: if params.n_experts_used:
self.gguf.add_expert_used_count(params.n_experts_used) self.gguf.add_expert_used_count(params.n_experts_used)
if params.f_norm_eps is not None:
self.gguf.add_layer_norm_rms_eps(params.f_norm_eps)
else:
raise ValueError('f_norm_eps is None')
if params.f_rope_freq_base is not None: if params.f_rope_freq_base is not None:
self.gguf.add_rope_freq_base(params.f_rope_freq_base) self.gguf.add_rope_freq_base(params.f_rope_freq_base)
@ -1089,7 +1020,7 @@ class OutputFile:
return tokenizer_model return tokenizer_model
def extract_vocabulary_from_model(self, vocab: Vocab) -> Tuple[list, list, list]: def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]:
tokens = [] tokens = []
scores = [] scores = []
toktypes = [] toktypes = []
@ -1124,14 +1055,10 @@ class OutputFile:
def add_tensor_info(self, name: str, tensor: LazyTensor) -> None: def add_tensor_info(self, name: str, tensor: LazyTensor) -> None:
n_elements = int(np.prod(tensor.shape)) n_elements = int(np.prod(tensor.shape))
raw_dtype = getattr(tensor.data_type, "ggml_type", None) raw_dtype = getattr(tensor.data_type, 'ggml_type', None)
data_type = ( data_type = getattr(tensor.data_type, 'quantized_type', None) or tensor.data_type.dtype
getattr(tensor.data_type, "quantized_type", None) or tensor.data_type.dtype
)
data_nbytes = tensor.data_type.elements_to_bytes(n_elements) data_nbytes = tensor.data_type.elements_to_bytes(n_elements)
self.gguf.add_tensor_info( self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype)
name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype
)
def write_meta(self) -> None: def write_meta(self) -> None:
self.gguf.write_header_to_file() self.gguf.write_header_to_file()
@ -1145,12 +1072,8 @@ class OutputFile:
@staticmethod @staticmethod
def write_vocab_only( def write_vocab_only(
fname_out: Path, fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
params: Params, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
vocab: Vocab,
svocab: gguf.SpecialVocab,
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
pad_vocab: bool = False,
) -> None: ) -> None:
check_vocab_size(params, vocab, pad_vocab = pad_vocab) check_vocab_size(params, vocab, pad_vocab = pad_vocab)
@ -1180,14 +1103,8 @@ class OutputFile:
@staticmethod @staticmethod
def write_all( def write_all(
fname_out: Path, fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab,
ftype: GGMLFileType, concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
params: Params,
model: LazyModel,
vocab: Vocab,
svocab: gguf.SpecialVocab,
concurrency: int = DEFAULT_CONCURRENCY,
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
pad_vocab: bool = False, pad_vocab: bool = False,
) -> None: ) -> None:
check_vocab_size(params, vocab, pad_vocab=pad_vocab) check_vocab_size(params, vocab, pad_vocab=pad_vocab)
@ -1207,26 +1124,19 @@ class OutputFile:
of.write_tensor_info() of.write_tensor_info()
# tensor data # tensor data
ndarrays_inner = bounded_parallel_map( ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency)
OutputFile.do_item, model.items(), concurrency=concurrency
)
if ftype == GGMLFileType.MostlyQ8_0: if ftype == GGMLFileType.MostlyQ8_0:
ndarrays = bounded_parallel_map( ndarrays = bounded_parallel_map(
OutputFile.maybe_do_quantize, OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
ndarrays_inner,
concurrency=concurrency,
max_workers=concurrency,
use_processpool_executor=True, use_processpool_executor=True,
) )
else: else:
ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner) ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
start = time.time() start = time.time()
for i, ((name, lazy_tensor), ndarray) in enumerate( for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
zip(model.items(), ndarrays)
):
elapsed = time.time() - start elapsed = time.time() - start
size = " x ".join(f"{dim:6d}" for dim in lazy_tensor.shape) size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
padi = len(str(len(model))) padi = len(str(len(model)))
print( print(
f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}" f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
@ -1363,7 +1273,7 @@ def load_some_model(path: Path) -> ModelPlus:
class VocabFactory: class VocabFactory:
def __init__(self, path: Path): def __init__(self, path: Path):
self.path = path self.path = path
self.files = { self.files: dict[str, Path | None] = {
"tokenizer.model": None, "tokenizer.model": None,
"vocab.json": None, "vocab.json": None,
"tokenizer.json": None, "tokenizer.json": None,
@ -1380,24 +1290,18 @@ class VocabFactory:
self.files[file] = parent_file_path self.files[file] = parent_file_path
print(f"Found vocab files: {self.files}") print(f"Found vocab files: {self.files}")
def _select_file(self, vocabtype: Optional[str]) -> Path: def _select_file(self, vocabtype: str | None) -> Path:
if vocabtype in ["spm", "bpe"]: if vocabtype in ["spm", "bpe"]:
for file_key in self.files.keys(): for file_key in self.files.keys():
if self.files[file_key]: if (file := self.files[file_key]) is not None:
return self.files[file_key] return file
raise FileNotFoundError(f"{vocabtype} vocab not found.") raise FileNotFoundError(f"{vocabtype} vocab not found.")
elif vocabtype == "hfft": if vocabtype == "hfft":
# For Hugging Face Fast Tokenizer, return the directory path instead of a specific file # For Hugging Face Fast Tokenizer, return the directory path instead of a specific file
return self.path return self.path
else:
raise ValueError(f"Unsupported vocabulary type {vocabtype}") raise ValueError(f"Unsupported vocabulary type {vocabtype}")
def _create_special_vocab( def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab:
self,
vocab: Vocab,
vocabtype: str,
model_parent_path: Path,
) -> gguf.SpecialVocab:
load_merges = vocabtype == "bpe" load_merges = vocabtype == "bpe"
n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None
return gguf.SpecialVocab( return gguf.SpecialVocab(
@ -1407,13 +1311,12 @@ class VocabFactory:
n_vocab=n_vocab, n_vocab=n_vocab,
) )
def load_vocab( def load_vocab(self, vocabtype: str, model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
self, vocabtype: str, model_parent_path: Path
) -> Tuple[Vocab, gguf.SpecialVocab]:
path = self._select_file(vocabtype) path = self._select_file(vocabtype)
print(f"Loading vocab file '{path}', type '{vocabtype}'") print(f"Loading vocab file '{path}', type '{vocabtype}'")
added_tokens_path = path.parent / "added_tokens.json" added_tokens_path = path.parent / "added_tokens.json"
vocab: Vocab
if vocabtype == "bpe": if vocabtype == "bpe":
vocab = BpeVocab( vocab = BpeVocab(
path, added_tokens_path if added_tokens_path.exists() else None path, added_tokens_path if added_tokens_path.exists() else None
@ -1428,6 +1331,7 @@ class VocabFactory:
) )
else: else:
raise ValueError(f"Unsupported vocabulary type {vocabtype}") raise ValueError(f"Unsupported vocabulary type {vocabtype}")
# FIXME: Respect --vocab-dir?
special_vocab = self._create_special_vocab( special_vocab = self._create_special_vocab(
vocab, vocab,
vocabtype, vocabtype,
@ -1436,7 +1340,7 @@ class VocabFactory:
return vocab, special_vocab return vocab, special_vocab
def default_output_file(model_paths: list[Path], file_type: GGMLFileType) -> Path: def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
namestr = { namestr = {
GGMLFileType.AllF32: "f32", GGMLFileType.AllF32: "f32",
GGMLFileType.MostlyF16: "f16", GGMLFileType.MostlyF16: "f16",
@ -1446,8 +1350,7 @@ def default_output_file(model_paths: list[Path], file_type: GGMLFileType) -> Pat
if ret in model_paths: if ret in model_paths:
sys.stderr.write( sys.stderr.write(
f"Error: Default output path ({ret}) would overwrite the input. " f"Error: Default output path ({ret}) would overwrite the input. "
"Please explicitly specify a path using --outfile.\n" "Please explicitly specify a path using --outfile.\n")
)
sys.exit(1) sys.exit(1)
return ret return ret
@ -1457,111 +1360,34 @@ def do_dump_model(model_plus: ModelPlus) -> None:
print(f"model_plus.format = {model_plus.format!r}") print(f"model_plus.format = {model_plus.format!r}")
print(f"model_plus.vocab = {model_plus.vocab!r}") print(f"model_plus.vocab = {model_plus.vocab!r}")
for name, lazy_tensor in model_plus.model.items(): for name, lazy_tensor in model_plus.model.items():
print( print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}")
f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}"
)
def get_argument_parser() -> ArgumentParser: def main(args_in: list[str] | None = None) -> None:
output_choices = ["f32", "f16"] output_choices = ["f32", "f16"]
if np.uint32(1) == np.uint32(1).newbyteorder("<"): if np.uint32(1) == np.uint32(1).newbyteorder("<"):
# We currently only support Q8_0 output on little endian systems. # We currently only support Q8_0 output on little endian systems.
output_choices.append("q8_0") output_choices.append("q8_0")
vocab_types = ["spm", "bpe", "hfft"]
parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
parser.add_argument("--awq-path", type=Path, help="Path to scale awq cache file", default=None)
parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
parser.add_argument("--vocab-type", choices=vocab_types, help="The vocabulary format used to define the tokenizer model (default: spm)", default="spm")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default=DEFAULT_CONCURRENCY)
parser.add_argument("--big-endian", action="store_true", help="model is executed on big endian machine")
parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
parser = argparse.ArgumentParser( args = parser.parse_args(args_in)
description="Convert a LLaMa model to a GGML compatible file"
)
parser.add_argument(
"model",
type=Path,
help="Directory containing the model file or the model file itself (*.pth, *.pt, *.bin)",
)
parser.add_argument(
"--awq-path",
type=Path,
help="Path to the Activation-aware Weight Quantization cache file",
default=None,
)
parser.add_argument(
"--dump",
action="store_true",
help="Display the model content without converting it",
)
parser.add_argument(
"--dump-single",
action="store_true",
help="Display the content of a single model file without conversion",
)
parser.add_argument(
"--vocab-only",
action="store_true",
help="Extract and output only the vocabulary",
)
parser.add_argument(
"--outtype",
choices=output_choices,
help="Output format - note: q8_0 may be very slow (default: f16 or f32 based on input)",
)
parser.add_argument(
"--vocab-dir",
type=Path,
help="Directory containing the tokenizer.model, if separate from the model file",
)
parser.add_argument(
"--vocab-type",
choices=["spm", "bpe", "hfft"], # hfft: Hugging Face Fast Tokenizer
default="spm",
help="The vocabulary format used to define the tokenizer model (default: spm)",
)
parser.add_argument(
"--pad-vocab",
action="store_true",
help="Add padding tokens when the model's vocabulary size exceeds the tokenizer metadata",
)
parser.add_argument(
"--outfile",
type=Path,
help="Specify the path for the output file (default is based on input)",
)
parser.add_argument(
"--ctx", type=int, help="Model training context (default is based on input)"
)
parser.add_argument(
"--concurrency",
type=int,
help=f"Concurrency used for conversion (default: {DEFAULT_CONCURRENCY})",
default=DEFAULT_CONCURRENCY,
)
parser.add_argument(
"--big-endian",
action="store_true",
help="Indicate that the model is executed on a big-endian machine",
)
return parser
def main(argv: Optional[list[str]] = None) -> None:
parser = get_argument_parser()
args = parser.parse_args(argv)
if args.awq_path: if args.awq_path:
sys.path.insert(1, str(Path(__file__).resolve().parent / "awq-py")) sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
from awq.apply_awq import add_scale_weights from awq.apply_awq import add_scale_weights # type: ignore[import-not-found]
tmp_model_path = args.model / "weighted_model" tmp_model_path = args.model / "weighted_model"
if tmp_model_path.is_dir(): if tmp_model_path.is_dir():
print(f"{tmp_model_path} exists as a weighted model.") print(f"{tmp_model_path} exists as a weighted model.")
@ -1580,14 +1406,11 @@ def main(argv: Optional[list[str]] = None) -> None:
if not args.vocab_only: if not args.vocab_only:
model_plus = load_some_model(args.model) model_plus = load_some_model(args.model)
else: else:
model_plus = ModelPlus( model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)
model={}, paths=[args.model / "dummy"], format="none", vocab=None
)
if args.dump: if args.dump:
do_dump_model(model_plus) do_dump_model(model_plus)
return return
endianess = gguf.GGUFEndian.LITTLE endianess = gguf.GGUFEndian.LITTLE
if args.big_endian: if args.big_endian:
endianess = gguf.GGUFEndian.BIG endianess = gguf.GGUFEndian.BIG
@ -1595,12 +1418,10 @@ def main(argv: Optional[list[str]] = None) -> None:
params = Params.load(model_plus) params = Params.load(model_plus)
if params.n_ctx == -1: if params.n_ctx == -1:
if args.ctx is None: if args.ctx is None:
raise Exception( raise Exception("The model doesn't have a context size, and you didn't specify one with --ctx\n"
"The model doesn't have a context size, and you didn't specify one with --ctx\n"
"Please specify one with --ctx:\n" "Please specify one with --ctx:\n"
" - LLaMA v1: --ctx 2048\n" " - LLaMA v1: --ctx 2048\n"
" - LLaMA v2: --ctx 4096\n" " - LLaMA v2: --ctx 4096\n")
)
params.n_ctx = args.ctx params.n_ctx = args.ctx
if args.outtype: if args.outtype:
@ -1621,42 +1442,30 @@ def main(argv: Optional[list[str]] = None) -> None:
if not args.outfile: if not args.outfile:
raise ValueError("need --outfile if using --vocab-only") raise ValueError("need --outfile if using --vocab-only")
outfile = args.outfile outfile = args.outfile
OutputFile.write_vocab_only( OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
outfile, endianess=endianess, pad_vocab=args.pad_vocab)
params,
vocab,
special_vocab,
endianess=endianess,
pad_vocab=args.pad_vocab,
)
print(f"Wrote {outfile}") print(f"Wrote {outfile}")
return return
if model_plus.vocab is not None and args.vocab_dir is None: if model_plus.vocab is not None and args.vocab_dir is None:
vocab = model_plus.vocab vocab = model_plus.vocab
print(f"Vocab info: {vocab}")
print(f"Special vocab info: {special_vocab}")
model = model_plus.model model = model_plus.model
model = convert_model_names(model, params) model = convert_model_names(model, params)
ftype = pick_output_type(model, args.outtype) ftype = pick_output_type(model, args.outtype)
model = convert_to_output_type(model, ftype) model = convert_to_output_type(model, ftype)
outfile = args.outfile or default_output_file(model_plus.paths, ftype) outfile = args.outfile or default_outfile(model_plus.paths, ftype)
params.ftype = ftype params.ftype = ftype
print(f"Writing {outfile}, format {ftype}") print(f"Writing {outfile}, format {ftype}")
OutputFile.write_all( OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
outfile, concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab)
ftype,
params,
model,
vocab,
special_vocab,
concurrency=args.concurrency,
endianess=endianess,
pad_vocab=args.pad_vocab,
)
print(f"Wrote {outfile}") print(f"Wrote {outfile}")
if __name__ == "__main__": if __name__ == '__main__':
main(sys.argv[1:]) # Exclude the first element (script name) from sys.argv main()
View file
@ -23,6 +23,9 @@ else()
add_subdirectory(infill) add_subdirectory(infill)
add_subdirectory(llama-bench) add_subdirectory(llama-bench)
add_subdirectory(llava) add_subdirectory(llava)
if (LLAMA_SYCL)
add_subdirectory(sycl)
endif()
add_subdirectory(main) add_subdirectory(main)
add_subdirectory(tokenize) add_subdirectory(tokenize)
add_subdirectory(parallel) add_subdirectory(parallel)
View file
@ -104,7 +104,7 @@ int main(int argc, char ** argv) {
ctx_params.seed = 1234; ctx_params.seed = 1234;
ctx_params.n_ctx = n_kv_max; ctx_params.n_ctx = n_kv_max;
ctx_params.n_batch = 512; ctx_params.n_batch = 2048;
ctx_params.mul_mat_q = mmq; ctx_params.mul_mat_q = mmq;
ctx_params.n_threads = params.n_threads; ctx_params.n_threads = params.n_threads;
View file
@ -1800,6 +1800,8 @@ int main(int argc, char ** argv) {
std::vector<size_t> train_samples_begin; std::vector<size_t> train_samples_begin;
std::vector<size_t> train_samples_size; std::vector<size_t> train_samples_size;
printf("%s: tokenize training data from %s\n", __func__, params.common.fn_train_data); printf("%s: tokenize training data from %s\n", __func__, params.common.fn_train_data);
printf("%s: sample-start: %s\n", __func__, params.common.sample_start.c_str());
printf("%s: include-sample-start: %s\n", __func__, params.common.include_sample_start ? "true" : "false");
tokenize_file(lctx, tokenize_file(lctx,
params.common.fn_train_data, params.common.fn_train_data,
params.common.sample_start, params.common.sample_start,
View file
@ -26,6 +26,7 @@ struct StatParams {
std::string ofile = "imatrix.dat"; std::string ofile = "imatrix.dat";
int n_output_frequency = 10; int n_output_frequency = 10;
int verbosity = 1; int verbosity = 1;
int keep_every = 0;
bool collect_output_weight = false; bool collect_output_weight = false;
}; };
@ -42,6 +43,9 @@ private:
int m_last_call = 0; int m_last_call = 0;
std::vector<float> m_src1_data; std::vector<float> m_src1_data;
std::vector<int> m_ids; // the expert ids from ggml_mul_mat_id std::vector<int> m_ids; // the expert ids from ggml_mul_mat_id
//
void save_imatrix(const char * file_name) const;
void keep_imatrix(int ncall) const;
}; };
bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) { bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
@ -117,6 +121,9 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
if (m_last_call % m_params.n_output_frequency == 0) { if (m_last_call % m_params.n_output_frequency == 0) {
save_imatrix(); save_imatrix();
} }
if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) {
keep_imatrix(m_last_call);
}
} }
} }
} else { } else {
@ -143,6 +150,9 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
if (m_last_call % m_params.n_output_frequency == 0) { if (m_last_call % m_params.n_output_frequency == 0) {
save_imatrix(); save_imatrix();
} }
if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) {
keep_imatrix(m_last_call);
}
} }
} }
@ -150,7 +160,18 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
} }
void IMatrixCollector::save_imatrix() const { void IMatrixCollector::save_imatrix() const {
const char * fname = m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str(); save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str());
}
void IMatrixCollector::keep_imatrix(int ncall) const {
auto file_name = m_params.ofile;
if (file_name.empty()) file_name = "imatrix.dat";
file_name += ".at_";
file_name += std::to_string(ncall);
save_imatrix(file_name.c_str());
}
void IMatrixCollector::save_imatrix(const char * fname) const {
std::ofstream out(fname, std::ios::binary); std::ofstream out(fname, std::ios::binary);
int n_entries = m_stats.size(); int n_entries = m_stats.size();
out.write((const char*)&n_entries, sizeof(n_entries)); out.write((const char*)&n_entries, sizeof(n_entries));
@ -248,7 +269,7 @@ static void process_logits(
} }
} }
static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl) {
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx); const int n_ctx = llama_n_ctx(ctx);
@ -269,10 +290,12 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
} }
std::vector<float> logit_history; std::vector<float> logit_history;
logit_history.resize(tokens.size());
std::vector<float> prob_history; std::vector<float> prob_history;
if (compute_ppl) {
logit_history.resize(tokens.size());
prob_history.resize(tokens.size()); prob_history.resize(tokens.size());
}
const int n_chunk_max = tokens.size() / n_ctx; const int n_chunk_max = tokens.size() / n_ctx;
@ -288,12 +311,17 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1); std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
const int num_batches = (n_ctx + n_batch - 1) / n_batch;
std::vector<float> logits;
if (compute_ppl && num_batches > 1) {
logits.reserve((size_t)n_ctx * n_vocab);
}
for (int i = 0; i < n_chunk; ++i) { for (int i = 0; i < n_chunk; ++i) {
const int start = i * n_ctx; const int start = i * n_ctx;
const int end = start + n_ctx; const int end = start + n_ctx;
const int num_batches = (n_ctx + n_batch - 1) / n_batch;
std::vector<float> logits; std::vector<float> logits;
const auto t_start = std::chrono::high_resolution_clock::now(); const auto t_start = std::chrono::high_resolution_clock::now();
@ -321,9 +349,11 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
// restore the original token in case it was set to BOS // restore the original token in case it was set to BOS
tokens[batch_start] = token_org; tokens[batch_start] = token_org;
if (compute_ppl && num_batches > 1) {
const auto * batch_logits = llama_get_logits(ctx); const auto * batch_logits = llama_get_logits(ctx);
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab); logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
} }
}
const auto t_end = std::chrono::high_resolution_clock::now(); const auto t_end = std::chrono::high_resolution_clock::now();
@ -338,16 +368,22 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
} }
if (compute_ppl) {
const int first = n_ctx/2; const int first = n_ctx/2;
process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first); workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
count += n_ctx - first - 1; count += n_ctx - first - 1;
printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
fflush(stdout); fflush(stdout);
logits.clear();
}
} }
printf("\n"); printf("\n");
if (compute_ppl) {
nll2 /= count; nll2 /= count;
nll /= count; nll /= count;
const double ppl = exp(nll); const double ppl = exp(nll);
@ -358,6 +394,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
} else { } else {
printf("Unexpected negative standard deviation of log(prob)\n"); printf("Unexpected negative standard deviation of log(prob)\n");
} }
}
return true; return true;
} }
@ -365,6 +402,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
StatParams sparams; StatParams sparams;
bool compute_ppl = true;
std::vector<char*> args; std::vector<char*> args;
args.push_back(argv[0]); args.push_back(argv[0]);
int iarg = 1; int iarg = 1;
@ -381,13 +419,22 @@ int main(int argc, char ** argv) {
} }
else if (arg == "--verbosity") { else if (arg == "--verbosity") {
sparams.verbosity = std::stoi(argv[++iarg]); sparams.verbosity = std::stoi(argv[++iarg]);
} else if (arg == "--no-ppl") {
compute_ppl = false;
} else if (arg == "--keep-imatrix") {
sparams.keep_every = std::stoi(argv[++iarg]);
} else { } else {
args.push_back(argv[iarg]); args.push_back(argv[iarg]);
} }
} }
if (iarg < argc) { if (iarg < argc) {
std::string arg{argv[iarg]};
if (arg == "--no-ppl") {
compute_ppl = false;
} else {
args.push_back(argv[iarg]); args.push_back(argv[iarg]);
} }
}
gpt_params params; gpt_params params;
params.n_batch = 512; params.n_batch = 512;
@ -448,7 +495,7 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s\n", get_system_info(params).c_str()); fprintf(stderr, "%s\n", get_system_info(params).c_str());
} }
bool OK = compute_imatrix(ctx, params); bool OK = compute_imatrix(ctx, params, compute_ppl);
if (!OK) { if (!OK) {
return 1; return 1;
} }
View file
@ -241,7 +241,7 @@ int main(int argc, char ** argv) {
LOG("add_bos: %d\n", add_bos); LOG("add_bos: %d\n", add_bos);
bool suff_rm_leading_spc = params.escape; bool suff_rm_leading_spc = params.escape;
if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) { if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
params.input_suffix.erase(0, 1); params.input_suffix.erase(0, 1);
suff_rm_leading_spc = false; suff_rm_leading_spc = false;
} }
View file
@ -562,6 +562,7 @@ struct test {
static const int build_number; static const int build_number;
static const bool cuda; static const bool cuda;
static const bool opencl; static const bool opencl;
static const bool vulkan;
static const bool metal; static const bool metal;
static const bool gpu_blas; static const bool gpu_blas;
static const bool blas; static const bool blas;
@ -643,6 +644,9 @@ struct test {
if (opencl) { if (opencl) {
return "OpenCL"; return "OpenCL";
} }
if (vulkan) {
return "Vulkan";
}
if (metal) { if (metal) {
return "Metal"; return "Metal";
} }
@ -658,7 +662,7 @@ struct test {
static const std::vector<std::string> & get_fields() { static const std::vector<std::string> & get_fields() {
static const std::vector<std::string> fields = { static const std::vector<std::string> fields = {
"build_commit", "build_number", "build_commit", "build_number",
"cuda", "opencl", "metal", "gpu_blas", "blas", "cuda", "opencl", "vulkan", "metal", "gpu_blas", "blas",
"cpu_info", "gpu_info", "cpu_info", "gpu_info",
"model_filename", "model_type", "model_size", "model_n_params", "model_filename", "model_type", "model_size", "model_n_params",
"n_batch", "n_threads", "type_k", "type_v", "n_batch", "n_threads", "type_k", "type_v",
@ -682,7 +686,7 @@ struct test {
field == "avg_ns" || field == "stddev_ns") { field == "avg_ns" || field == "stddev_ns") {
return INT; return INT;
} }
if (field == "cuda" || field == "opencl" || field == "metal" || field == "gpu_blas" || field == "blas" || if (field == "cuda" || field == "opencl" || field == "vulkan"|| field == "metal" || field == "gpu_blas" || field == "blas" ||
field == "f16_kv" || field == "no_kv_offload" || field == "mul_mat_q") { field == "f16_kv" || field == "no_kv_offload" || field == "mul_mat_q") {
return BOOL; return BOOL;
} }
@ -710,7 +714,7 @@ struct test {
} }
std::vector<std::string> values = { std::vector<std::string> values = {
build_commit, std::to_string(build_number), build_commit, std::to_string(build_number),
std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas), std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
cpu_info, gpu_info, cpu_info, gpu_info,
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params), model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v), std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
@ -738,6 +742,7 @@ const std::string test::build_commit = LLAMA_COMMIT;
const int test::build_number = LLAMA_BUILD_NUMBER; const int test::build_number = LLAMA_BUILD_NUMBER;
const bool test::cuda = !!ggml_cpu_has_cublas(); const bool test::cuda = !!ggml_cpu_has_cublas();
const bool test::opencl = !!ggml_cpu_has_clblast(); const bool test::opencl = !!ggml_cpu_has_clblast();
const bool test::vulkan = !!ggml_cpu_has_vulkan();
const bool test::metal = !!ggml_cpu_has_metal(); const bool test::metal = !!ggml_cpu_has_metal();
const bool test::gpu_blas = !!ggml_cpu_has_gpublas(); const bool test::gpu_blas = !!ggml_cpu_has_gpublas();
const bool test::blas = !!ggml_cpu_has_blas(); const bool test::blas = !!ggml_cpu_has_blas();
View file
@ -30,6 +30,7 @@ android {
} }
externalNativeBuild { externalNativeBuild {
cmake { cmake {
arguments += "-DCMAKE_BUILD_TYPE=Release"
cppFlags += listOf() cppFlags += listOf()
arguments += listOf() arguments += listOf()
} }
View file
@ -6,7 +6,7 @@
" Similarly, you could add an insert mode keybind with " Similarly, you could add an insert mode keybind with
" inoremap <C-B> <Cmd>call llama#doLlamaGen()<CR> " inoremap <C-B> <Cmd>call llama#doLlamaGen()<CR>
" "
" g:llama_api_url and g:llama_overrides can be configured in your .vimrc " g:llama_api_url, g:llama_api_key and g:llama_overrides can be configured in your .vimrc
" let g:llama_api_url = "192.168.1.10:8080" " let g:llama_api_url = "192.168.1.10:8080"
" llama_overrides can also be set through buffer/window scopes. For instance " llama_overrides can also be set through buffer/window scopes. For instance
" autocmd filetype python let b:llama_overrides = {"temp": 0.2} " autocmd filetype python let b:llama_overrides = {"temp": 0.2}
@ -82,6 +82,9 @@ func llama#doLlamaGen()
endif endif
let l:querydata.prompt = join(l:buflines, "\n") let l:querydata.prompt = join(l:buflines, "\n")
let l:curlcommand = copy(s:curlcommand) let l:curlcommand = copy(s:curlcommand)
if exists("g:llama_api_key")
call extend(l:curlcommand, ['--header', 'Authorization: Bearer ' .. g:llama_api_key])
endif
let l:curlcommand[2] = json_encode(l:querydata) let l:curlcommand[2] = json_encode(l:querydata)
let b:job = job_start(l:curlcommand, {"callback": function("s:callbackHandler", [l:cbuffer])}) let b:job = job_start(l:curlcommand, {"callback": function("s:callbackHandler", [l:cbuffer])})
endfunction endfunction
View file
@ -0,0 +1,131 @@
# MobileVLM
Currently this implementation supports [MobileVLM-v1.7](https://huggingface.co/mtgv/MobileVLM-1.7B) variants.
For more information, please see [Meituan-AutoML/MobileVLM](https://github.com/Meituan-AutoML/MobileVLM).
The implementation is based on llava and is compatible with both llava and MobileVLM. The usage is basically the same as for llava.
## Usage
Build with cmake or run `make llava-cli` to build it.
After building, run: `./llava-cli` to see the usage. For example:
```sh
./llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
--mmproj MobileVLM-1.7B/mmproj-model-f16.gguf \
--image path/to/an/image.jpg \
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? Answer the question using a single word or phrase. ASSISTANT:"
```
## Model conversion
1. Clone `MobileVLM-1.7B` and `clip-vit-large-patch14-336` locally:
```sh
git clone https://huggingface.co/mtgv/MobileVLM-1.7B
git clone https://huggingface.co/openai/clip-vit-large-patch14-336
```
2. Use `llava-surgery.py` to split the LLaVA model into its LLaMA and multimodal projector constituents:
```sh
python ./examples/llava/llava-surgery.py -m path/to/MobileVLM-1.7B
```
3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` to convert the LLaVA image encoder to GGUF:
```sh
python ./examples/llava/convert-image-encoder-to-gguf.py \
-m path/to/clip-vit-large-patch14-336 \
--llava-projector path/to/MobileVLM-1.7B/llava.projector \
--output-dir path/to/MobileVLM-1.7B \
--projector-type ldp
```
4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
```sh
python ./convert.py path/to/MobileVLM-1.7B
```
5. Use `quantize` to convert the LLaMA part's data type from `fp16` to `q4_k`:
```sh
./quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
```
Now both the LLaMA part and the image encoder are in the `MobileVLM-1.7B` directory.
## Android compile and run
### compile
Refer to `examples/llava/android/build_64.sh`:
```sh
mkdir examples/llava/android/build_64
cd examples/llava/android/build_64
../build_64.sh
```
### run on Android
Refer to `android/adb_run.sh` and modify the resource `name` and `path` values as needed.
## Some results on Android with a `Snapdragon 888` chip
### case 1
**input**
```sh
/data/local/tmp/llava-cli \
-m /data/local/tmp/ggml-model-q4_k.gguf \
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
-t 4 \
--image /data/local/tmp/demo.jpg \
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? \nAnswer the question using a single word or phrase. ASSISTANT:"
```
**output**
```sh
encode_image_with_clip: image encoded in 21148.71 ms by CLIP ( 146.87 ms per image patch)
Susan Wise Bauer
llama_print_timings: load time = 23574.72 ms
llama_print_timings: sample time = 1.24 ms / 6 runs ( 0.21 ms per token, 4850.44 tokens per second)
llama_print_timings: prompt eval time = 12460.15 ms / 246 tokens ( 50.65 ms per token, 19.74 tokens per second)
llama_print_timings: eval time = 424.86 ms / 6 runs ( 70.81 ms per token, 14.12 tokens per second)
llama_print_timings: total time = 34731.93 ms
```
### case 2
**input**
```sh
/data/local/tmp/llava-cli \
-m /data/local/tmp/ggml-model-q4_k.gguf \
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
-t 4 \
--image /data/local/tmp/cat.jpeg \
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
```
**output**
```sh
encode_image_with_clip: image encoded in 21149.51 ms by CLIP ( 146.87 ms per image patch)
The image depicts a cat sitting in the grass near some tall green plants.
llama_print_timings: load time = 23257.32 ms
llama_print_timings: sample time = 5.25 ms / 18 runs ( 0.29 ms per token, 3430.53 tokens per second)
llama_print_timings: prompt eval time = 11900.73 ms / 232 tokens ( 51.30 ms per token, 19.49 tokens per second)
llama_print_timings: eval time = 1279.03 ms / 18 runs ( 71.06 ms per token, 14.07 tokens per second)
llama_print_timings: total time = 34570.79 ms
```
## Minor shortcomings
The output `n_patch` of the `ldp` projector is 1/4 of the input. As a quick implementation, we uniformly modified the `clip_n_patches` function to return a quarter of the patch count, so when measuring time consumption, the reported per-patch time is about 4 times larger than the real cost.
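For reference, here is a condensed sketch of how that quartering is wired in, mirroring the `clip_n_patches` change in `clip.cpp` from this commit (not a standalone implementation; `clip_ctx` and `PROJECTOR_TYPE_LDP` come from the surrounding file):

```cpp
// Condensed from clip.cpp in this commit: the patch count is derived from the
// image and patch sizes, then divided by 4 when the LDP projector is active so
// that downstream buffers match the projector's reduced output.
int clip_n_patches(const struct clip_ctx * ctx) {
    auto & params = ctx->vision_model.hparams;

    int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);

    if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
        n_patches /= 4; // LDP emits 1/4 of the input patches
    }

    return n_patches;
}
```

Because the timing log divides by this quartered patch count, the reported `ms per image patch` is inflated by the same factor of 4.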
## TODO
- [ ] Support non-CPU backend for the new operators, such as `depthwise`, `hardswish`, `hardsigmoid`
- [ ] Optimize LDP projector performance
- Optimize the structure definition to avoid unnecessary memory rearrangements, to reduce the use of `ggml_permute_cpy`;
- Optimize operator implementation (ARM CPU/NVIDIA GPU): such as depthwise conv, hardswish, hardsigmoid, etc.
- [ ] run MobileVLM on `Jetson Orin`
- [ ] Support more model variants, such as `MobileVLM-3B`.
## Contributors
```sh
zhangjidong05, yangyang260, huyiming03, chenxiaotao03
```
View file
@ -0,0 +1,53 @@
#!/bin/bash
model_dir="/Users/cxt/model/llm/mobileVLM/MobileVLM-1.7B_processed"
projector_name="mmproj-model-f16.gguf"
llama_name="ggml-model-q4_k.gguf"
img_dir="/Users/cxt/model/llm"
img_name="demo.jpg"
prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? \nAnswer the question using a single word or phrase. ASSISTANT:"
# img_name="cat.jpeg"
# prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
program_dir="build_64/bin"
binName="llava-cli"
n_threads=4
deviceDir="/data/local/tmp"
saveDir="output"
if [ ! -d ${saveDir} ]; then
mkdir ${saveDir}
fi
function android_run() {
# # copy resource into device
# adb push ${model_dir}/${projector_name} ${deviceDir}/${projector_name}
# adb push ${model_dir}/${llama_name} ${deviceDir}/${llama_name}
adb push ${img_dir}/${img_name} ${deviceDir}/${img_name}
# copy program into device
adb push ${program_dir}/${binName} ${deviceDir}/${binName}
adb shell "chmod 0777 ${deviceDir}/${binName}"
# run
adb shell "echo cd ${deviceDir} ${deviceDir}/${binName} \
-m ${deviceDir}/${llama_name} \
--mmproj ${deviceDir}/${projector_name} \
-t ${n_threads} \
--image ${deviceDir}/${img_name} \
-p \"${prompt}\" \
> ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt"
adb shell "cd ${deviceDir}; pwd; ${deviceDir}/${binName} \
-m ${deviceDir}/${llama_name} \
--mmproj ${deviceDir}/${projector_name} \
-t ${n_threads} \
--image ${deviceDir}/${img_name} \
-p \"${prompt}\" \
>> ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt 2>&1"
adb pull ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt ${saveDir}
}
android_run
echo "android_run is Done!"
View file
@ -0,0 +1,8 @@
#!/bin/bash
cmake ../../../../ \
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
-DCMAKE_BUILD_TYPE=Release \
-DANDROID_ABI="arm64-v8a" \
-DANDROID_PLATFORM=android-23 $1
make -j4
View file
@ -2,17 +2,6 @@
// so there might be still unnecessary artifacts hanging around // so there might be still unnecessary artifacts hanging around
// I'll gradually clean and extend it // I'll gradually clean and extend it
#include <cassert>
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <map>
#include <regex>
#include <stdexcept>
#include <vector>
#include "clip.h" #include "clip.h"
#include "ggml.h" #include "ggml.h"
#include "ggml-alloc.h" #include "ggml-alloc.h"
@ -29,6 +18,19 @@
#define STB_IMAGE_IMPLEMENTATION #define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h" #include "stb_image.h"
#include <cassert>
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <map>
#include <regex>
#include <stdexcept>
#include <vector>
#include <sstream>
#include <cinttypes>
static std::string format(const char * fmt, ...) { static std::string format(const char * fmt, ...) {
va_list ap; va_list ap;
va_list ap2; va_list ap2;
@ -67,6 +69,7 @@ static std::string format(const char * fmt, ...) {
#define KEY_PATCH_SIZE "clip.vision.patch_size" #define KEY_PATCH_SIZE "clip.vision.patch_size"
#define KEY_IMAGE_MEAN "clip.vision.image_mean" #define KEY_IMAGE_MEAN "clip.vision.image_mean"
#define KEY_IMAGE_STD "clip.vision.image_std" #define KEY_IMAGE_STD "clip.vision.image_std"
#define KEY_PROJ_TYPE "clip.projector_type"
// //
// tensor name constants // tensor name constants
@ -89,6 +92,22 @@ static std::string format(const char * fmt, ...) {
#define TN_TEXT_PROJ "text_projection.weight" #define TN_TEXT_PROJ "text_projection.weight"
#define TN_VIS_PROJ "visual_projection.weight" #define TN_VIS_PROJ "visual_projection.weight"
#define TN_LLAVA_PROJ "mm.%d.%s" #define TN_LLAVA_PROJ "mm.%d.%s"
#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
enum projector_type {
PROJECTOR_TYPE_MLP,
PROJECTOR_TYPE_MLP_NORM,
PROJECTOR_TYPE_LDP,
PROJECTOR_TYPE_UNKNOWN,
};
static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_MLP, "mlp" },
{ PROJECTOR_TYPE_LDP, "ldp" },
};
// //
// utilities to get data from a gguf file // utilities to get data from a gguf file
@ -129,6 +148,91 @@ static std::string get_ftype(int ftype) {
return ggml_type_name(static_cast<ggml_type>(ftype)); return ggml_type_name(static_cast<ggml_type>(ftype));
} }
static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
switch (type) {
case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]);
case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]);
case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]);
case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]);
case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]);
case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]);
case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]);
case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]);
case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]);
case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false";
default: return format("unknown type %d", type);
}
}
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
std::string result;
for (size_t pos = 0; ; pos += search.length()) {
auto new_pos = s.find(search, pos);
if (new_pos == std::string::npos) {
result += s.substr(pos, s.size() - pos);
break;
}
result += s.substr(pos, new_pos - pos) + replace;
pos = new_pos;
}
s = std::move(result);
}
static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
switch (type) {
case GGUF_TYPE_STRING:
return gguf_get_val_str(ctx_gguf, i);
case GGUF_TYPE_ARRAY:
{
const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
int arr_n = gguf_get_arr_n(ctx_gguf, i);
const void * data = gguf_get_arr_data(ctx_gguf, i);
std::stringstream ss;
ss << "[";
for (int j = 0; j < arr_n; j++) {
if (arr_type == GGUF_TYPE_STRING) {
std::string val = gguf_get_arr_str(ctx_gguf, i, j);
// escape quotes
replace_all(val, "\\", "\\\\");
replace_all(val, "\"", "\\\"");
ss << '"' << val << '"';
} else if (arr_type == GGUF_TYPE_ARRAY) {
ss << "???";
} else {
ss << gguf_data_to_str(arr_type, data, j);
}
if (j < arr_n - 1) {
ss << ", ";
}
}
ss << "]";
return ss.str();
}
default:
return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
}
}
static void print_tensor_info(const ggml_tensor* tensor, const char* prefix = "") {
size_t tensor_size = ggml_nbytes(tensor);
printf("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
}
static projector_type clip_projector_type_from_string(const std::string & name) {
for (const auto & kv : PROJECTOR_TYPE_NAMES) { // NOLINT
if (kv.second == name) {
return kv.first;
}
}
return PROJECTOR_TYPE_UNKNOWN;
}
// //
// image data // image data
// //
@ -201,10 +305,44 @@ struct clip_vision_model {
struct ggml_tensor * projection; struct ggml_tensor * projection;
// LLaVA projection // LLaVA projection
struct ggml_tensor * mm_0_w; struct ggml_tensor * mm_0_w = NULL;
struct ggml_tensor * mm_0_b; struct ggml_tensor * mm_0_b = NULL;
struct ggml_tensor * mm_2_w; struct ggml_tensor * mm_2_w = NULL;
struct ggml_tensor * mm_2_b; struct ggml_tensor * mm_2_b = NULL;
// Yi type models with mlp+normalization projection
struct ggml_tensor * mm_1_w = NULL; // Yi type models have 0, 1, 3, 4
struct ggml_tensor * mm_1_b = NULL;
struct ggml_tensor * mm_3_w = NULL;
struct ggml_tensor * mm_3_b = NULL;
struct ggml_tensor * mm_4_w = NULL;
struct ggml_tensor * mm_4_b = NULL;
// MobileVLM projection
struct ggml_tensor * mm_model_mlp_1_w;
struct ggml_tensor * mm_model_mlp_1_b;
struct ggml_tensor * mm_model_mlp_3_w;
struct ggml_tensor * mm_model_mlp_3_b;
struct ggml_tensor * mm_model_block_1_block_0_0_w;
struct ggml_tensor * mm_model_block_1_block_0_1_w;
struct ggml_tensor * mm_model_block_1_block_0_1_b;
struct ggml_tensor * mm_model_block_1_block_1_fc1_w;
struct ggml_tensor * mm_model_block_1_block_1_fc1_b;
struct ggml_tensor * mm_model_block_1_block_1_fc2_w;
struct ggml_tensor * mm_model_block_1_block_1_fc2_b;
struct ggml_tensor * mm_model_block_1_block_2_0_w;
struct ggml_tensor * mm_model_block_1_block_2_1_w;
struct ggml_tensor * mm_model_block_1_block_2_1_b;
struct ggml_tensor * mm_model_block_2_block_0_0_w;
struct ggml_tensor * mm_model_block_2_block_0_1_w;
struct ggml_tensor * mm_model_block_2_block_0_1_b;
struct ggml_tensor * mm_model_block_2_block_1_fc1_w;
struct ggml_tensor * mm_model_block_2_block_1_fc1_b;
struct ggml_tensor * mm_model_block_2_block_1_fc2_w;
struct ggml_tensor * mm_model_block_2_block_1_fc2_b;
struct ggml_tensor * mm_model_block_2_block_2_0_w;
struct ggml_tensor * mm_model_block_2_block_2_1_w;
struct ggml_tensor * mm_model_block_2_block_2_1_b;
}; };
struct clip_ctx { struct clip_ctx {
@ -213,6 +351,7 @@ struct clip_ctx {
bool has_llava_projector = false; bool has_llava_projector = false;
struct clip_vision_model vision_model; struct clip_vision_model vision_model;
projector_type proj_type = PROJECTOR_TYPE_MLP;
float image_mean[3]; float image_mean[3];
float image_std[3]; float image_std[3];
@ -330,6 +469,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
// pre-layernorm // pre-layernorm
{ {
embeddings = ggml_norm(ctx0, embeddings, eps); embeddings = ggml_norm(ctx0, embeddings, eps);
ggml_set_name(embeddings, "pre_ln");
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b); embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b);
} }
@ -430,9 +570,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
free(patches_data); free(patches_data);
} }
// shape [1, 576, 1024]
// ne is whcn, ne = [1024, 576, 1, 1]
embeddings = ggml_get_rows(ctx0, embeddings, patches); embeddings = ggml_get_rows(ctx0, embeddings, patches);
// mm projection 0 // print_tensor_info(embeddings, "embeddings");
// llava projector
if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
@ -440,6 +585,141 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
} else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
// ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
// First LayerNorm
embeddings = ggml_norm(ctx0, embeddings, eps);
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w),
model.mm_1_b);
// GELU activation
embeddings = ggml_gelu(ctx0, embeddings);
// Second linear layer
embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_3_b);
// Second LayerNorm
embeddings = ggml_norm(ctx0, embeddings, eps);
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w),
model.mm_4_b);
}
else if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
// MobileVLM projector
int n_patch = 24;
struct ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings);
mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b);
mlp_1 = ggml_gelu(ctx0, mlp_1);
struct ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1);
mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b);
// mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1]
// block 1
struct ggml_tensor * block_1 = nullptr;
{
// transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24]
mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3));
mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
// stride = 1, padding = 1, bias is nullptr
block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
// layer norm
// // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
// block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
block_1 = ggml_norm(ctx0, block_1, eps);
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b);
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
// block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
// hardswish
struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
// block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
// pointwise conv
block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1);
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b);
block_1 = ggml_relu(ctx0, block_1);
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1);
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b);
block_1 = ggml_hardsigmoid(ctx0, block_1);
// block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1]
block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
block_1 = ggml_mul(ctx0, block_1_hw, block_1);
int w = block_1->ne[0], h = block_1->ne[1];
block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
// block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1);
block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
// block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
block_1 = ggml_norm(ctx0, block_1, eps);
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b);
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
// block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
// residual
block_1 = ggml_add(ctx0, mlp_3, block_1);
}
// block_2
{
// stride = 2
block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);
// block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
// layer norm
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
// block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
block_1 = ggml_norm(ctx0, block_1, eps);
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b);
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
// block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
// hardswish
struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
// not sure the parameters is right for globalAvgPooling
block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
// block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
// pointwise conv
block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1);
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b);
block_1 = ggml_relu(ctx0, block_1);
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1);
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b);
block_1 = ggml_hardsigmoid(ctx0, block_1);
// block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
block_1 = ggml_mul(ctx0, block_1_hw, block_1);
int w = block_1->ne[0], h = block_1->ne[1];
block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
// block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1);
block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
// block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
block_1 = ggml_norm(ctx0, block_1, eps);
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b);
block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]);
// block_1 shape = [1, 144, 2048], ne = [2048, 144, 1]
}
embeddings = block_1;
}
else {
GGML_ASSERT(false);
}
} }
// build the graph // build the graph
@ -485,16 +765,47 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
printf("\n"); printf("\n");
} }
const int n_tensors = gguf_get_n_tensors(ctx); const int n_tensors = gguf_get_n_tensors(ctx);
// kv // kv
if (verbosity >= 3) {
const int n_kv = gguf_get_n_kv(ctx); const int n_kv = gguf_get_n_kv(ctx);
printf("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
__func__, n_kv, n_tensors, fname);
{
std::map<enum ggml_type, uint32_t> n_type;
for (int i = 0; i < n_kv; ++i) { for (int i = 0; i < n_tensors; i++) {
const char * key = gguf_get_key(ctx, i); enum ggml_type type = gguf_get_tensor_type(ctx, i);
printf("%s: kv[%d]: key = %s\n", __func__, i, key); n_type[type]++;
}
printf("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
for (int i = 0; i < n_kv; i++) {
const char * name = gguf_get_key(ctx, i);
const enum gguf_type type = gguf_get_kv_type(ctx, i);
const std::string type_name =
type == GGUF_TYPE_ARRAY
? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx, i)), gguf_get_arr_n(ctx, i))
: gguf_type_name(type);
std::string value = gguf_kv_to_str(ctx, i);
const size_t MAX_VALUE_LEN = 40;
if (value.size() > MAX_VALUE_LEN) {
value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
}
replace_all(value, "\n", "\\n");
printf("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
}
// print type counts
for (auto & kv : n_type) {
if (kv.second == 0) {
continue;
}
printf("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
} }
printf("\n");
} }
// data // data
@ -503,12 +814,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
for (int i = 0; i < n_tensors; ++i) { for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name(ctx, i); const char * name = gguf_get_tensor_name(ctx, i);
const size_t offset = gguf_get_tensor_offset(ctx, i); const size_t offset = gguf_get_tensor_offset(ctx, i);
enum ggml_type type = gguf_get_tensor_type(ctx, i);
struct ggml_tensor * cur = ggml_get_tensor(meta, name); struct ggml_tensor * cur = ggml_get_tensor(meta, name);
size_t tensor_size = ggml_nbytes(cur); size_t tensor_size = ggml_nbytes(cur);
buffer_size += tensor_size; buffer_size += tensor_size;
if (verbosity >= 3) { if (verbosity >= 3) {
printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu\n", __func__, i, printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
ggml_n_dims(cur), cur->name, tensor_size, offset); __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
} }
} }
} }
@ -517,6 +829,23 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
clip_ctx * new_clip = new clip_ctx; clip_ctx * new_clip = new clip_ctx;
// update projector type
{
int idx = gguf_find_key(ctx, KEY_PROJ_TYPE);
if (idx != -1) {
const std::string proj_type = gguf_get_val_str(ctx, idx);
new_clip->proj_type = clip_projector_type_from_string(proj_type);
}
else {
new_clip->proj_type = PROJECTOR_TYPE_MLP;
}
if (new_clip->proj_type == PROJECTOR_TYPE_MLP) {
if (gguf_find_tensor(ctx, format(TN_LLAVA_PROJ, 3, "weight").c_str()) != -1) {
new_clip->proj_type = PROJECTOR_TYPE_MLP_NORM;
}
}
}
#ifdef GGML_USE_CUBLAS #ifdef GGML_USE_CUBLAS
new_clip->backend = ggml_backend_cuda_init(0); new_clip->backend = ggml_backend_cuda_init(0);
printf("%s: CLIP using CUDA backend\n", __func__); printf("%s: CLIP using CUDA backend\n", __func__);
@ -661,10 +990,63 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight")); vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias")); vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
// LLaVA projection
if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) {
vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight")); vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias")); vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
try {
// Yi-type llava
vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "weight"));
vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "bias"));
} catch (std::runtime_error & e) { }
try {
// missing in Yi-type llava
vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight")); vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias")); vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
} catch (std::runtime_error & e) { }
try {
// Yi-type llava
vision_model.mm_3_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "weight"));
vision_model.mm_3_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "bias"));
} catch (std::runtime_error & e) { }
try {
// Yi-type llava
vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight"));
vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias"));
} catch (std::runtime_error & e) { }
}
else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
// MobileVLM projection
vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight"));
vision_model.mm_model_mlp_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "bias"));
vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "weight"));
vision_model.mm_model_mlp_3_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "bias"));
vision_model.mm_model_block_1_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight"));
vision_model.mm_model_block_1_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight"));
vision_model.mm_model_block_1_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias"));
vision_model.mm_model_block_1_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight"));
vision_model.mm_model_block_1_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias"));
vision_model.mm_model_block_1_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight"));
vision_model.mm_model_block_1_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias"));
vision_model.mm_model_block_1_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight"));
vision_model.mm_model_block_1_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight"));
vision_model.mm_model_block_1_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias"));
vision_model.mm_model_block_2_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight"));
vision_model.mm_model_block_2_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight"));
vision_model.mm_model_block_2_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias"));
vision_model.mm_model_block_2_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight"));
vision_model.mm_model_block_2_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias"));
vision_model.mm_model_block_2_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight"));
vision_model.mm_model_block_2_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias"));
vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
}
else {
std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
}
vision_model.layers.resize(hparams.n_layer); vision_model.layers.resize(hparams.n_layer);
for (int il = 0; il < hparams.n_layer; ++il) { for (int il = 0; il < hparams.n_layer; ++il) {
@ -949,7 +1331,6 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
".*weight", ".*weight",
}; };
std::vector<uint8_t> read_data(512);
std::vector<uint8_t> work(512); std::vector<uint8_t> work(512);
std::vector<float> conv_buf(512); std::vector<float> conv_buf(512);
std::vector<int64_t> hist_all(1 << 4, 0); std::vector<int64_t> hist_all(1 << 4, 0);
@ -1100,13 +1481,27 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
} }
int clip_n_mmproj_embd(const struct clip_ctx * ctx) { int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0];
}
else if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
return ctx->vision_model.mm_2_b->ne[0]; return ctx->vision_model.mm_2_b->ne[0];
} else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
return ctx->vision_model.mm_3_b->ne[0];
}
else {
std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
}
} }
int clip_n_patches(const struct clip_ctx * ctx) { int clip_n_patches(const struct clip_ctx * ctx) {
auto & params = ctx->vision_model.hparams; auto & params = ctx->vision_model.hparams;
int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
return (params.image_size / params.patch_size) * (params.image_size / params.patch_size); if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
n_patches /= 4;
}
return n_patches;
} }
size_t clip_embd_nbytes(const struct clip_ctx * ctx) { size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
View file
@ -81,6 +81,7 @@ ap.add_argument("--vision-only", action="store_true", required=False,
ap.add_argument("--clip_model_is_vision", action="store_true", required=False, ap.add_argument("--clip_model_is_vision", action="store_true", required=False,
help="The clip model is a pure vision model (ShareGPT4V vision extract for example)") help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.") ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp")
ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values") ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values")
ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values") ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values")
ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None) ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
@ -174,6 +175,8 @@ elif args.vision_only and not has_llava_projector:
fout.add_description("vision-only CLIP model") fout.add_description("vision-only CLIP model")
elif has_llava_projector: elif has_llava_projector:
fout.add_description("image encoder for LLaVA") fout.add_description("image encoder for LLaVA")
# add projector type
fout.add_string("clip.projector_type", args.projector_type)
else: else:
fout.add_description("two-tower CLIP model") fout.add_description("two-tower CLIP model")
@ -218,7 +221,8 @@ if has_llava_projector:
projector = torch.load(args.llava_projector) projector = torch.load(args.llava_projector)
for name, data in projector.items(): for name, data in projector.items():
name = get_tensor_name(name) name = get_tensor_name(name)
if data.ndim == 2: # pw and dw conv ndim==4
if data.ndim == 2 or data.ndim == 4:
data = data.squeeze().numpy().astype(np.float16) data = data.squeeze().numpy().astype(np.float16)
else: else:
data = data.squeeze().numpy().astype(np.float32) data = data.squeeze().numpy().astype(np.float32)
View file
@ -148,10 +148,35 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict; const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx_llava->ctx_llama)); const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx_llava->ctx_llama));
// llava chat format is "<system_prompt>\nUSER:<image_embeddings>\n<textual_prompt>\nASSISTANT:" std::string system_prompt, user_prompt;
eval_string(ctx_llava->ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params->n_batch, &n_past, add_bos); size_t image_pos = prompt.find("<image>");
if (image_pos != std::string::npos) {
// new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
system_prompt = prompt.substr(0, image_pos);
user_prompt = prompt.substr(image_pos + std::string("<image>").length());
// We replace \n with actual newlines in user_prompt, just in case -e was not used in templating string
size_t pos = 0;
while ((pos = user_prompt.find("\\n", pos)) != std::string::npos) {
user_prompt.replace(pos, 2, "\n");
pos += 1; // Advance past the replaced newline
}
pos = 0; // reset the search position before scanning the system prompt
while ((pos = system_prompt.find("\\n", pos)) != std::string::npos) {
system_prompt.replace(pos, 2, "\n");
pos += 1; // Advance past the replaced newline
}
printf("system_prompt: %s\n", system_prompt.c_str());
printf("user_prompt: %s\n", user_prompt.c_str());
} else {
// llava-1.5 native mode
system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
user_prompt = prompt + "\nASSISTANT:";
}
eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, add_bos);
llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past); llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
eval_string(ctx_llava->ctx_llama, (prompt + "\nASSISTANT:").c_str(), params->n_batch, &n_past, false); eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
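As an illustration of the new <image> templating mode (the wording of the prompt is invented; only the placeholder marker and the split behaviour come from the code above), a minimal Python rendering of the same split:

# Invented prompt text; the split mirrors what process_prompt now does in C++.
prompt = "A chat between a human and an assistant.\nUSER:<image>\nWhat is shown in the picture?\nASSISTANT:"
image_pos = prompt.find("<image>")
system_prompt = prompt[:image_pos]                  # evaluated first, then the image embedding
user_prompt = prompt[image_pos + len("<image>"):]   # evaluated after the image embedding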
// generate the response // generate the response
@ -162,6 +187,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
for (int i = 0; i < max_tgt_len; i++) { for (int i = 0; i < max_tgt_len; i++) {
const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past); const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
if (strcmp(tmp, "</s>") == 0) break; if (strcmp(tmp, "</s>") == 0) break;
if (strstr(tmp, "###")) break; // Yi-VL behavior
printf("%s", tmp); printf("%s", tmp);
fflush(stdout); fflush(stdout);
View file
@ -112,6 +112,43 @@ static results_log_softmax log_softmax(int n_vocab, const float * logits, int to
return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp}; return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp};
} }
static inline int nearest_int(float fval) {
//assert(fval <= 4194303.f);
float val = fval + 12582912.f;
int i; memcpy(&i, &val, sizeof(int));
return (i & 0x007fffff) - 0x00400000;
}
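The nearest_int helper above is the classic bit trick: adding 1.5 * 2^23 (12582912.0f) pushes the value into the low mantissa bits of a float32, so the FPU's round-to-nearest performs the rounding and masking the mantissa recovers the integer. A small Python port, shown only to make the bit manipulation explicit:

import struct

def nearest_int(fval: float) -> int:
    # Reinterpret the float32 bits after adding 1.5 * 2**23, exactly as the C++ memcpy does.
    (i,) = struct.unpack('<i', struct.pack('<f', fval + 12582912.0))
    return (i & 0x007fffff) - 0x00400000

assert nearest_int(2.6) == 3 and nearest_int(-2.6) == -3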
static double log_softmax(int n_vocab, const float * logits, uint16_t * log_prob, int tok) {
float max_logit = logits[0];
float min_logit = logits[0];
for (int i = 1; i < n_vocab; ++i) {
max_logit = std::max(max_logit, logits[i]);
min_logit = std::min(min_logit, logits[i]);
}
min_logit = std::max(min_logit, max_logit - 16);
double sum_exp = 0.0;
for (int i = 0; i < n_vocab; ++i) {
sum_exp += expf(logits[i] - max_logit);
}
const float log_sum_exp = log(sum_exp);
const float min_log_prob = min_logit - max_logit - log_sum_exp;
const float scale = (max_logit - min_logit)/65535.f;
float * d = (float *)log_prob;
d[0] = scale;
d[1] = min_log_prob;
log_prob += 4;
if (scale) {
const float inv_scale = 1/scale;
for (int i = 0; i < n_vocab; ++i) {
log_prob[i] = logits[i] > min_logit ? nearest_int(inv_scale*(logits[i] - min_logit)) : 0;
}
} else {
std::memset(log_prob, 0, n_vocab*sizeof(uint16_t));
}
return max_logit + log_sum_exp - logits[tok];
}
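This log_softmax overload packs one token position into nv = 2*((n_vocab + 1)/2) + 4 uint16 slots: the first four slots hold two float32 values (scale and min_log_prob) and the remaining slots hold the logits quantized relative to min_logit. A minimal Python sketch of the inverse transform, assuming a little-endian host (the C++ side writes raw native-endian memory) and using numpy purely for illustration:

import numpy as np

def decode_log_probs(record: bytes, n_vocab: int) -> np.ndarray:
    # record is one nv-slot entry: 4 uint16 slots of header (two float32) + quantized values.
    scale, min_log_prob = np.frombuffer(record[:8], dtype='<f4')
    q = np.frombuffer(record[8:], dtype='<u2')[:n_vocab].astype(np.float32)
    return scale * q + min_log_prob   # ~ log p(token i), as consumed by the KL-divergence pass below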
static void process_logits( static void process_logits(
int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers, int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
double & nll, double & nll2, float * logit_history, float * prob_history double & nll, double & nll2, float * logit_history, float * prob_history
@ -147,6 +184,130 @@ static void process_logits(
} }
} }
static void process_logits(std::ostream& out, int n_vocab, const float * logits, const int * tokens, int n_token,
std::vector<std::thread> & workers, std::vector<uint16_t> & log_probs, double & nll, double & nll2) {
std::mutex mutex;
const int nv = 2*((n_vocab + 1)/2) + 4;
int counter = 0;
auto compute = [&mutex, &counter, &log_probs, &nll, &nll2, n_vocab, logits, tokens, n_token, nv] () {
double local_nll = 0;
double local_nll2 = 0;
while (true) {
std::unique_lock<std::mutex> lock(mutex);
int i = counter++;
if (i >= n_token) {
nll += local_nll; nll2 += local_nll2;
break;
}
lock.unlock();
const double v = log_softmax(n_vocab, logits + i*n_vocab, log_probs.data() + i*nv, tokens[i+1]);
local_nll += v;
local_nll2 += v*v;
}
};
for (auto & w : workers) {
w = std::thread(compute);
}
compute();
for (auto & w : workers) {
w.join();
}
out.write((const char *)log_probs.data(), n_token*nv*sizeof(uint16_t));
}
struct kl_divergence_result {
double sum_nll = 0;
double sum_nll2 = 0;
double sum_kld = 0;
double sum_kld2 = 0;
double sum_nll_diff = 0;
double sum_nll_diff2 = 0;
size_t n_same_top = 0;
size_t count = 0;
};
static double log_softmax(int n_vocab, const float * logits, const uint16_t * base_log_prob, int tok, kl_divergence_result & kld) {
float max_logit = logits[0];
int imax = 0;
for (int i = 1; i < n_vocab; ++i) {
if (logits[i] > max_logit) {
max_logit = logits[i];
imax = i;
}
}
double sum_exp = 0.0;
for (int i = 0; i < n_vocab; ++i) {
sum_exp += expf(logits[i] - max_logit);
}
const float log_sum_exp = log(sum_exp);
const float * d = (const float *)base_log_prob;
const float scale = d[0];
const float min_log_prob = d[1];
base_log_prob += 4;
float nll = max_logit + log_sum_exp - logits[tok];
kld.sum_nll += nll;
kld.sum_nll2 += nll*nll;
nll += (scale*base_log_prob[tok] + min_log_prob);
kld.sum_nll_diff += nll;
kld.sum_nll_diff2 += nll*nll;
max_logit += log_sum_exp;
double sum = 0;
int imax_base = -1;
float p_log_base_max = 0;
for (int i = 0; i < n_vocab; ++i) {
const float p_log_base = scale*base_log_prob[i] + min_log_prob;
if (i == 0 || p_log_base > p_log_base_max) {
p_log_base_max = p_log_base;
imax_base = i;
}
if (p_log_base > -16.f) {
const float p_base = expf(p_log_base);
sum += p_base * (p_log_base - logits[i] + max_logit);
}
}
kld.sum_kld += sum;
kld.sum_kld2 += sum*sum;
++kld.count;
if (imax == imax_base) ++kld.n_same_top;
return sum;
}
static void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token,
std::vector<std::thread> & workers, const std::vector<uint16_t> & base_log_probs, kl_divergence_result & kld,
float * kld_values) {
std::mutex mutex;
const int nv = 2*((n_vocab + 1)/2) + 4;
int counter = 0;
auto compute = [&mutex, &counter, &base_log_probs, &kld, n_vocab, logits, tokens, n_token, nv, kld_values] () {
kl_divergence_result local_kld;
while (true) {
std::unique_lock<std::mutex> lock(mutex);
int i = counter++;
if (i >= n_token) {
kld.sum_nll += local_kld.sum_nll;
kld.sum_nll2 += local_kld.sum_nll2;
kld.sum_kld += local_kld.sum_kld;
kld.sum_kld2 += local_kld.sum_kld2;
kld.sum_nll_diff += local_kld.sum_nll_diff;
kld.sum_nll_diff2 += local_kld.sum_nll_diff2;
kld.n_same_top += local_kld.n_same_top;
kld.count += local_kld.count;
break;
}
lock.unlock();
double v = log_softmax(n_vocab, logits + i*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld);
kld_values[i] = (float)v;
}
};
for (auto & w : workers) {
w = std::thread(compute);
}
compute();
for (auto & w : workers) {
w.join();
}
}
static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) { static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw` // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
@ -294,6 +455,18 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx); const int n_ctx = llama_n_ctx(ctx);
std::ofstream logits_stream;
if (!params.logits_file.empty()) {
logits_stream.open(params.logits_file.c_str());
if (!logits_stream.is_open()) {
fprintf(stderr, "%s: failed to open %s for writing\n", __func__, params.logits_file.c_str());
return {};
}
fprintf(stderr, "%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
logits_stream.write("_logits_", 8);
logits_stream.write((const char *)&n_ctx, sizeof(n_ctx));
}
auto tim1 = std::chrono::high_resolution_clock::now(); auto tim1 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenizing the input ..\n", __func__); fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
@ -336,6 +509,15 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1); std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
std::vector<uint16_t> log_probs;
if (!params.logits_file.empty()) {
logits_stream.write((const char *)&n_vocab, sizeof(n_vocab));
logits_stream.write((const char *)&n_chunk, sizeof(n_chunk));
logits_stream.write((const char *)tokens.data(), n_chunk*n_ctx*sizeof(tokens[0]));
const int nv = 2*((n_vocab + 1)/2) + 4;
log_probs.resize(n_ctx * nv);
}
for (int i = 0; i < n_chunk; ++i) { for (int i = 0; i < n_chunk; ++i) {
const int start = i * n_ctx; const int start = i * n_ctx;
const int end = start + n_ctx; const int end = start + n_ctx;
@ -398,8 +580,13 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
// process the entire prompt. // process the entire prompt.
const int first = n_ctx/2; const int first = n_ctx/2;
const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx); const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
if (!params.logits_file.empty()) {
process_logits(logits_stream, n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
workers, log_probs, nll, nll2);
} else {
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first); workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
}
count += n_ctx - first - 1; count += n_ctx - first - 1;
// perplexity is e^(average negative log-likelihood) // perplexity is e^(average negative log-likelihood)
@ -458,23 +645,24 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<
return true; return true;
} }
#define K_TOKEN_CHUNK 4
static void compute_logprobs(const float * batch_logits, int n_vocab, std::vector<std::thread>& workers, static void compute_logprobs(const float * batch_logits, int n_vocab, std::vector<std::thread>& workers,
const std::vector<std::pair<size_t, llama_token>>& eval_pairs, std::vector<float>& eval_results) { const std::vector<std::pair<size_t, llama_token>>& eval_pairs, std::vector<float>& eval_results) {
constexpr int k_token_chunk = 4;
if (eval_results.size() != eval_pairs.size()) { if (eval_results.size() != eval_pairs.size()) {
eval_results.resize(eval_pairs.size()); eval_results.resize(eval_pairs.size());
} }
if (eval_pairs.empty()) return; if (eval_pairs.empty()) return;
size_t max_threads = std::min((eval_pairs.size() + k_token_chunk - 1)/k_token_chunk, workers.size()); size_t max_threads = std::min((eval_pairs.size() + K_TOKEN_CHUNK - 1)/K_TOKEN_CHUNK, workers.size());
std::atomic<int> counter(0); std::atomic<int> counter(0);
auto compute = [&counter, &eval_pairs, &eval_results, batch_logits, n_vocab] () { auto compute = [&counter, &eval_pairs, &eval_results, batch_logits, n_vocab] () {
float local_logprobs[k_token_chunk]; float local_logprobs[K_TOKEN_CHUNK];
while (true) { while (true) {
size_t first = counter.fetch_add(k_token_chunk, std::memory_order_relaxed); size_t first = counter.fetch_add(K_TOKEN_CHUNK, std::memory_order_relaxed);
if (first >= eval_results.size()) break; if (first >= eval_results.size()) break;
size_t last = std::min(first + k_token_chunk, eval_results.size()); size_t last = std::min(first + K_TOKEN_CHUNK, eval_results.size());
for (size_t i = first; i < last; ++i) { for (size_t i = first; i < last; ++i) {
auto logits = batch_logits + eval_pairs[i].first * n_vocab; auto logits = batch_logits + eval_pairs[i].first * n_vocab;
float max_logit = logits[0]; float max_logit = logits[0];
@ -497,7 +685,6 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto
for (size_t it = 0; it < max_threads; ++it) { for (size_t it = 0; it < max_threads; ++it) {
workers[it].join(); workers[it].join();
} }
} }
static void hellaswag_score(llama_context * ctx, const gpt_params & params) { static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
@ -540,14 +727,14 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
// This is needed as usual for LLaMA models // This is needed as usual for LLaMA models
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
// The tasks should be randomized so the score stabilizes quickly.
bool randomize_tasks = true;
// Number of tasks to use when computing the score // Number of tasks to use when computing the score
if (params.hellaswag_tasks < hs_task_count) { if (params.hellaswag_tasks < hs_task_count) {
hs_task_count = params.hellaswag_tasks; hs_task_count = params.hellaswag_tasks;
} }
// The tasks should be randomized so the score stabilizes quickly.
bool randomize_tasks = true;
// The random seed should not impact the final result if the computation is done over enough tasks, so kept hardcoded for now // The random seed should not impact the final result if the computation is done over enough tasks, so kept hardcoded for now
std::mt19937 rng(1); std::mt19937 rng(1);
@ -1031,6 +1218,566 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
printf("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma); printf("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma);
} }
static bool deserialize_string(std::istream & in, std::string & str) {
uint32_t size;
if (!in.read((char *)&size, sizeof(size)).fail()) {
str.resize(size);
if (!in.read((char *)&str[0], size).fail()) return true;
}
return false;
}
struct multiple_choice_answers {
std::vector<std::string> answers;
std::vector<int> labels;
bool deserialize(std::istream& in) {
uint32_t n;
in.read((char *)&n, sizeof(n));
if (in.fail() || n > 100) return false; // 100 as max. number of answers should be good enough for any practical purpose
answers.resize(n);
labels.resize(n);
for (auto& a : answers) {
if (!deserialize_string(in, a)) return false;
}
in.read((char *)labels.data(), n*sizeof(int));
return !in.fail();
}
};
struct multiple_choice_task {
std::string question; // the question (or context that needs to be continued)
multiple_choice_answers mc1; // possible answers (continuations) with a single correct answer
multiple_choice_answers mc2; // possible answers (continuations) with multiple correct answers - not handled yet
bool deserialize(std::istream& in) {
if (!deserialize_string(in, question)) return false;
return mc1.deserialize(in) && mc2.deserialize(in);
}
// For evaluation
size_t i_batch; // starting index in the llama_batch
size_t common_prefix; // max number of initial tokens that are the same in all sentences
size_t required_tokens; // needed number of tokens to evaluate all answers
std::vector<std::vector<llama_token>> seq_tokens;
std::vector<float> log_probs;
};
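The deserializers above fix a simple binary layout for the prepared multiple-choice data: each string is a uint32 length followed by its bytes, each answer set is a uint32 count followed by that many strings and int32 labels, and each task is question + mc1 + mc2; the full file additionally starts with a uint32 task count and a table of uint32 stream offsets (see multiple_choice_score below). A small Python sketch that packs one task in this layout; little-endian byte order and 32-bit ints are assumptions matching how the C++ reads raw memory on a typical x86 host:

import struct

def pack_string(s: str) -> bytes:
    data = s.encode('utf-8')
    return struct.pack('<I', len(data)) + data

def pack_answers(answers: list[str], labels: list[int]) -> bytes:
    out = struct.pack('<I', len(answers))
    out += b''.join(pack_string(a) for a in answers)
    return out + struct.pack(f'<{len(labels)}i', *labels)

def pack_task(question: str, answers: list[str], labels: list[int]) -> bytes:
    # mc1 carries the single-correct-answer set; mc2 is written as an empty set here.
    return pack_string(question) + pack_answers(answers, labels) + pack_answers([], [])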
static bool multiple_choice_prepare_one_task(llama_context * ctx, bool add_bos, multiple_choice_task& task, bool log_error) {
if (task.question.empty() || task.mc1.answers.empty()) {
if (log_error) {
printf("%s: found bad task with empty question and/or answers\n", __func__);
}
return false;
}
task.seq_tokens.reserve(task.mc1.answers.size());
for (auto& answer : task.mc1.answers) {
if (answer.empty()) {
if (log_error) {
printf("%s: found empty answer\n", __func__);
}
return false;
}
task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, add_bos));
}
auto min_len = task.seq_tokens.front().size();
for (auto& seq : task.seq_tokens) {
min_len = std::min(min_len, seq.size());
}
task.common_prefix = 0;
for (size_t k = 0; k < min_len; ++k) {
auto token = task.seq_tokens[0][k];
bool all_same = true;
for (size_t i = 1; i < task.seq_tokens.size(); ++i) {
if (task.seq_tokens[i][k] != token) {
all_same = false;
break;
}
}
if (!all_same) {
break;
}
++task.common_prefix;
}
task.required_tokens = task.common_prefix;
for (auto& seq : task.seq_tokens) {
task.required_tokens += seq.size() - task.common_prefix;
}
return true;
}
//
// Calculates score for multiple choice tasks with single correct answer from prompt.
// Commonly used LLM evaluation metrics of this type are
// * ARC
// * HellaSwag
// * MMLU
// * TruthfulQA
//
// Validation datasets for these 4 tests can be found at
// https://huggingface.co/datasets/ikawrakow/validation-datasets-for-llama.cpp
// The data for these datasets was extracted from
// git@hf.co:datasets/allenai/ai2_arc
// https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl
// git@hf.co:datasets/Stevross/mmlu
// https://huggingface.co/datasets/truthful_qa
//
static void multiple_choice_score(llama_context * ctx, const gpt_params & params) {
std::istringstream strstream(params.prompt);
uint32_t n_task;
strstream.read((char *)&n_task, sizeof(n_task));
if (strstream.fail() || n_task == 0) {
printf("%s: no tasks\n", __func__);
return;
}
printf("%s: there are %u tasks in prompt\n", __func__, n_task);
std::vector<uint32_t> task_pos(n_task);
strstream.read((char *)task_pos.data(), task_pos.size()*sizeof(uint32_t));
if (strstream.fail()) {
printf("%s: failed to raad task positions from prompt\n", __func__);
return;
}
std::vector<multiple_choice_task> tasks;
if (params.multiple_choice_tasks == 0 || params.multiple_choice_tasks >= (size_t)n_task) {
// Use all tasks
tasks.resize(n_task);
printf("%s: reading tasks", __func__);
int n_dot = n_task > 100 ? n_task/100 : 1; // at least 1 to avoid a modulo by zero below
int i = 0;
for (auto& task : tasks) {
++i;
if (!task.deserialize(strstream)) {
printf("%s: failed to read task %d of %u\n", __func__, i, n_task);
return;
}
if (i%n_dot == 0) printf(".");
}
printf("done\n");
}
else {
printf("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task);
std::mt19937 rng(1);
std::vector<int> aux(n_task);
for (uint32_t i = 0; i < n_task; ++i) aux[i] = i;
float scale = 1.f/(1.f + (float)std::mt19937::max());
tasks.resize(params.multiple_choice_tasks);
for (auto& task : tasks) {
int j = (int)(scale * rng() * aux.size());
int idx = aux[j];
aux[j] = aux.back();
aux.pop_back();
strstream.seekg(task_pos[idx], std::ios::beg);
if (!task.deserialize(strstream)) {
printf("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]);
return;
}
}
n_task = params.multiple_choice_tasks;
}
// This is needed as usual for LLaMA models
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
printf("%s: preparing task data", __func__);
fflush(stdout);
if (n_task > 500) {
printf("...");
fflush(stdout);
std::atomic<int> counter(0);
std::atomic<int> n_bad(0);
auto prepare = [&counter, &n_bad, &tasks, ctx, add_bos] () {
int num_tasks = tasks.size();
int n_bad_local = 0;
while (true) {
int first = counter.fetch_add(K_TOKEN_CHUNK);
if (first >= num_tasks) {
if (n_bad_local > 0) n_bad += n_bad_local;
break;
}
int last = std::min(first + K_TOKEN_CHUNK, num_tasks);
for (int i = first; i < last; ++i) {
if (!multiple_choice_prepare_one_task(ctx, add_bos, tasks[i], false)) ++n_bad_local;
}
}
};
size_t max_thread = std::thread::hardware_concurrency();
max_thread = std::min(max_thread, (tasks.size() + K_TOKEN_CHUNK - 1)/K_TOKEN_CHUNK);
std::vector<std::thread> workers(max_thread-1);
for (auto& w : workers) w = std::thread(prepare);
prepare();
for (auto& w : workers) w.join();
printf("done\n");
fflush(stdout);
int nbad = n_bad;
if (nbad > 0) {
printf("%s: found %d malformed tasks\n", __func__, nbad);
return;
}
} else {
int n_dot = n_task > 100 ? n_task/100 : 1; // at least 1 to avoid a modulo by zero below
int i_task = 0;
for (auto& task : tasks) {
++i_task;
if (!multiple_choice_prepare_one_task(ctx, add_bos, task, true)) {
return;
}
if (i_task%n_dot == 0) {
printf(".");
fflush(stdout);
}
}
printf("done\n");
}
printf("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size());
printf("\ntask\tacc_norm\n");
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx);
const int n_batch = params.n_batch;
const int max_tasks_per_batch = 32;
const int max_seq = 4*max_tasks_per_batch;
llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
std::vector<float> tok_logits(n_vocab);
std::vector<float> batch_logits(n_vocab*n_ctx);
std::vector<std::pair<size_t, llama_token>> eval_pairs;
std::vector<float> eval_results;
std::vector<std::thread> workers(std::thread::hardware_concurrency());
std::vector<int> batch_indeces;
int n_done = 0;
int n_correct = 0;
int n_tot_answers = 0;
for (size_t i0 = 0; i0 < tasks.size(); i0++) {
int n_cur = 0;
size_t i1 = i0;
size_t i_batch = 0; // this tells us where in `llama_batch` we are currently
llama_batch_clear(batch);
// batch as many tasks as possible into the available context
// each task gets one unique sequence id per answer
// the common prefix is shared among a task's sequences to save tokens
// we extract logits only from the last common token and from all ending tokens of each sequence
int s0 = 0;
while (n_cur + (int) tasks[i1].required_tokens <= n_ctx) {
auto& cur_task = tasks[i1];
int num_answers = cur_task.seq_tokens.size();
if (s0 + num_answers > max_seq) {
break;
}
if (int(batch_indeces.size()) != num_answers) {
batch_indeces.resize(num_answers);
}
for (int s = 0; s < num_answers; ++s) batch_indeces[s] = s0 + s;
for (size_t i = 0; i < cur_task.common_prefix; ++i) {
//llama_batch_add(batch, cur_task.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3}, false);
llama_batch_add(batch, cur_task.seq_tokens[0][i], i, batch_indeces, false);
}
batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix
for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
for (size_t i = cur_task.common_prefix; i < cur_task.seq_tokens[s].size(); ++i) {
llama_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, true);
}
}
s0 += num_answers;
cur_task.i_batch = i_batch;
i_batch += cur_task.required_tokens;
n_cur += cur_task.required_tokens;
if (++i1 == tasks.size()) {
break;
}
}
if (i0 == i1) {
fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
return;
}
llama_kv_cache_clear(ctx);
// decode all tasks [i0, i1)
if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
fprintf(stderr, "%s: llama_decode() failed\n", __func__);
return;
}
// Compute log-probs in parallel
// First we collect all tasks
eval_pairs.clear();
for (size_t i = i0; i < i1; ++i) {
auto& cur_task = tasks[i];
size_t li = cur_task.common_prefix;
for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
eval_pairs.push_back(std::make_pair(cur_task.i_batch + li++, cur_task.seq_tokens[s][j + 1]));
}
++li;
}
}
// Then we do the actual calculation
compute_logprobs(batch_logits.data(), n_vocab, workers, eval_pairs, eval_results);
size_t ir = 0;
// compute the logprobs for each ending of the decoded tasks
for (size_t i = i0; i < i1; ++i) {
auto & cur_task = tasks[i];
//printf("==== Evaluating <%s> with correct answer ", cur_task.question.c_str());
//for (int j = 0; j < int(cur_task.mc1.labels.size()); ++j) {
// if (cur_task.mc1.labels[j] == 1) {
// printf("%d", j+1);
// }
//}
//printf("\n common_prefix: %zu\n", cur_task.common_prefix);
std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*(cur_task.i_batch + cur_task.common_prefix - 1), n_vocab*sizeof(float));
const auto first_probs = softmax(tok_logits);
cur_task.log_probs.resize(cur_task.seq_tokens.size());
for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
size_t count = 1;
float log_prob = std::log(first_probs[cur_task.seq_tokens[s][cur_task.common_prefix]]);
for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
//printf(" %zu %g\n", ir, eval_results[ir]);
++count;
log_prob += eval_results[ir++];
}
cur_task.log_probs[s] = log_prob / count;
//printf(" Final: %g\n", log_prob / count);
//printf(" <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count);
}
// Find the ending with maximum logprob
size_t logprob_max_idx = 0;
float logprob_max_val = cur_task.log_probs[0];
for (size_t s = 1; s < cur_task.log_probs.size(); s++) {
if (cur_task.log_probs[s] > logprob_max_val) {
logprob_max_val = cur_task.log_probs[s];
logprob_max_idx = s;
}
}
n_tot_answers += cur_task.log_probs.size();
if (cur_task.mc1.labels[logprob_max_idx] == 1) {
++n_correct;
}
++n_done;
// Print the accumulated accuracy mean x 100
printf("%d\t%.8lf\n", n_done, 100.*n_correct/n_done);
fflush(stdout);
}
i0 = i1 - 1;
}
llama_batch_free(batch);
if (n_done < 100) return;
float p = 1.f*n_correct/n_done;
float sigma = sqrt(p*(1-p)/(n_done-1));
printf("\n Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
p = 1.f*n_done/n_tot_answers;
sigma = sqrt(p*(1-p)/(n_done-1));
printf("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
printf("\n");
}
static void kl_divergence(llama_context * ctx, const gpt_params & params) {
if (params.logits_file.empty()) {
fprintf(stderr, "%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
return;
}
std::ifstream in(params.logits_file.c_str(), std::ios::binary);
if (!in) {
fprintf(stderr, "%s: failed to open %s\n", __func__, params.logits_file.c_str());
return;
}
{
char check[9]; check[8] = 0;
in.read(check, 8);
if (in.fail() || strncmp("_logits_", check, 8) != 0) {
fprintf(stderr, "%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str());
return;
}
}
uint32_t n_ctx;
in.read((char *)&n_ctx, sizeof(n_ctx));
if (n_ctx > llama_n_ctx(ctx)) {
fprintf(stderr, "%s: %s has been computed with %d, while the current context is %d. Increase it with -c and retry\n",
__func__, params.logits_file.c_str(), n_ctx, params.n_ctx);
}
int n_vocab, n_chunk;
in.read((char *)&n_vocab, sizeof(n_vocab));
in.read((char *)&n_chunk, sizeof(n_chunk));
if (in.fail()) {
fprintf(stderr, "%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
return;
}
if (n_vocab != llama_n_vocab(llama_get_model(ctx))) {
fprintf(stderr, "%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
}
std::vector<llama_token> tokens(n_ctx * n_chunk);
if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) {
fprintf(stderr, "%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
return;
}
const int n_batch = params.n_batch;
const int num_batches = (n_ctx + n_batch - 1)/n_batch;
const int nv = 2*((n_vocab + 1)/2) + 4;
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
std::vector<float> logits;
if (num_batches > 1) {
logits.reserve(n_ctx * n_vocab);
}
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
auto mean_and_uncertainty = [] (double sum, double sum2, size_t count) {
if (count < 1) {
return std::make_pair(0., 0.);
}
double f = sum/count;
double df = sum2/count - f*f;
df = df > 0 && count > 10 ? sqrt(df/(count-1)) : 0.;
return std::make_pair(f, df);
};
kl_divergence_result kld;
auto kld_ptr = kld_values.data();
for (int i = 0; i < n_chunk; ++i) {
const int start = i * n_ctx;
const int end = start + n_ctx;
const auto t_start = std::chrono::high_resolution_clock::now();
if (in.read((char *)log_probs_uint16.data(), log_probs_uint16.size()*sizeof(uint16_t)).fail()) {
fprintf(stderr, "%s: failed reading log-probs for chunk %d\n", __func__, i);
return;
}
// clear the KV cache
llama_kv_cache_clear(ctx);
for (int j = 0; j < num_batches; ++j) {
const int batch_start = start + j * n_batch;
const int batch_size = std::min(end - batch_start, n_batch);
// save original token and restore it after eval
const auto token_org = tokens[batch_start];
// add BOS token for the first batch of each chunk
if (add_bos && j == 0) {
tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
}
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return;
}
// restore the original token in case it was set to BOS
tokens[batch_start] = token_org;
if (num_batches > 1) {
const auto * batch_logits = llama_get_logits(ctx);
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
}
}
const auto t_end = std::chrono::high_resolution_clock::now();
if (i == 0) {
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
int total_seconds = (int)(t_total * n_chunk);
if (total_seconds >= 60*60) {
fprintf(stderr, "%d hours ", total_seconds / (60*60));
total_seconds = total_seconds % (60*60);
}
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
printf("\nchunk PPL ln(PPL(Q)/PPL(base)) KL-Divergence Same top\n");
}
const int first = n_ctx/2;
const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
workers, log_probs_uint16, kld, kld_ptr);
kld_ptr += n_ctx - 1 - first;
auto ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
auto log_ppl_ratio = mean_and_uncertainty(kld.sum_nll_diff, kld.sum_nll_diff2, kld.count);
auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
auto p_top = 1.*kld.n_same_top/kld.count;
auto d_p_top = sqrt(p_top*(1 - p_top)/(kld.count - 1));
printf("%4d %10.4lf %10.5lf ± %10.5f %10.5f ± %10.5lf %.5f ± %.5f\n", i+1, exp(ppl.first),
log_ppl_ratio.first, log_ppl_ratio.second, kl_div.first, kl_div.second,
p_top, d_p_top);
fflush(stdout);
logits.clear();
}
printf("\n");
if (kld.count < 100) return; // we do not wish to do statistics on so few values
std::sort(kld_values.begin(), kld_values.end());
printf("===== KL-divergence statistics\n");
auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
printf("Average: %10.6f ±%10.6lf\n", kl_div.first, kl_div.second);
auto kld_median = kld_values.size()%2 == 0 ? 0.5f*(kld_values[kld_values.size()/2] + kld_values[kld_values.size()/2-1])
: kld_values[kld_values.size()/2];
printf("Median : %10.6f\n", kld_median);
auto percentile = [&kld_values] (float fraction) {
if (fraction <= 0) return kld_values.front();
if (fraction >= 1) return kld_values.back();
float p = fraction*(kld_values.size() - 1);
size_t ip = size_t(p); p -= ip;
return (1 - p)*kld_values[ip] + p*kld_values[std::min(ip+1, kld_values.size()-1)];
};
printf("Maximum: %10.6f\n", kld_values.back());
printf("KLD_99 : %10.6f\n", percentile(0.99f));
printf("KLD_95 : %10.6f\n", percentile(0.95f));
printf("KLD_90 : %10.6f\n", percentile(0.90f));
printf("Minimum: %10.6f\n", kld_values.front());
printf("KLD_01 : %10.6f\n", percentile(0.01f));
printf("KLD_05 : %10.6f\n", percentile(0.05f));
printf("KLD_10 : %10.6f\n", percentile(0.10f));
}
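Tying the reader above back to the writer added in perplexity(), the logits file layout is: an 8-byte "_logits_" magic, int32 n_ctx, int32 n_vocab, int32 n_chunk, the n_chunk*n_ctx evaluation tokens as int32, then for every chunk (n_ctx - 1 - n_ctx/2) quantized log-prob records of nv uint16 each. A minimal Python reader sketch, assuming a little-endian host and 32-bit llama_token (both match the raw writes above); for illustration only:

import numpy as np

def read_logits_file(path: str):
    with open(path, 'rb') as f:
        assert f.read(8) == b'_logits_'
        n_ctx, n_vocab, n_chunk = (int(x) for x in np.frombuffer(f.read(12), dtype='<i4'))
        tokens = np.frombuffer(f.read(4 * n_ctx * n_chunk), dtype='<i4').reshape(n_chunk, n_ctx)
        nv = 2 * ((n_vocab + 1) // 2) + 4   # uint16 slots per stored position
        n_rec = n_ctx - 1 - n_ctx // 2      # only the second half of each chunk is stored
        records = np.frombuffer(f.read(), dtype='<u2').reshape(n_chunk, n_rec, nv)
    return tokens, records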
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_params params; gpt_params params;
@ -1091,6 +1838,10 @@ int main(int argc, char ** argv) {
hellaswag_score(ctx, params); hellaswag_score(ctx, params);
} else if (params.winogrande) { } else if (params.winogrande) {
winogrande_score(ctx, params); winogrande_score(ctx, params);
} else if (params.multiple_choice) {
multiple_choice_score(ctx, params);
} else if (params.kl_divergence) {
kl_divergence(ctx, params);
} else { } else {
results = perplexity(ctx, params); results = perplexity(ctx, params);
} }
View file
@ -1,14 +1,14 @@
# Function calling example using pydantic models. # Function calling example using pydantic models.
import datetime import datetime
import importlib
import json import json
from enum import Enum from enum import Enum
from typing import Union, Optional from typing import Optional, Union
import requests import requests
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from pydantic_models_to_grammar import (add_run_method_to_dynamic_model, convert_dictionary_to_pydantic_model,
import importlib create_dynamic_model_from_function, generate_gbnf_grammar_and_documentation)
from pydantic_models_to_grammar import generate_gbnf_grammar_and_documentation, convert_dictionary_to_pydantic_model, add_run_method_to_dynamic_model, create_dynamic_model_from_function
# Function to get completion on the llama.cpp server with grammar. # Function to get completion on the llama.cpp server with grammar.
@ -35,7 +35,7 @@ class SendMessageToUser(BaseModel):
print(self.message) print(self.message)
# Enum for the calculator function. # Enum for the calculator tool.
class MathOperation(Enum): class MathOperation(Enum):
ADD = "add" ADD = "add"
SUBTRACT = "subtract" SUBTRACT = "subtract"
@ -43,7 +43,7 @@ class MathOperation(Enum):
DIVIDE = "divide" DIVIDE = "divide"
# Very simple calculator tool for the agent. # Simple pydantic calculator tool for the agent that can add, subtract, multiply, and divide. Docstring and description of fields will be used in system prompt.
class Calculator(BaseModel): class Calculator(BaseModel):
""" """
Perform a math operation on two numbers. Perform a math operation on two numbers.
@ -148,37 +148,6 @@ def get_current_datetime(output_format: Optional[str] = None):
return datetime.datetime.now().strftime(output_format) return datetime.datetime.now().strftime(output_format)
# Enum for the calculator tool.
class MathOperation(Enum):
ADD = "add"
SUBTRACT = "subtract"
MULTIPLY = "multiply"
DIVIDE = "divide"
# Simple pydantic calculator tool for the agent that can add, subtract, multiply, and divide. Docstring and description of fields will be used in system prompt.
class Calculator(BaseModel):
"""
Perform a math operation on two numbers.
"""
number_one: Union[int, float] = Field(..., description="First number.")
operation: MathOperation = Field(..., description="Math operation to perform.")
number_two: Union[int, float] = Field(..., description="Second number.")
def run(self):
if self.operation == MathOperation.ADD:
return self.number_one + self.number_two
elif self.operation == MathOperation.SUBTRACT:
return self.number_one - self.number_two
elif self.operation == MathOperation.MULTIPLY:
return self.number_one * self.number_two
elif self.operation == MathOperation.DIVIDE:
return self.number_one / self.number_two
else:
raise ValueError("Unknown operation.")
# Example function to get the weather # Example function to get the weather
def get_current_weather(location, unit): def get_current_weather(location, unit):
"""Get the current weather in a given location""" """Get the current weather in a given location"""
View file
@ -1,15 +1,21 @@
from __future__ import annotations
import inspect import inspect
import json import json
import re
from copy import copy from copy import copy
from inspect import isclass, getdoc from enum import Enum
from types import NoneType from inspect import getdoc, isclass
from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union, get_args, get_origin, get_type_hints
from docstring_parser import parse from docstring_parser import parse
from pydantic import BaseModel, create_model, Field from pydantic import BaseModel, Field, create_model
from typing import Any, Type, List, get_args, get_origin, Tuple, Union, Optional, _GenericAlias
from enum import Enum if TYPE_CHECKING:
from typing import get_type_hints, Callable from types import GenericAlias
import re else:
# python 3.8 compat
from typing import _GenericAlias as GenericAlias
class PydanticDataType(Enum): class PydanticDataType(Enum):
@ -43,7 +49,7 @@ class PydanticDataType(Enum):
SET = "set" SET = "set"
def map_pydantic_type_to_gbnf(pydantic_type: Type[Any]) -> str: def map_pydantic_type_to_gbnf(pydantic_type: type[Any]) -> str:
if isclass(pydantic_type) and issubclass(pydantic_type, str): if isclass(pydantic_type) and issubclass(pydantic_type, str):
return PydanticDataType.STRING.value return PydanticDataType.STRING.value
elif isclass(pydantic_type) and issubclass(pydantic_type, bool): elif isclass(pydantic_type) and issubclass(pydantic_type, bool):
@ -57,22 +63,22 @@ def map_pydantic_type_to_gbnf(pydantic_type: Type[Any]) -> str:
elif isclass(pydantic_type) and issubclass(pydantic_type, BaseModel): elif isclass(pydantic_type) and issubclass(pydantic_type, BaseModel):
return format_model_and_field_name(pydantic_type.__name__) return format_model_and_field_name(pydantic_type.__name__)
elif get_origin(pydantic_type) == list: elif get_origin(pydantic_type) is list:
element_type = get_args(pydantic_type)[0] element_type = get_args(pydantic_type)[0]
return f"{map_pydantic_type_to_gbnf(element_type)}-list" return f"{map_pydantic_type_to_gbnf(element_type)}-list"
elif get_origin(pydantic_type) == set: elif get_origin(pydantic_type) is set:
element_type = get_args(pydantic_type)[0] element_type = get_args(pydantic_type)[0]
return f"{map_pydantic_type_to_gbnf(element_type)}-set" return f"{map_pydantic_type_to_gbnf(element_type)}-set"
elif get_origin(pydantic_type) == Union: elif get_origin(pydantic_type) is Union:
union_types = get_args(pydantic_type) union_types = get_args(pydantic_type)
union_rules = [map_pydantic_type_to_gbnf(ut) for ut in union_types] union_rules = [map_pydantic_type_to_gbnf(ut) for ut in union_types]
return f"union-{'-or-'.join(union_rules)}" return f"union-{'-or-'.join(union_rules)}"
elif get_origin(pydantic_type) == Optional: elif get_origin(pydantic_type) is Optional:
element_type = get_args(pydantic_type)[0] element_type = get_args(pydantic_type)[0]
return f"optional-{map_pydantic_type_to_gbnf(element_type)}" return f"optional-{map_pydantic_type_to_gbnf(element_type)}"
elif isclass(pydantic_type): elif isclass(pydantic_type):
return f"{PydanticDataType.CUSTOM_CLASS.value}-{format_model_and_field_name(pydantic_type.__name__)}" return f"{PydanticDataType.CUSTOM_CLASS.value}-{format_model_and_field_name(pydantic_type.__name__)}"
elif get_origin(pydantic_type) == dict: elif get_origin(pydantic_type) is dict:
key_type, value_type = get_args(pydantic_type) key_type, value_type = get_args(pydantic_type)
return f"custom-dict-key-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(key_type))}-value-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(value_type))}" return f"custom-dict-key-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(key_type))}-value-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(value_type))}"
else: else:
@ -106,7 +112,6 @@ def get_members_structure(cls, rule_name):
return f"{cls.__name__.lower()} ::= " + " | ".join(members) return f"{cls.__name__.lower()} ::= " + " | ".join(members)
if cls.__annotations__ and cls.__annotations__ != {}: if cls.__annotations__ and cls.__annotations__ != {}:
result = f'{rule_name} ::= "{{"' result = f'{rule_name} ::= "{{"'
type_list_rules = []
# Modify this comprehension # Modify this comprehension
members = [ members = [
f' "\\"{name}\\"" ":" {map_pydantic_type_to_gbnf(param_type)}' f' "\\"{name}\\"" ":" {map_pydantic_type_to_gbnf(param_type)}'
@ -116,17 +121,15 @@ def get_members_structure(cls, rule_name):
result += '"," '.join(members) result += '"," '.join(members)
result += ' "}"' result += ' "}"'
return result, type_list_rules return result
elif rule_name == "custom-class-any": if rule_name == "custom-class-any":
result = f"{rule_name} ::= " result = f"{rule_name} ::= "
result += "value" result += "value"
type_list_rules = [] return result
return result, type_list_rules
else:
init_signature = inspect.signature(cls.__init__) init_signature = inspect.signature(cls.__init__)
parameters = init_signature.parameters parameters = init_signature.parameters
result = f'{rule_name} ::= "{{"' result = f'{rule_name} ::= "{{"'
type_list_rules = []
# Modify this comprehension too # Modify this comprehension too
members = [ members = [
f' "\\"{name}\\"" ":" {map_pydantic_type_to_gbnf(param.annotation)}' f' "\\"{name}\\"" ":" {map_pydantic_type_to_gbnf(param.annotation)}'
@ -136,7 +139,7 @@ def get_members_structure(cls, rule_name):
result += '", "'.join(members) result += '", "'.join(members)
result += ' "}"' result += ' "}"'
return result, type_list_rules return result
def regex_to_gbnf(regex_pattern: str) -> str: def regex_to_gbnf(regex_pattern: str) -> str:
@ -269,7 +272,7 @@ def generate_gbnf_float_rules(max_digit=None, min_digit=None, max_precision=None
def generate_gbnf_rule_for_type( def generate_gbnf_rule_for_type(
model_name, field_name, field_type, is_optional, processed_models, created_rules, field_info=None model_name, field_name, field_type, is_optional, processed_models, created_rules, field_info=None
) -> Tuple[str, list]: ) -> tuple[str, list[str]]:
""" """
Generate GBNF rule for a given field type. Generate GBNF rule for a given field type.
@ -283,7 +286,7 @@ def generate_gbnf_rule_for_type(
:param field_info: Additional information about the field (optional). :param field_info: Additional information about the field (optional).
:return: Tuple containing the GBNF type and a list of additional rules. :return: Tuple containing the GBNF type and a list of additional rules.
:rtype: Tuple[str, list] :rtype: tuple[str, list]
""" """
rules = [] rules = []
@ -321,8 +324,7 @@ def generate_gbnf_rule_for_type(
gbnf_type, rules = model_name + "-" + field_name, rules gbnf_type, rules = model_name + "-" + field_name, rules
elif gbnf_type.startswith("custom-class-"): elif gbnf_type.startswith("custom-class-"):
nested_model_rules, field_types = get_members_structure(field_type, gbnf_type) rules.append(get_members_structure(field_type, gbnf_type))
rules.append(nested_model_rules)
elif gbnf_type.startswith("custom-dict-"): elif gbnf_type.startswith("custom-dict-"):
key_type, value_type = get_args(field_type) key_type, value_type = get_args(field_type)
@ -341,14 +343,14 @@ def generate_gbnf_rule_for_type(
union_rules = [] union_rules = []
for union_type in union_types: for union_type in union_types:
if isinstance(union_type, _GenericAlias): if isinstance(union_type, GenericAlias):
union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type( union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type(
model_name, field_name, union_type, False, processed_models, created_rules model_name, field_name, union_type, False, processed_models, created_rules
) )
union_rules.append(union_gbnf_type) union_rules.append(union_gbnf_type)
rules.extend(union_rules_list) rules.extend(union_rules_list)
elif not issubclass(union_type, NoneType): elif not issubclass(union_type, type(None)):
union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type( union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type(
model_name, field_name, union_type, False, processed_models, created_rules model_name, field_name, union_type, False, processed_models, created_rules
) )
@ -424,14 +426,10 @@ def generate_gbnf_rule_for_type(
else: else:
gbnf_type, rules = gbnf_type, [] gbnf_type, rules = gbnf_type, []
if gbnf_type not in created_rules:
return gbnf_type, rules
else:
if gbnf_type in created_rules:
return gbnf_type, rules return gbnf_type, rules
def generate_gbnf_grammar(model: Type[BaseModel], processed_models: set, created_rules: dict) -> (list, bool, bool): def generate_gbnf_grammar(model: type[BaseModel], processed_models: set[type[BaseModel]], created_rules: dict[str, list[str]]) -> tuple[list[str], bool]:
""" """
Generate GBnF Grammar Generate GBnF Grammar
@ -452,7 +450,7 @@ def generate_gbnf_grammar(model: Type[BaseModel], processed_models: set, created
``` ```
""" """
if model in processed_models: if model in processed_models:
return [] return [], False
processed_models.add(model) processed_models.add(model)
model_name = format_model_and_field_name(model.__name__) model_name = format_model_and_field_name(model.__name__)
@ -518,7 +516,7 @@ def generate_gbnf_grammar(model: Type[BaseModel], processed_models: set, created
def generate_gbnf_grammar_from_pydantic_models( def generate_gbnf_grammar_from_pydantic_models(
models: List[Type[BaseModel]], outer_object_name: str = None, outer_object_content: str = None, models: list[type[BaseModel]], outer_object_name: str | None = None, outer_object_content: str | None = None,
list_of_outputs: bool = False list_of_outputs: bool = False
) -> str: ) -> str:
""" """
@ -528,7 +526,7 @@ def generate_gbnf_grammar_from_pydantic_models(
* grammar. * grammar.
Args: Args:
models (List[Type[BaseModel]]): A list of Pydantic models to generate the grammar from. models (list[type[BaseModel]]): A list of Pydantic models to generate the grammar from.
outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling. outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling.
outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling. outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling.
list_of_outputs (str, optional): Allows a list of output objects list_of_outputs (str, optional): Allows a list of output objects
@ -543,9 +541,9 @@ def generate_gbnf_grammar_from_pydantic_models(
# root ::= UserModel | PostModel # root ::= UserModel | PostModel
# ... # ...
""" """
processed_models = set() processed_models: set[type[BaseModel]] = set()
all_rules = [] all_rules = []
created_rules = {} created_rules: dict[str, list[str]] = {}
if outer_object_name is None: if outer_object_name is None:
for model in models: for model in models:
model_rules, _ = generate_gbnf_grammar(model, processed_models, created_rules) model_rules, _ = generate_gbnf_grammar(model, processed_models, created_rules)
@ -608,7 +606,7 @@ def get_primitive_grammar(grammar):
Returns: Returns:
str: GBNF primitive grammar string. str: GBNF primitive grammar string.
""" """
type_list = [] type_list: list[type[object]] = []
if "string-list" in grammar: if "string-list" in grammar:
type_list.append(str) type_list.append(str)
if "boolean-list" in grammar: if "boolean-list" in grammar:
@ -666,14 +664,14 @@ triple-quotes ::= "'''" """
def generate_markdown_documentation( def generate_markdown_documentation(
pydantic_models: List[Type[BaseModel]], model_prefix="Model", fields_prefix="Fields", pydantic_models: list[type[BaseModel]], model_prefix="Model", fields_prefix="Fields",
documentation_with_field_description=True documentation_with_field_description=True
) -> str: ) -> str:
""" """
Generate markdown documentation for a list of Pydantic models. Generate markdown documentation for a list of Pydantic models.
Args: Args:
pydantic_models (List[Type[BaseModel]]): List of Pydantic model classes. pydantic_models (list[type[BaseModel]]): list of Pydantic model classes.
model_prefix (str): Prefix for the model section. model_prefix (str): Prefix for the model section.
fields_prefix (str): Prefix for the fields section. fields_prefix (str): Prefix for the fields section.
documentation_with_field_description (bool): Include field descriptions in the documentation. documentation_with_field_description (bool): Include field descriptions in the documentation.
@ -731,7 +729,7 @@ def generate_markdown_documentation(
def generate_field_markdown( def generate_field_markdown(
field_name: str, field_type: Type[Any], model: Type[BaseModel], depth=1, field_name: str, field_type: type[Any], model: type[BaseModel], depth=1,
documentation_with_field_description=True documentation_with_field_description=True
) -> str: ) -> str:
""" """
@ -739,8 +737,8 @@ def generate_field_markdown(
Args: Args:
field_name (str): Name of the field. field_name (str): Name of the field.
field_type (Type[Any]): Type of the field. field_type (type[Any]): Type of the field.
model (Type[BaseModel]): Pydantic model class. model (type[BaseModel]): Pydantic model class.
depth (int): Indentation depth in the documentation. depth (int): Indentation depth in the documentation.
documentation_with_field_description (bool): Include field descriptions in the documentation. documentation_with_field_description (bool): Include field descriptions in the documentation.
@ -798,7 +796,7 @@ def generate_field_markdown(
return field_text return field_text
def format_json_example(example: dict, depth: int) -> str: def format_json_example(example: dict[str, Any], depth: int) -> str:
""" """
Format a JSON example into a readable string with indentation. Format a JSON example into a readable string with indentation.
@ -819,14 +817,14 @@ def format_json_example(example: dict, depth: int) -> str:
def generate_text_documentation( def generate_text_documentation(
pydantic_models: List[Type[BaseModel]], model_prefix="Model", fields_prefix="Fields", pydantic_models: list[type[BaseModel]], model_prefix="Model", fields_prefix="Fields",
documentation_with_field_description=True documentation_with_field_description=True
) -> str: ) -> str:
""" """
Generate text documentation for a list of Pydantic models. Generate text documentation for a list of Pydantic models.
Args: Args:
pydantic_models (List[Type[BaseModel]]): List of Pydantic model classes. pydantic_models (list[type[BaseModel]]): List of Pydantic model classes.
model_prefix (str): Prefix for the model section. model_prefix (str): Prefix for the model section.
fields_prefix (str): Prefix for the fields section. fields_prefix (str): Prefix for the fields section.
documentation_with_field_description (bool): Include field descriptions in the documentation. documentation_with_field_description (bool): Include field descriptions in the documentation.
@ -885,7 +883,7 @@ def generate_text_documentation(
def generate_field_text( def generate_field_text(
field_name: str, field_type: Type[Any], model: Type[BaseModel], depth=1, field_name: str, field_type: type[Any], model: type[BaseModel], depth=1,
documentation_with_field_description=True documentation_with_field_description=True
) -> str: ) -> str:
""" """
@ -893,8 +891,8 @@ def generate_field_text(
Args: Args:
field_name (str): Name of the field. field_name (str): Name of the field.
field_type (Type[Any]): Type of the field. field_type (type[Any]): Type of the field.
model (Type[BaseModel]): Pydantic model class. model (type[BaseModel]): Pydantic model class.
depth (int): Indentation depth in the documentation. depth (int): Indentation depth in the documentation.
documentation_with_field_description (bool): Include field descriptions in the documentation. documentation_with_field_description (bool): Include field descriptions in the documentation.
@ -1017,8 +1015,8 @@ def generate_and_save_gbnf_grammar_and_documentation(
pydantic_model_list, pydantic_model_list,
grammar_file_path="./generated_grammar.gbnf", grammar_file_path="./generated_grammar.gbnf",
documentation_file_path="./generated_grammar_documentation.md", documentation_file_path="./generated_grammar_documentation.md",
outer_object_name: str = None, outer_object_name: str | None = None,
outer_object_content: str = None, outer_object_content: str | None = None,
model_prefix: str = "Output Model", model_prefix: str = "Output Model",
fields_prefix: str = "Output Fields", fields_prefix: str = "Output Fields",
list_of_outputs: bool = False, list_of_outputs: bool = False,
@ -1053,8 +1051,8 @@ def generate_and_save_gbnf_grammar_and_documentation(
def generate_gbnf_grammar_and_documentation( def generate_gbnf_grammar_and_documentation(
pydantic_model_list, pydantic_model_list,
outer_object_name: str = None, outer_object_name: str | None = None,
outer_object_content: str = None, outer_object_content: str | None = None,
model_prefix: str = "Output Model", model_prefix: str = "Output Model",
fields_prefix: str = "Output Fields", fields_prefix: str = "Output Fields",
list_of_outputs: bool = False, list_of_outputs: bool = False,
@ -1086,9 +1084,9 @@ def generate_gbnf_grammar_and_documentation(
def generate_gbnf_grammar_and_documentation_from_dictionaries( def generate_gbnf_grammar_and_documentation_from_dictionaries(
dictionaries: List[dict], dictionaries: list[dict[str, Any]],
outer_object_name: str = None, outer_object_name: str | None = None,
outer_object_content: str = None, outer_object_content: str | None = None,
model_prefix: str = "Output Model", model_prefix: str = "Output Model",
fields_prefix: str = "Output Fields", fields_prefix: str = "Output Fields",
list_of_outputs: bool = False, list_of_outputs: bool = False,
@ -1098,7 +1096,7 @@ def generate_gbnf_grammar_and_documentation_from_dictionaries(
Generate GBNF grammar and documentation from a list of dictionaries. Generate GBNF grammar and documentation from a list of dictionaries.
Args: Args:
dictionaries (List[dict]): List of dictionaries representing Pydantic models. dictionaries (list[dict]): List of dictionaries representing Pydantic models.
outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling. outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling.
outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling. outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling.
model_prefix (str): Prefix for the model section in the documentation. model_prefix (str): Prefix for the model section in the documentation.
@ -1120,7 +1118,7 @@ def generate_gbnf_grammar_and_documentation_from_dictionaries(
return grammar, documentation return grammar, documentation
def create_dynamic_model_from_function(func: Callable): def create_dynamic_model_from_function(func: Callable[..., Any]):
""" """
Creates a dynamic Pydantic model from a given function's type hints and adds the function as a 'run' method. Creates a dynamic Pydantic model from a given function's type hints and adds the function as a 'run' method.
@ -1135,6 +1133,7 @@ def create_dynamic_model_from_function(func: Callable):
sig = inspect.signature(func) sig = inspect.signature(func)
# Parse the docstring # Parse the docstring
assert func.__doc__ is not None
docstring = parse(func.__doc__) docstring = parse(func.__doc__)
dynamic_fields = {} dynamic_fields = {}
@ -1157,7 +1156,6 @@ def create_dynamic_model_from_function(func: Callable):
f"Parameter '{param.name}' in function '{func.__name__}' lacks a description in the docstring") f"Parameter '{param.name}' in function '{func.__name__}' lacks a description in the docstring")
# Add parameter details to the schema # Add parameter details to the schema
param_doc = next((d for d in docstring.params if d.arg_name == param.name), None)
param_docs.append((param.name, param_doc)) param_docs.append((param.name, param_doc))
if param.default == inspect.Parameter.empty: if param.default == inspect.Parameter.empty:
default_value = ... default_value = ...
@ -1166,10 +1164,10 @@ def create_dynamic_model_from_function(func: Callable):
dynamic_fields[param.name] = ( dynamic_fields[param.name] = (
param.annotation if param.annotation != inspect.Parameter.empty else str, default_value) param.annotation if param.annotation != inspect.Parameter.empty else str, default_value)
# Creating the dynamic model # Creating the dynamic model
dynamic_model = create_model(f"{func.__name__}", **dynamic_fields) dynamic_model = create_model(f"{func.__name__}", **dynamic_fields) # type: ignore[call-overload]
for param_doc in param_docs: for name, param_doc in param_docs:
dynamic_model.model_fields[param_doc[0]].description = param_doc[1].description dynamic_model.model_fields[name].description = param_doc.description
dynamic_model.__doc__ = docstring.short_description dynamic_model.__doc__ = docstring.short_description
@ -1182,16 +1180,16 @@ def create_dynamic_model_from_function(func: Callable):
return dynamic_model return dynamic_model
def add_run_method_to_dynamic_model(model: Type[BaseModel], func: Callable): def add_run_method_to_dynamic_model(model: type[BaseModel], func: Callable[..., Any]):
""" """
Add a 'run' method to a dynamic Pydantic model, using the provided function. Add a 'run' method to a dynamic Pydantic model, using the provided function.
Args: Args:
model (Type[BaseModel]): Dynamic Pydantic model class. model (type[BaseModel]): Dynamic Pydantic model class.
func (Callable): Function to be added as a 'run' method to the model. func (Callable): Function to be added as a 'run' method to the model.
Returns: Returns:
Type[BaseModel]: Pydantic model class with the added 'run' method. type[BaseModel]: Pydantic model class with the added 'run' method.
""" """
def run_method_wrapper(self): def run_method_wrapper(self):
@ -1204,15 +1202,15 @@ def add_run_method_to_dynamic_model(model: Type[BaseModel], func: Callable):
return model return model
def create_dynamic_models_from_dictionaries(dictionaries: List[dict]): def create_dynamic_models_from_dictionaries(dictionaries: list[dict[str, Any]]):
""" """
Create a list of dynamic Pydantic model classes from a list of dictionaries. Create a list of dynamic Pydantic model classes from a list of dictionaries.
Args: Args:
dictionaries (List[dict]): List of dictionaries representing model structures. dictionaries (list[dict]): List of dictionaries representing model structures.
Returns: Returns:
List[Type[BaseModel]]: List of generated dynamic Pydantic model classes. list[type[BaseModel]]: List of generated dynamic Pydantic model classes.
""" """
dynamic_models = [] dynamic_models = []
for func in dictionaries: for func in dictionaries:
@ -1249,7 +1247,7 @@ def list_to_enum(enum_name, values):
return Enum(enum_name, {value: value for value in values}) return Enum(enum_name, {value: value for value in values})
def convert_dictionary_to_pydantic_model(dictionary: dict, model_name: str = "CustomModel") -> Type[BaseModel]: def convert_dictionary_to_pydantic_model(dictionary: dict[str, Any], model_name: str = "CustomModel") -> type[Any]:
""" """
Convert a dictionary to a Pydantic model class. Convert a dictionary to a Pydantic model class.
@ -1258,9 +1256,9 @@ def convert_dictionary_to_pydantic_model(dictionary: dict, model_name: str = "Cu
model_name (str): Name of the generated Pydantic model. model_name (str): Name of the generated Pydantic model.
Returns: Returns:
Type[BaseModel]: Generated Pydantic model class. type[BaseModel]: Generated Pydantic model class.
""" """
fields = {} fields: dict[str, Any] = {}
if "properties" in dictionary: if "properties" in dictionary:
for field_name, field_data in dictionary.get("properties", {}).items(): for field_name, field_data in dictionary.get("properties", {}).items():
@ -1277,7 +1275,7 @@ def convert_dictionary_to_pydantic_model(dictionary: dict, model_name: str = "Cu
if items != {}: if items != {}:
array = {"properties": items} array = {"properties": items}
array_type = convert_dictionary_to_pydantic_model(array, f"{model_name}_{field_name}_items") array_type = convert_dictionary_to_pydantic_model(array, f"{model_name}_{field_name}_items")
fields[field_name] = (List[array_type], ...) fields[field_name] = (List[array_type], ...) # type: ignore[valid-type]
else: else:
fields[field_name] = (list, ...) fields[field_name] = (list, ...)
elif field_type == "object": elif field_type == "object":

View file

@ -26,6 +26,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", }, { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", }, { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" }, { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
{ "Q3_K_XS",LLAMA_FTYPE_MOSTLY_Q3_K_XS,"3-bit extra small quantization" , },
{ "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", }, { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
{ "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", }, { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
{ "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", }, { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },

View file

@ -1,7 +1,7 @@
set(TARGET server) set(TARGET server)
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON) option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}) include_directories(${CMAKE_CURRENT_SOURCE_DIR})
add_executable(${TARGET} server.cpp json.hpp httplib.h) add_executable(${TARGET} server.cpp oai.hpp utils.hpp json.hpp httplib.h)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_compile_definitions(${TARGET} PRIVATE target_compile_definitions(${TARGET} PRIVATE
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}> SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>

View file

@ -30,7 +30,8 @@ Command line options:
- `-cb`, `--cont-batching`: enable continuous batching (a.k.a dynamic batching) (default: disabled) - `-cb`, `--cont-batching`: enable continuous batching (a.k.a dynamic batching) (default: disabled)
- `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load a system prompt (initial prompt of all slots); this is useful for chat applications. [See more](#change-system-prompt-on-runtime) - `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load a system prompt (initial prompt of all slots); this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA. - `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend (default: 1 = disabled), used together with group attention width `--grp-attn-w`
- `--grp-attn-w`: Set the group attention width to extend context size through self-extend (default: 512), used together with group attention factor `--grp-attn-n`
## Build ## Build
server is built alongside everything else from the root of the project server is built alongside everything else from the root of the project
@ -65,6 +66,14 @@ server.exe -m models\7B\ggml-model.gguf -c 2048
The above command will start a server that by default listens on `127.0.0.1:8080`. The above command will start a server that by default listens on `127.0.0.1:8080`.
You can consume the endpoints with Postman or NodeJS with axios library. You can visit the web front end at the same url. You can consume the endpoints with Postman or NodeJS with axios library. You can visit the web front end at the same url.
### Docker:
```bash
docker run -p 8080:8080 -v /path/to/models:/models ggerganov/llama.cpp:server -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080
# or, with CUDA:
docker run -p 8080:8080 -v /path/to/models:/models --gpus all ggerganov/llama.cpp:server-cuda -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080 --n-gpu-layers 99
```
## Testing with CURL ## Testing with CURL
Using [curl](https://curl.se/). On Windows `curl.exe` should be available in the base OS. Using [curl](https://curl.se/). On Windows `curl.exe` should be available in the base OS.

208
examples/server/oai.hpp Normal file
View file

@ -0,0 +1,208 @@
#pragma once
#include <string>
#include <vector>
#include <set>
#include <mutex>
#include <condition_variable>
#include <unordered_map>
#include "json.hpp"
#include "utils.hpp"
#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
using json = nlohmann::json;
inline static json oaicompat_completion_params_parse(
const json &body /* openai api json semantics */)
{
json llama_params;
llama_params["__oaicompat"] = true;
// Map OpenAI parameters to llama.cpp parameters
//
// For parameters that are defined by the OpenAI documentation (e.g.
// temperature), we explicitly specify OpenAI's intended default; we
// need to do that because sometimes OpenAI disagrees with llama.cpp
//
// https://platform.openai.com/docs/api-reference/chat/create
llama_sampling_params default_sparams;
llama_params["model"] = json_value(body, "model", std::string("unknown"));
llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
llama_params["temperature"] = json_value(body, "temperature", 0.0);
llama_params["top_k"] = json_value(body, "top_k", default_sparams.top_k);
llama_params["top_p"] = json_value(body, "top_p", 1.0);
llama_params["n_predict"] = json_value(body, "max_tokens", -1);
llama_params["logit_bias"] = json_value(body, "logit_bias",json::object());
llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0);
llama_params["seed"] = json_value(body, "seed", LLAMA_DEFAULT_SEED);
llama_params["stream"] = json_value(body, "stream", false);
llama_params["mirostat"] = json_value(body, "mirostat", default_sparams.mirostat);
llama_params["mirostat_tau"] = json_value(body, "mirostat_tau", default_sparams.mirostat_tau);
llama_params["mirostat_eta"] = json_value(body, "mirostat_eta", default_sparams.mirostat_eta);
llama_params["penalize_nl"] = json_value(body, "penalize_nl", default_sparams.penalize_nl);
llama_params["typical_p"] = json_value(body, "typical_p", default_sparams.typical_p);
llama_params["repeat_last_n"] = json_value(body, "repeat_last_n", default_sparams.penalty_last_n);
llama_params["ignore_eos"] = json_value(body, "ignore_eos", false);
llama_params["tfs_z"] = json_value(body, "tfs_z", default_sparams.tfs_z);
if (body.count("grammar") != 0) {
llama_params["grammar"] = json_value(body, "grammar", json::object());
}
// Handle 'stop' field
if (body.contains("stop") && body["stop"].is_string()) {
llama_params["stop"] = json::array({body["stop"].get<std::string>()});
} else {
llama_params["stop"] = json_value(body, "stop", json::array());
}
// Ensure the ChatML-specific end sequence is among the stop words
llama_params["stop"].push_back("<|im_end|>");
return llama_params;
}
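As a hedged illustration (not part of the diff), this is roughly how the mapping behaves for a typical OpenAI-style request; the snippet assumes it is compiled inside the server target so that `oai.hpp` and the headers it pulls in resolve:

```cpp
// Hypothetical request body in OpenAI chat-completion format.
json body = {
    {"model", "gpt-3.5-turbo"},
    {"messages", json::array({
        json{{"role", "user"}, {"content", "Write a haiku about autumn."}}
    })},
    {"temperature", 0.8},
    {"max_tokens", 64},
    {"stream", true},
};

json llama_params = oaicompat_completion_params_parse(body);
// llama_params["prompt"]      -> ChatML string built from "messages" by format_chatml()
// llama_params["n_predict"]   -> 64       (renamed from "max_tokens")
// llama_params["temperature"] -> 0.8      (0.0 is used when the field is absent)
// llama_params["stop"]        -> ["<|im_end|>"]  (the ChatML end sequence is always appended)
```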
inline static json format_final_response_oaicompat(const json &request, const task_result &response, bool streaming = false)
{
json result = response.result_json;
bool stopped_word = result.count("stopped_word") != 0;
bool stopped_eos = json_value(result, "stopped_eos", false);
int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
std::string content = json_value(result, "content", std::string(""));
std::string finish_reason = "length";
if (stopped_word || stopped_eos) {
finish_reason = "stop";
}
json choices =
streaming ? json::array({json{{"finish_reason", finish_reason},
{"index", 0},
{"delta", json::object()}}})
: json::array({json{{"finish_reason", finish_reason},
{"index", 0},
{"message", json{{"content", content},
{"role", "assistant"}}}}});
std::time_t t = std::time(0);
json res =
json{{"choices", choices},
{"created", t},
{"model",
json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
{"object", streaming ? "chat.completion.chunk" : "chat.completion"},
{"usage",
json{{"completion_tokens", num_tokens_predicted},
{"prompt_tokens", num_prompt_tokens},
{"total_tokens", num_tokens_predicted + num_prompt_tokens}}},
{"id", gen_chatcmplid()}};
if (server_verbose) {
res["__verbose"] = result;
}
if (result.contains("completion_probabilities")) {
res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
}
return res;
}
// the return value is a vector, as there is one case where we might need to generate two responses
inline static std::vector<json> format_partial_response_oaicompat(const task_result &response) {
json result = response.result_json;
if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
return std::vector<json>({response.result_json});
}
bool first = json_value(result, "oaicompat_token_ctr", 0) == 0;
std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
bool stopped_word = json_value(result, "stopped_word", false);
bool stopped_eos = json_value(result, "stopped_eos", false);
bool stopped_limit = json_value(result, "stopped_limit", false);
std::string content = json_value(result, "content", std::string(""));
std::string finish_reason;
if (stopped_word || stopped_eos) {
finish_reason = "stop";
}
if (stopped_limit) {
finish_reason = "length";
}
std::time_t t = std::time(0);
json choices;
if (!finish_reason.empty()) {
choices = json::array({json{{"finish_reason", finish_reason},
{"index", 0},
{"delta", json::object()}}});
} else {
if (first) {
if (content.empty()) {
choices = json::array({json{{"finish_reason", nullptr},
{"index", 0},
{"delta", json{{"role", "assistant"}}}}});
} else {
// We have to send this as two updates to conform to openai behavior
json initial_ret = json{{"choices", json::array({json{
{"finish_reason", nullptr},
{"index", 0},
{"delta", json{
{"role", "assistant"}
}}}})},
{"created", t},
{"id", gen_chatcmplid()},
{"model", modelname},
{"object", "chat.completion.chunk"}};
json second_ret = json{
{"choices", json::array({json{{"finish_reason", nullptr},
{"index", 0},
{"delta", json{
{"content", content}}}
}})},
{"created", t},
{"id", gen_chatcmplid()},
{"model", modelname},
{"object", "chat.completion.chunk"}};
return std::vector<json>({initial_ret, second_ret});
}
} else {
// Some idiosyncrasy in the task processing logic makes several trailing calls
// with empty content; we ignore these at the callee site.
if (content.empty()) {
return std::vector<json>({json::object()});
}
choices = json::array({json{
{"finish_reason", nullptr},
{"index", 0},
{"delta",
json{
{"content", content},
}},
}});
}
}
json ret = json{{"choices", choices},
{"created", t},
{"id", gen_chatcmplid()},
{"model", modelname},
{"object", "chat.completion.chunk"}};
return std::vector<json>({ret});
}
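For illustration (the field layout mirrors the function above; the content string and omitted fields are assumptions), the chunks produced for the first non-empty token look roughly like this:

```cpp
// First call with non-empty content yields two chunks: the role first, then the content.
json first_chunk = {
    {"object", "chat.completion.chunk"},
    {"choices", json::array({ json{
        {"index", 0},
        {"finish_reason", nullptr},
        {"delta", json{{"role", "assistant"}}}
    } })},
};
json second_chunk = {
    {"object", "chat.completion.chunk"},
    {"choices", json::array({ json{
        {"index", 0},
        {"finish_reason", nullptr},
        {"delta", json{{"content", "Hello"}}}
    } })},
};
// Each chunk also carries "id", "created" and "model" (omitted here for brevity).
// Later calls put only the new text in "delta"; the final chunk has an empty "delta"
// and "finish_reason" set to "stop" or "length".
```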

File diff suppressed because it is too large

508
examples/server/utils.hpp Normal file
View file

@ -0,0 +1,508 @@
#pragma once
#include <string>
#include <vector>
#include <set>
#include <mutex>
#include <condition_variable>
#include <unordered_map>
#include "json.hpp"
#include "../llava/clip.h"
using json = nlohmann::json;
extern bool server_verbose;
#ifndef SERVER_VERBOSE
#define SERVER_VERBOSE 1
#endif
#if SERVER_VERBOSE != 1
#define LOG_VERBOSE(MSG, ...)
#else
#define LOG_VERBOSE(MSG, ...) \
do \
{ \
if (server_verbose) \
{ \
server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \
} \
} while (0)
#endif
#define LOG_ERROR( MSG, ...) server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
//
// parallel
//
enum server_state {
SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
SERVER_STATE_READY, // Server is ready and model is loaded
SERVER_STATE_ERROR // An error occurred, load_model failed
};
enum task_type {
TASK_TYPE_COMPLETION,
TASK_TYPE_CANCEL,
TASK_TYPE_NEXT_RESPONSE
};
struct task_server {
int id = -1; // to be filled by llama_server_queue
int target_id;
task_type type;
json data;
bool infill_mode = false;
bool embedding_mode = false;
int multitask_id = -1;
};
struct task_result {
int id;
int multitask_id = -1;
bool stop;
bool error;
json result_json;
};
struct task_multi {
int id;
std::set<int> subtasks_remaining{};
std::vector<task_result> results{};
};
// TODO: can become bool if we don't find a use for more states
enum slot_state
{
IDLE,
PROCESSING,
};
enum slot_command
{
NONE,
LOAD_PROMPT,
RELEASE,
};
struct slot_params
{
bool stream = true;
bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
uint32_t seed = -1; // RNG seed
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_predict = -1; // new tokens to predict
std::vector<std::string> antiprompt;
json input_prefix;
json input_suffix;
};
struct slot_image
{
int32_t id;
bool request_encode_image = false;
float * image_embedding = nullptr;
int32_t image_tokens = 0;
clip_image_u8 * img_data;
std::string prefix_prompt; // prompt that comes before this image
};
// completion token output with probabilities
struct completion_token_output
{
struct token_prob
{
llama_token tok;
float prob;
};
std::vector<token_prob> probs;
llama_token tok;
std::string text_to_send;
};
static inline void server_log(const char *level, const char *function, int line,
const char *message, const nlohmann::ordered_json &extra)
{
nlohmann::ordered_json log
{
{"timestamp", time(nullptr)},
{"level", level},
{"function", function},
{"line", line},
{"message", message},
};
if (!extra.empty())
{
log.merge_patch(extra);
}
const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
printf("%.*s\n", (int)str.size(), str.data());
fflush(stdout);
}
//
// server utils
//
template <typename T>
static T json_value(const json &body, const std::string &key, const T &default_value)
{
// Fallback null to default value
return body.contains(key) && !body.at(key).is_null()
? body.value(key, default_value)
: default_value;
}
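A hedged illustration of the fallback behaviour (the keys and values below are made up for the example):

```cpp
json body = {{"top_k", 40}, {"temperature", nullptr}};
int    top_k = json_value(body, "top_k",       50);   // 40   (key present and not null)
double temp  = json_value(body, "temperature", 0.8);  // 0.8  (null falls back to the default)
int    n     = json_value(body, "n_predict",   -1);   // -1   (key missing)
```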
inline std::string format_chatml(std::vector<json> messages)
{
std::ostringstream chatml_msgs;
for (auto it = messages.begin(); it != messages.end(); ++it) {
chatml_msgs << "<|im_start|>"
<< json_value(*it, "role", std::string("user")) << '\n';
chatml_msgs << json_value(*it, "content", std::string(""))
<< "<|im_end|>\n";
}
chatml_msgs << "<|im_start|>assistant" << '\n';
return chatml_msgs.str();
}
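For reference, a hedged sketch of what this helper produces (assumes this header and its llama.cpp dependencies are on the include path; the messages are illustrative):

```cpp
std::vector<json> messages = {
    json{{"role", "system"}, {"content", "You are a helpful assistant."}},
    json{{"role", "user"},   {"content", "Hello!"}},
};
std::string prompt = format_chatml(messages);
// prompt now holds:
//   <|im_start|>system
//   You are a helpful assistant.<|im_end|>
//   <|im_start|>user
//   Hello!<|im_end|>
//   <|im_start|>assistant
```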
//
// work queue utils
//
struct llama_server_queue {
int id = 0;
std::mutex mutex_tasks;
// queues
std::vector<task_server> queue_tasks;
std::vector<task_server> queue_tasks_deferred;
std::vector<task_multi> queue_multitasks;
std::condition_variable condition_tasks;
// callback functions
std::function<void(task_server&)> callback_new_task;
std::function<void(task_multi&)> callback_finish_multitask;
std::function<void(void)> callback_all_task_finished;
// Add a new task to the end of the queue
int post(task_server task) {
std::unique_lock<std::mutex> lock(mutex_tasks);
if (task.id == -1) {
task.id = id++;
}
queue_tasks.push_back(std::move(task));
condition_tasks.notify_one();
return task.id;
}
// Add a new task, but defer until one slot is available
void defer(task_server task) {
std::unique_lock<std::mutex> lock(mutex_tasks);
queue_tasks_deferred.push_back(std::move(task));
}
// Get the next id for creating a new task
int get_new_id() {
std::unique_lock<std::mutex> lock(mutex_tasks);
return id++;
}
// Register function to process a new task
void on_new_task(std::function<void(task_server&)> callback) {
callback_new_task = callback;
}
// Register function to process a multitask
void on_finish_multitask(std::function<void(task_multi&)> callback) {
callback_finish_multitask = callback;
}
// Register the function to be called when the batch of tasks is finished
void on_all_tasks_finished(std::function<void(void)> callback) {
callback_all_task_finished = callback;
}
// Call when the state of one slot is changed
void notify_slot_changed() {
// move deferred tasks back to main loop
std::unique_lock<std::mutex> lock(mutex_tasks);
for (auto & task : queue_tasks_deferred) {
queue_tasks.push_back(std::move(task));
}
queue_tasks_deferred.clear();
}
// Start the main loop. This call is blocking
[[noreturn]]
void start_loop() {
while (true) {
// new task arrived
LOG_VERBOSE("have new task", {});
{
while (true)
{
std::unique_lock<std::mutex> lock(mutex_tasks);
if (queue_tasks.empty()) {
lock.unlock();
break;
}
task_server task = queue_tasks.front();
queue_tasks.erase(queue_tasks.begin());
lock.unlock();
LOG_VERBOSE("callback_new_task", {});
callback_new_task(task);
}
LOG_VERBOSE("callback_all_task_finished", {});
// process and update all the multitasks
auto queue_iterator = queue_multitasks.begin();
while (queue_iterator != queue_multitasks.end())
{
if (queue_iterator->subtasks_remaining.empty())
{
// all subtasks done == multitask is done
task_multi current_multitask = *queue_iterator;
callback_finish_multitask(current_multitask);
// remove this multitask
queue_iterator = queue_multitasks.erase(queue_iterator);
}
else
{
++queue_iterator;
}
}
// all tasks in the current loop are finished
callback_all_task_finished();
}
LOG_VERBOSE("wait for new task", {});
// wait for new task
{
std::unique_lock<std::mutex> lock(mutex_tasks);
if (queue_tasks.empty()) {
condition_tasks.wait(lock, [&]{
return !queue_tasks.empty();
});
}
}
}
}
//
// functions to manage multitasks
//
// add a multitask by specifying the ids of all its subtasks (a subtask is a task_server)
void add_multitask(int multitask_id, std::vector<int>& sub_ids)
{
std::lock_guard<std::mutex> lock(mutex_tasks);
task_multi multi;
multi.id = multitask_id;
std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
queue_multitasks.push_back(multi);
}
// update the remaining subtasks, appending results to the multitask
void update_multitask(int multitask_id, int subtask_id, task_result& result)
{
std::lock_guard<std::mutex> lock(mutex_tasks);
for (auto& multitask : queue_multitasks)
{
if (multitask.id == multitask_id)
{
multitask.subtasks_remaining.erase(subtask_id);
multitask.results.push_back(result);
}
}
}
};
struct llama_server_response {
typedef std::function<void(int, int, task_result&)> callback_multitask_t;
callback_multitask_t callback_update_multitask;
// for keeping track of all tasks waiting for the result
std::set<int> waiting_task_ids;
// the main result queue
std::vector<task_result> queue_results;
std::mutex mutex_results;
std::condition_variable condition_results;
void add_waiting_task_id(int task_id) {
std::unique_lock<std::mutex> lock(mutex_results);
waiting_task_ids.insert(task_id);
}
void remove_waiting_task_id(int task_id) {
std::unique_lock<std::mutex> lock(mutex_results);
waiting_task_ids.erase(task_id);
}
// This function blocks the thread until there is a response for this task_id
task_result recv(int task_id) {
while (true)
{
std::unique_lock<std::mutex> lock(mutex_results);
condition_results.wait(lock, [&]{
return !queue_results.empty();
});
LOG_VERBOSE("condition_results unblock", {});
for (int i = 0; i < (int) queue_results.size(); i++)
{
if (queue_results[i].id == task_id)
{
assert(queue_results[i].multitask_id == -1);
task_result res = queue_results[i];
queue_results.erase(queue_results.begin() + i);
return res;
}
}
}
// should never reach here
}
// Register the function to update multitask
void on_multitask_update(callback_multitask_t callback) {
callback_update_multitask = callback;
}
// Send a new result to a waiting task_id
void send(task_result result) {
std::unique_lock<std::mutex> lock(mutex_results);
LOG_VERBOSE("send new result", {});
for (auto& task_id : waiting_task_ids) {
// LOG_TEE("waiting task id %i \n", task_id);
// for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
if (result.multitask_id == task_id)
{
LOG_VERBOSE("callback_update_multitask", {});
callback_update_multitask(task_id, result.id, result);
continue;
}
if (result.id == task_id)
{
LOG_VERBOSE("queue_results.push_back", {});
queue_results.push_back(result);
condition_results.notify_one();
return;
}
}
}
};
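To make the intended flow between the queue and the response channel concrete, here is a minimal usage sketch (not part of the diff); it assumes it is compiled inside the llama.cpp server target so that `utils.hpp` and the headers it pulls in resolve, and the function name is purely illustrative:

```cpp
#include <thread>
#include "utils.hpp"

static void example_queue_usage() {
    llama_server_queue    queue;
    llama_server_response response;

    // Worker side: turn each incoming task into a result and publish it.
    queue.on_new_task([&](task_server & task) {
        task_result res;
        res.id          = task.id;
        res.stop        = true;
        res.error       = false;
        res.result_json = json{{"content", "done"}};
        response.send(res);
    });
    queue.on_finish_multitask([](task_multi & multi) { (void) multi; /* aggregate multi.results here */ });
    queue.on_all_tasks_finished([]() { /* e.g. run the shared decode batch */ });

    std::thread loop([&]() { queue.start_loop(); }); // start_loop() blocks, so run it on its own thread
    loop.detach();

    // Client side: reserve an id, register it as waiting, then post the task.
    const int id = queue.get_new_id();
    response.add_waiting_task_id(id);

    task_server task;
    task.id        = id;
    task.target_id = -1;
    task.type      = TASK_TYPE_COMPLETION;
    task.data      = json{{"prompt", "Hello"}};
    queue.post(task);

    task_result result = response.recv(id);           // blocks until send() delivers this id
    response.remove_waiting_task_id(id);
    (void) result;
}
```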
//
// base64 utils (TODO: move to common in the future)
//
static const std::string base64_chars =
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"abcdefghijklmnopqrstuvwxyz"
"0123456789+/";
static inline bool is_base64(uint8_t c)
{
return (isalnum(c) || (c == '+') || (c == '/'));
}
static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string)
{
int i = 0;
int j = 0;
int in_ = 0;
int in_len = encoded_string.size();
uint8_t char_array_4[4];
uint8_t char_array_3[3];
std::vector<uint8_t> ret;
while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_]))
{
char_array_4[i++] = encoded_string[in_]; in_++;
if (i == 4)
{
for (i = 0; i <4; i++)
{
char_array_4[i] = base64_chars.find(char_array_4[i]);
}
char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
for (i = 0; (i < 3); i++)
{
ret.push_back(char_array_3[i]);
}
i = 0;
}
}
if (i)
{
for (j = i; j <4; j++)
{
char_array_4[j] = 0;
}
for (j = 0; j <4; j++)
{
char_array_4[j] = base64_chars.find(char_array_4[j]);
}
char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
for (j = 0; (j < i - 1); j++)
{
ret.push_back(char_array_3[j]);
}
}
return ret;
}
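A quick hedged usage example of the decoder above (the input string is only illustrative):

```cpp
std::vector<uint8_t> bytes = base64_decode("aGVsbG8=");   // "aGVsbG8=" is the base64 encoding of "hello"
std::string text(bytes.begin(), bytes.end());             // "hello"
```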
//
// random string / id
//
static std::string random_string()
{
static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
std::random_device rd;
std::mt19937 generator(rd());
std::string result(32, ' ');
for (int i = 0; i < 32; ++i) {
result[i] = str[generator() % str.size()];
}
return result;
}
static std::string gen_chatcmplid()
{
std::stringstream chatcmplid;
chatcmplid << "chatcmpl-" << random_string();
return chatcmplid.str();
}

View file

@ -0,0 +1,9 @@
# MIT license
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: MIT
set(TARGET ls-sycl-device)
add_executable(${TARGET} ls-sycl-device.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

47
examples/sycl/README.md Normal file
View file

@ -0,0 +1,47 @@
# llama.cpp/example/sycl
This example program provides tools for llama.cpp with SYCL on Intel GPUs.
## Tool
|Tool Name| Function|Status|
|-|-|-|
|ls-sycl-device|List all SYCL devices with ID, compute capability, max work group size, etc.|Supported|
### ls-sycl-device
List all SYCL devices with ID, compute capability, max work group size, etc.
1. Build llama.cpp for SYCL for all targets.
2. Enable the oneAPI runtime environment
```
source /opt/intel/oneapi/setvars.sh
```
3. Execute
```
./build/bin/ls-sycl-device
```
Check the device IDs in the startup log, for example:
```
found 4 SYCL devices:
Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
```
|Attribute|Note|
|-|-|
|compute capability 1.3|Level Zero runtime, recommended|
|compute capability 3.0|OpenCL runtime, slower than Level Zero in most cases|

20
examples/sycl/build.sh Executable file
View file

@ -0,0 +1,20 @@
# MIT license
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: MIT
mkdir -p build
cd build
source /opt/intel/oneapi/setvars.sh
#for FP16
#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference
#for FP32
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
#build example/main only
#cmake --build . --config Release --target main
#build all binary
cmake --build . --config Release -v

View file

@ -0,0 +1,11 @@
/*MIT license
Copyright (C) 2024 Intel Corporation
SPDX-License-Identifier: MIT
*/
#include "ggml-sycl.h"
int main(int argc, char ** argv) {
ggml_backend_sycl_print_sycl_devices();
return 0;
}

19
examples/sycl/run-llama2.sh Executable file
View file

@ -0,0 +1,19 @@
#!/bin/bash
# MIT license
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: MIT
INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
source /opt/intel/oneapi/setvars.sh
if [ $# -gt 0 ]; then
export GGML_SYCL_DEVICE=$1
else
export GGML_SYCL_DEVICE=0
fi
echo GGML_SYCL_DEVICE=$GGML_SYCL_DEVICE
#export GGML_SYCL_DEBUG=1
./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
#./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 5 -e -ngl 33 -t 1 -s 0

6
flake.lock generated
View file

@ -20,11 +20,11 @@
}, },
"nixpkgs": { "nixpkgs": {
"locked": { "locked": {
"lastModified": 1705133751, "lastModified": 1706191920,
"narHash": "sha256-rCIsyE80jgiOU78gCWN3A0wE0tR2GI5nH6MlS+HaaSQ=", "narHash": "sha256-eLihrZAPZX0R6RyM5fYAWeKVNuQPYjAkCUBr+JNvtdE=",
"owner": "NixOS", "owner": "NixOS",
"repo": "nixpkgs", "repo": "nixpkgs",
"rev": "9b19f5e77dd906cb52dade0b7bd280339d2a1f3d", "rev": "ae5c332cbb5827f6b1f02572496b141021de335f",
"type": "github" "type": "github"
}, },
"original": { "original": {

View file

@ -1,3 +1,17 @@
# The flake interface to llama.cpp's Nix expressions. The flake is used as a
# more discoverable entry-point, as well as a way to pin the dependencies and
# expose default outputs, including the outputs built by the CI.
# For more serious applications involving some kind of customization you may
# want to consider consuming the overlay, or instantiating `llamaPackages`
# directly:
#
# ```nix
# pkgs.callPackage ${llama-cpp-root}/.devops/nix/scope.nix { }`
# ```
# Cf. https://jade.fyi/blog/flakes-arent-real/ for a more detailed exposition
# of the relation between Nix and the Nix Flakes.
{ {
description = "Port of Facebook's LLaMA model in C/C++"; description = "Port of Facebook's LLaMA model in C/C++";

View file

@ -109,8 +109,8 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
if (block->size >= size) { if (block->size >= size) {
best_fit_block = alloc->n_free_blocks - 1; best_fit_block = alloc->n_free_blocks - 1;
} else { } else {
fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n", fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, largest block available %zu)\n",
__func__, size, max_avail); __func__, tensor->name, size, max_avail);
GGML_ASSERT(!"not enough space in the buffer"); GGML_ASSERT(!"not enough space in the buffer");
return; return;
} }
@ -335,7 +335,9 @@ bool ggml_tallocr_is_measure(ggml_tallocr_t alloc) {
} }
size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) { size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) {
return alloc->max_size; // FIXME: changes in the tensor sizes compared to the measure graph may cause allocations to fail
// to avoid this, we add a 10% margin to the buffer size
return alloc->max_size + alloc->max_size/10;
} }
// graph allocator // graph allocator
@ -776,38 +778,26 @@ size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph)
} }
// utils // utils
ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
size_t alignment = ggml_backend_buft_get_alignment(buft); static bool alloc_tensor_range(struct ggml_context * ctx,
struct ggml_tensor * first, struct ggml_tensor * last,
size_t nbytes = 0; ggml_backend_buffer_type_t buft, size_t size,
for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { ggml_backend_buffer_t ** buffers, size_t * n_buffers) {
if (t->data == NULL && t->view_src == NULL) { ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
nbytes += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
}
}
if (nbytes == 0) {
// all the tensors in the context are already allocated
#ifndef NDEBUG
fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
#endif
return NULL;
}
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes);
if (buffer == NULL) { if (buffer == NULL) {
// failed to allocate buffer
#ifndef NDEBUG #ifndef NDEBUG
fprintf(stderr, "%s: failed to allocate buffer\n", __func__); fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
#endif #endif
return NULL; for (size_t i = 0; i < *n_buffers; i++) {
ggml_backend_buffer_free((*buffers)[i]);
}
free(*buffers);
return false;
} }
ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer); ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
if (t->data == NULL) { if (t->data == NULL) {
if (t->view_src == NULL) { if (t->view_src == NULL) {
ggml_tallocr_alloc(tallocr, t); ggml_tallocr_alloc(tallocr, t);
@ -824,6 +814,76 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
ggml_tallocr_free(tallocr); ggml_tallocr_free(tallocr);
*buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
(*buffers)[(*n_buffers)++] = buffer;
return true;
}
ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
size_t alignment = ggml_backend_buft_get_alignment(buft);
size_t max_size = ggml_backend_buft_get_max_size(buft);
ggml_backend_buffer_t * buffers = NULL;
size_t n_buffers = 0;
size_t cur_buf_size = 0;
struct ggml_tensor * first = ggml_get_first_tensor(ctx);
for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
size_t this_size = 0;
if (t->data == NULL && t->view_src == NULL) {
this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
}
if (this_size > max_size) {
// tensor is too large to fit in a single buffer
fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
__func__, t->name,
ggml_backend_buft_name(buft),
this_size, max_size);
for (size_t i = 0; i < n_buffers; i++) {
ggml_backend_buffer_free(buffers[i]);
}
free(buffers);
return NULL;
}
if ((cur_buf_size + this_size) > max_size) {
// allocate tensors in the current buffer
if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
return NULL;
}
first = t;
cur_buf_size = this_size;
} else {
cur_buf_size += this_size;
}
}
// allocate remaining tensors
if (cur_buf_size > 0) {
if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
return NULL;
}
}
if (n_buffers == 0) {
// all the tensors in the context are already allocated
#ifndef NDEBUG
fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
#endif
return NULL;
}
ggml_backend_buffer_t buffer;
if (n_buffers == 1) {
buffer = buffers[0];
} else {
buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
}
free(buffers);
return buffer; return buffer;
} }
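For context, a hedged sketch (not part of the diff) of how this entry point is typically used. With the CPU buffer type the max size defaults to SIZE_MAX, so a single buffer comes back; backends that report a maximum allocation size may instead return a multi-buffer that spans several allocations. Tensor shapes here are arbitrary.

```cpp
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include <cstdio>

int main() {
    // Create a no_alloc context: only tensor metadata lives here, data is allocated later.
    struct ggml_init_params params = {
        /* .mem_size   = */ ggml_tensor_overhead() * 8,
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ true,
    };
    struct ggml_context * ctx = ggml_init(params);

    ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4096, 4096);
    ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4096);

    // Allocates one buffer, or a multi-buffer if the total exceeds the buffer type's max size.
    ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
    printf("allocated %zu bytes (max per buffer: %zu)\n",
           ggml_backend_buffer_get_size(buf), ggml_backend_buft_get_max_size(buft));

    ggml_backend_buffer_free(buf);
    ggml_free(ctx);
    return 0;
}
```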

View file

@ -19,6 +19,7 @@ extern "C" {
const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft); const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size); ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft); // allocation max size
size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
// check if tensor data is in host memory // check if tensor data is in host memory
@ -63,6 +64,11 @@ extern "C" {
// do not use directly, use ggml_backend_tensor_copy instead // do not use directly, use ggml_backend_tensor_copy instead
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst); bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
// buffer that contains a collection of buffers
GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
// //
// Backend // Backend
// //

View file

@ -27,10 +27,20 @@ size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
return buft->iface.get_alignment(buft); return buft->iface.get_alignment(buft);
} }
size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
// get_max_size is optional, defaults to SIZE_MAX
if (buft->iface.get_max_size) {
return buft->iface.get_max_size(buft);
}
return SIZE_MAX;
}
GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) { GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
// get_alloc_size is optional, defaults to ggml_nbytes // get_alloc_size is optional, defaults to ggml_nbytes
if (buft->iface.get_alloc_size) { if (buft->iface.get_alloc_size) {
return buft->iface.get_alloc_size(buft, tensor); size_t size = buft->iface.get_alloc_size(buft, tensor);
assert(size >= ggml_nbytes(tensor));
return size;
} }
return ggml_nbytes(tensor); return ggml_nbytes(tensor);
} }
@ -55,8 +65,6 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
size_t size) { size_t size) {
ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer)); ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
GGML_ASSERT(iface.get_base != NULL);
(*buffer) = (struct ggml_backend_buffer) { (*buffer) = (struct ggml_backend_buffer) {
/* .interface = */ iface, /* .interface = */ iface,
/* .buft = */ buft, /* .buft = */ buft,
@ -106,6 +114,10 @@ size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) {
return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer)); return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
} }
size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
}
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor); return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
} }
@ -120,6 +132,11 @@ bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) { void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
buffer->usage = usage; buffer->usage = usage;
// FIXME: add a generic callback to the buffer interface
if (ggml_backend_buffer_is_multi_buffer(buffer)) {
ggml_backend_multi_buffer_set_usage(buffer, usage);
}
} }
ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) { ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
@ -169,6 +186,10 @@ size_t ggml_backend_get_alignment(ggml_backend_t backend) {
return ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend)); return ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend));
} }
size_t ggml_backend_get_max_size(ggml_backend_t backend) {
return ggml_backend_buft_get_max_size(ggml_backend_get_default_buffer_type(backend));
}
void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
@ -337,11 +358,21 @@ GGML_CALL static void ggml_backend_registry_init(void) {
ggml_backend_cuda_reg_devices(); ggml_backend_cuda_reg_devices();
#endif #endif
#ifdef GGML_USE_SYCL
extern void ggml_backend_sycl_reg_devices(void);
ggml_backend_sycl_reg_devices();
#endif
#ifdef GGML_USE_METAL #ifdef GGML_USE_METAL
extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void); extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL); ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
#endif #endif
#ifdef GGML_USE_VULKAN
extern GGML_CALL int ggml_backend_vk_reg_devices(void);
ggml_backend_vk_reg_devices();
#endif
} }
GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) { GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
@ -545,6 +576,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
/* .get_name = */ ggml_backend_cpu_buffer_type_get_name, /* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer, /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend, /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host, /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
@ -600,6 +632,7 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
/* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name, /* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer, /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend, /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host, /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
@ -756,6 +789,80 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, v
GGML_UNUSED(user_data); GGML_UNUSED(user_data);
} }
// multi-buffer buffer
struct ggml_backend_multi_buffer_context {
ggml_backend_buffer_t * buffers;
size_t n_buffers;
};
typedef struct ggml_backend_multi_buffer_context * ggml_backend_multi_buffer_context_t;
GGML_CALL static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
}
GGML_CALL static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
for (size_t i = 0; i < ctx->n_buffers; i++) {
ggml_backend_buffer_free(ctx->buffers[i]);
}
free(ctx->buffers);
free(ctx);
}
GGML_CALL static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
for (size_t i = 0; i < ctx->n_buffers; i++) {
ggml_backend_buffer_clear(ctx->buffers[i], value);
}
}
static struct ggml_backend_buffer_i ggml_backend_multi_buffer_context_interface(void) {
static struct ggml_backend_buffer_i multi_backend_buffer_i = {
/* .get_name = */ ggml_backend_multi_buffer_get_name,
/* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
/* .get_base = */ NULL,
/* .init_tensor = */ NULL,
/* .set_tensor = */ NULL,
/* .get_tensor = */ NULL,
/* .cpy_tensor = */ NULL,
/* .clear = */ ggml_backend_multi_buffer_clear,
/* .reset = */ NULL,
};
return multi_backend_buffer_i;
}
GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) {
ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) malloc(sizeof(struct ggml_backend_multi_buffer_context));
ctx->n_buffers = n_buffers;
ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
size_t total_size = 0;
for (size_t i = 0; i < n_buffers; i++) {
ctx->buffers[i] = buffers[i];
total_size += ggml_backend_buffer_get_size(buffers[i]);
}
return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_context_interface(), ctx, total_size);
}
GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
return buffer->iface.get_name == ggml_backend_multi_buffer_get_name;
}
GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer));
ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
for (size_t i = 0; i < ctx->n_buffers; i++) {
ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
}
}
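A hedged usage sketch of the new multi-buffer API (buffer sizes are arbitrary, and the CPU buffer type is used only for illustration):

```cpp
ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
ggml_backend_buffer_t parts[2] = {
    ggml_backend_buft_alloc_buffer(buft, 16 * 1024 * 1024),
    ggml_backend_buft_alloc_buffer(buft, 16 * 1024 * 1024),
};
ggml_backend_buffer_t multi = ggml_backend_multi_buffer_alloc_buffer(parts, 2);

// The multi-buffer reports the combined size and forwards clear/free to its parts.
size_t total = ggml_backend_buffer_get_size(multi);   // ~32 MiB
ggml_backend_buffer_free(multi);                       // also frees both underlying buffers
```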
// scheduler // scheduler
@ -1191,6 +1298,24 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
ggml_tallocr_t src_allocr = node_allocr(src); ggml_tallocr_t src_allocr = node_allocr(src);
GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now
if (src_allocr != node_allocr) { if (src_allocr != node_allocr) {
// create a copy of the input in the split's backend
size_t id = hash_id(src);
if (sched->node_copies[id][cur_backend_id] == NULL) {
ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
sched->node_copies[id][cur_backend_id] = tensor_copy;
node_allocr(tensor_copy) = cur_allocr;
SET_CAUSE(tensor_copy, "4.cpy");
int n_inputs = sched->splits[cur_split].n_inputs++;
GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
sched->splits[cur_split].inputs[n_inputs] = src;
}
node->src[j] = sched->node_copies[id][cur_backend_id];
#if 0
// check if the input is already in the split // check if the input is already in the split
bool found = false; bool found = false;
for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) { for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) {
@ -1206,19 +1331,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS); GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
sched->splits[cur_split].inputs[n_inputs] = src; sched->splits[cur_split].inputs[n_inputs] = src;
} }
#endif
// create a copy of the input in the split's backend
size_t id = hash_id(src);
if (sched->node_copies[id][cur_backend_id] == NULL) {
ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
sched->node_copies[id][cur_backend_id] = tensor_copy;
node_allocr(tensor_copy) = cur_allocr;
SET_CAUSE(tensor_copy, "4.cpy");
}
node->src[j] = sched->node_copies[id][cur_backend_id];
} }
} }
} }

View file

@ -20,6 +20,7 @@ extern "C" {
GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft); GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size); GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend); GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft); GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
@ -36,6 +37,7 @@ extern "C" {
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer); GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer); GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value); GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer); GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
@ -54,6 +56,7 @@ extern "C" {
GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend); GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size); GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend); GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
GGML_API size_t ggml_backend_get_max_size(ggml_backend_t backend);
GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
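The new *_get_max_size queries report the maximum buffer size a buffer type supports, falling back to SIZE_MAX when the backend leaves the hook NULL (see the interface tables further below). A sketch of how an allocator might use it to decide when a set of tensors has to be split across several buffers; tensors/n_tensors are assumed inputs and the chunking policy is illustrative only:

    size_t max_size = ggml_backend_buft_get_max_size(buft); // SIZE_MAX if the backend has no limit
    size_t cur_size = 0;
    int    n_chunks = 1;

    for (int i = 0; i < n_tensors; i++) {
        const size_t ts = ggml_backend_buft_get_alloc_size(buft, tensors[i]);
        if (cur_size + ts > max_size) {
            n_chunks += 1; // this tensor starts a new buffer
            cur_size   = 0;
        }
        cur_size += ts;
    }
    // each chunk would then be allocated with ggml_backend_buft_alloc_buffer(buft, <chunk size>)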

View file

@ -13,6 +13,10 @@
#include <map> #include <map>
#include <array> #include <array>
// stringize macro for converting __CUDA_ARCH_LIST__ (list of integers) to string
#define STRINGIZE_IMPL(...) #__VA_ARGS__
#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
#if defined(GGML_USE_HIPBLAS) #if defined(GGML_USE_HIPBLAS)
#include <hip/hip_runtime.h> #include <hip/hip_runtime.h>
#include <hipblas/hipblas.h> #include <hipblas/hipblas.h>
@ -585,13 +589,28 @@ static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, 0,
static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr}; static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
[[noreturn]] [[noreturn]]
static __device__ void bad_arch() { static __device__ void no_device_code(
printf("ERROR: ggml-cuda was compiled without support for the current GPU architecture.\n"); const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
file_name, line, function_name, arch);
(void) arch_list;
#else
printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
file_name, line, function_name, arch, arch_list);
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
__trap(); __trap();
(void) bad_arch; // suppress unused function warning (void) no_device_code; // suppress unused function warning
} }
#ifdef __CUDA_ARCH__
#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
#else
#define NO_DEVICE_CODE GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
#endif // __CUDA_ARCH__
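For reference, the stringize pair is the standard two-level preprocessor idiom: the outer macro expands its argument first, the inner one then applies #. A standalone sketch in plain C; the arch list value is an assumed example of what nvcc provides via __CUDA_ARCH_LIST__:

    #include <stdio.h>

    #define STRINGIZE_IMPL(...) #__VA_ARGS__
    #define STRINGIZE(...)      STRINGIZE_IMPL(__VA_ARGS__)

    #define ARCH_LIST 610,700,860   // stand-in for __CUDA_ARCH_LIST__

    int main(void) {
        // prints "compiled for: 610,700,860" -- the extra STRINGIZE_IMPL level is needed
        // so that ARCH_LIST is macro-expanded before # turns it into a string literal
        printf("compiled for: %s\n", STRINGIZE(ARCH_LIST));
        return 0;
    }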
static __device__ __forceinline__ float warp_reduce_sum(float x) { static __device__ __forceinline__ float warp_reduce_sum(float x) {
#pragma unroll #pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) { for (int mask = 16; mask > 0; mask >>= 1) {
@ -618,7 +637,7 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
return a; return a;
#else #else
(void) a; (void) a;
bad_arch(); NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
} }
@ -647,7 +666,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
return x; return x;
#else #else
(void) x; (void) x;
bad_arch(); NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
} }
@ -2444,7 +2463,7 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h
} }
#else #else
(void) vx; (void) y; (void) k; (void) vx; (void) y; (void) k;
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_PASCAL #endif // __CUDA_ARCH__ >= CC_PASCAL
} }
@ -2475,7 +2494,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
// second part effectively subtracts 8 from each quant value // second part effectively subtracts 8 from each quant value
return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y); return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
#else #else
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
} }
@ -2512,7 +2531,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
// scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1)); return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
#else #else
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
} }
@ -2547,7 +2566,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
// second part effectively subtracts 16 from each quant value // second part effectively subtracts 16 from each quant value
return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y); return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
#else #else
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
} }
@ -2592,7 +2611,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
return sumi*d5d8 + m5s8 / (QI5_1 / vdr); return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
#else #else
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
} }
@ -2613,7 +2632,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp
return d8_0*d8_1 * sumi; return d8_0*d8_1 * sumi;
#else #else
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
} }
@ -2643,7 +2662,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
// scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
return sumi*d8d8 + m8s8 / (QI8_1 / vdr); return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
#else #else
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
} }
@ -2678,7 +2697,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
return dm2f.x*sumf_d - dm2f.y*sumf_m; return dm2f.x*sumf_d - dm2f.y*sumf_m;
#else #else
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
} }
@ -2715,7 +2734,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m); return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
#else #else
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
} }
@ -2755,7 +2774,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
return d3 * sumf; return d3 * sumf;
#else #else
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
} }
@ -2780,7 +2799,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
return d3*d8 * sumi; return d3*d8 * sumi;
#else #else
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
} }
@ -2813,7 +2832,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
return dm4f.x*sumf_d - dm4f.y*sumf_m; return dm4f.x*sumf_d - dm4f.y*sumf_m;
#else #else
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
} }
@ -2846,7 +2865,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
return dm4f.x*sumf_d - dm4f.y*sumf_m; return dm4f.x*sumf_d - dm4f.y*sumf_m;
#else #else
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
} }
@ -2886,7 +2905,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
return dm5f.x*sumf_d - dm5f.y*sumf_m; return dm5f.x*sumf_d - dm5f.y*sumf_m;
#else #else
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
} }
@ -2919,7 +2938,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
return dm4f.x*sumf_d - dm4f.y*sumf_m; return dm4f.x*sumf_d - dm4f.y*sumf_m;
#else #else
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
} }
@ -2949,7 +2968,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
return d*sumf; return d*sumf;
#else #else
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
} }
@ -2980,7 +2999,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
return d6 * sumf_d; return d6 * sumf_d;
#else #else
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
} }
@ -3846,7 +3865,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
return dall * sumf_d - dmin * sumf_m; return dall * sumf_d - dmin * sumf_m;
#else #else
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
#endif #endif
@ -4029,7 +4048,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
return d * sumf_d; return d * sumf_d;
#else #else
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
#endif #endif
@ -4287,7 +4306,7 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
q8 += 8; q8 += 8;
aux32 >>= 7; aux32 >>= 7;
} }
const float d = (float)bq2->d * (0.5f + aux32) * (float)bq8_1[ib32].ds.x * 0.25f; const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.25f;
return d * sumi; return d * sumi;
#else #else
// iqs is 0...15 // iqs is 0...15
@ -4298,7 +4317,7 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
const uint8_t * grid1 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+0]); const uint8_t * grid1 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+0]);
const uint8_t * grid2 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+1]); const uint8_t * grid2 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+1]);
const uint32_t aux32 = q2[2] | (q2[3] << 16); const uint32_t aux32 = q2[2] | (q2[3] << 16);
const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * (float)bq8_1[ib32].ds.x * 0.25f; const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * __low2float(bq8_1[ib32].ds) * 0.25f;
const uint8_t signs1 = ksigns_iq2xs[(aux32 >> 14*il) & 127]; const uint8_t signs1 = ksigns_iq2xs[(aux32 >> 14*il) & 127];
const uint8_t signs2 = ksigns_iq2xs[(aux32 >> (14*il + 7)) & 127]; const uint8_t signs2 = ksigns_iq2xs[(aux32 >> (14*il + 7)) & 127];
const int8_t * q8 = bq8_1[ib32].qs + 16*il; const int8_t * q8 = bq8_1[ib32].qs + 16*il;
@ -4343,7 +4362,7 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
} }
q8 += 8; q8 += 8;
} }
const float d = (float)bq2->d * (float)bq8_1[ib32].ds.x * 0.25f; const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f;
return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2); return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
#else #else
assert(false); assert(false);
@ -4524,7 +4543,7 @@ template <bool need_check> static __global__ void
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else #else
(void) vec_dot_q4_0_q8_1_mul_mat; (void) vec_dot_q4_0_q8_1_mul_mat;
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA #endif // __CUDA_ARCH__ >= CC_VOLTA
} }
@ -4593,7 +4612,7 @@ template <bool need_check> static __global__ void
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else #else
(void) vec_dot_q4_1_q8_1_mul_mat; (void) vec_dot_q4_1_q8_1_mul_mat;
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA #endif // __CUDA_ARCH__ >= CC_VOLTA
} }
@ -4660,7 +4679,7 @@ template <bool need_check> static __global__ void
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else #else
(void) vec_dot_q5_0_q8_1_mul_mat; (void) vec_dot_q5_0_q8_1_mul_mat;
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA #endif // __CUDA_ARCH__ >= CC_VOLTA
} }
@ -4727,7 +4746,7 @@ mul_mat_q5_1(
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else #else
(void) vec_dot_q5_1_q8_1_mul_mat; (void) vec_dot_q5_1_q8_1_mul_mat;
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA #endif // __CUDA_ARCH__ >= CC_VOLTA
} }
@ -4794,7 +4813,7 @@ template <bool need_check> static __global__ void
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else #else
(void) vec_dot_q8_0_q8_1_mul_mat; (void) vec_dot_q8_0_q8_1_mul_mat;
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA #endif // __CUDA_ARCH__ >= CC_VOLTA
} }
@ -4861,7 +4880,7 @@ mul_mat_q2_K(
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else #else
(void) vec_dot_q2_K_q8_1_mul_mat; (void) vec_dot_q2_K_q8_1_mul_mat;
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA #endif // __CUDA_ARCH__ >= CC_VOLTA
} }
@ -4930,7 +4949,7 @@ template <bool need_check> static __global__ void
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else #else
(void) vec_dot_q3_K_q8_1_mul_mat; (void) vec_dot_q3_K_q8_1_mul_mat;
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA #endif // __CUDA_ARCH__ >= CC_VOLTA
} }
@ -4999,7 +5018,7 @@ template <bool need_check> static __global__ void
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else #else
(void) vec_dot_q4_K_q8_1_mul_mat; (void) vec_dot_q4_K_q8_1_mul_mat;
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA #endif // __CUDA_ARCH__ >= CC_VOLTA
} }
@ -5066,7 +5085,7 @@ mul_mat_q5_K(
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else #else
(void) vec_dot_q5_K_q8_1_mul_mat; (void) vec_dot_q5_K_q8_1_mul_mat;
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA #endif // __CUDA_ARCH__ >= CC_VOLTA
} }
@ -5135,7 +5154,7 @@ template <bool need_check> static __global__ void
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else #else
(void) vec_dot_q6_K_q8_1_mul_mat; (void) vec_dot_q6_K_q8_1_mul_mat;
bad_arch(); NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA #endif // __CUDA_ARCH__ >= CC_VOLTA
} }
@ -5858,7 +5877,7 @@ static __global__ void soft_max_f16(const float * x, const float * y, float * ds
} }
#else #else
(void) x; (void) y; (void) dst; (void) ncols_par; (void) nrows_y; (void) scale; (void) x; (void) y; (void) dst; (void) ncols_par; (void) nrows_y; (void) scale;
bad_arch(); NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
} }
@ -10225,8 +10244,8 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
// TODO: mmq/mmv support // TODO: mmq/mmv support
#endif #endif
const int64_t nb11 = src1->nb[1]; const size_t nb11 = src1->nb[1];
const int64_t nb1 = dst->nb[1]; const size_t nb1 = dst->nb[1];
const struct ggml_tensor * ids = src0; const struct ggml_tensor * ids = src0;
const int32_t id = ((int32_t *) dst->op_params)[0]; const int32_t id = ((int32_t *) dst->op_params)[0];
@ -10887,15 +10906,11 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t
if (ggml_is_quantized(tensor->type)) { if (ggml_is_quantized(tensor->type)) {
// initialize padding to 0 to avoid possible NaN values // initialize padding to 0 to avoid possible NaN values
int64_t row_low = 0; size_t original_size = ggml_nbytes(tensor);
int64_t row_high = ggml_nrows(tensor);
int64_t nrows_split = row_high - row_low;
size_t original_size = ggml_nbytes_split(tensor, nrows_split);
size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor); size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
if (padded_size > original_size && tensor->view_src == nullptr) { if (padded_size > original_size && tensor->view_src == nullptr) {
CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[ctx->device][0])); CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
} }
} }
} }
@ -10998,12 +11013,7 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend
} }
GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
int64_t row_low = 0; size_t size = ggml_nbytes(tensor);
int64_t row_high = ggml_nrows(tensor);
int64_t nrows_split = row_high - row_low;
size_t size = ggml_nbytes_split(tensor, nrows_split);
int64_t ne0 = tensor->ne[0]; int64_t ne0 = tensor->ne[0];
if (ggml_is_quantized(tensor->type)) { if (ggml_is_quantized(tensor->type)) {
@ -11032,6 +11042,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
/* .get_name = */ ggml_backend_cuda_buffer_type_name, /* .get_name = */ ggml_backend_cuda_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer, /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment, /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size, /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
/* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend, /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
/* .is_host = */ NULL, /* .is_host = */ NULL,
@ -11307,6 +11318,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface
/* .get_name = */ ggml_backend_cuda_split_buffer_type_name, /* .get_name = */ ggml_backend_cuda_split_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_cuda_split_buffer_type_alloc_buffer, /* .alloc_buffer = */ ggml_backend_cuda_split_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment, /* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size, /* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
/* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend, /* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host, /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
@ -11386,6 +11398,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
/* .get_name = */ ggml_backend_cuda_host_buffer_type_name, /* .get_name = */ ggml_backend_cuda_host_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer, /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
/* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend, /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,

View file

@ -24,19 +24,7 @@
#define UNUSED(x) (void)(x) #define UNUSED(x) (void)(x)
#define GGML_METAL_MAX_KERNELS 256
struct ggml_metal_buffer {
const char * name;
void * data;
size_t size;
id<MTLBuffer> metal;
};
struct ggml_metal_kernel { struct ggml_metal_kernel {
id<MTLFunction> function;
id<MTLComputePipelineState> pipeline; id<MTLComputePipelineState> pipeline;
}; };
@ -149,7 +137,10 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256,
GGML_METAL_KERNEL_TYPE_CPY_F32_F16, GGML_METAL_KERNEL_TYPE_CPY_F32_F16,
GGML_METAL_KERNEL_TYPE_CPY_F32_F32, GGML_METAL_KERNEL_TYPE_CPY_F32_F32,
GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0, GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0,
@ -171,14 +162,10 @@ struct ggml_metal_context {
id<MTLDevice> device; id<MTLDevice> device;
id<MTLCommandQueue> queue; id<MTLCommandQueue> queue;
id<MTLLibrary> library;
dispatch_queue_t d_queue; dispatch_queue_t d_queue;
int n_buffers; struct ggml_metal_kernel kernels[GGML_METAL_KERNEL_TYPE_COUNT];
struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
struct ggml_metal_kernel kernels[GGML_METAL_MAX_KERNELS];
bool support_simdgroup_reduction; bool support_simdgroup_reduction;
bool support_simdgroup_mm; bool support_simdgroup_mm;
@ -245,26 +232,24 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
// Show all the Metal device instances in the system // Show all the Metal device instances in the system
NSArray * devices = MTLCopyAllDevices(); NSArray * devices = MTLCopyAllDevices();
for (id<MTLDevice> device in devices) { for (id<MTLDevice> device in devices) {
NSString * s = [device name]; GGML_METAL_LOG_INFO("%s: found device: %s\n", __func__, [[device name] UTF8String]);
GGML_METAL_LOG_INFO("%s: found device: %s\n", __func__, [s UTF8String]);
} }
[devices release]; // since it was created by a *Copy* C method [devices release]; // since it was created by a *Copy* C method
#endif #endif
// Pick and show default Metal device // Pick and show default Metal device
id<MTLDevice> device = MTLCreateSystemDefaultDevice(); id<MTLDevice> device = MTLCreateSystemDefaultDevice();
NSString * s = [device name]; GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [s UTF8String]);
// Configure context // Configure context
struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
ctx->device = device; ctx->device = device;
ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS); ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
ctx->queue = [ctx->device newCommandQueue]; ctx->queue = [ctx->device newCommandQueue];
ctx->n_buffers = 0;
ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT); ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
id<MTLLibrary> metal_library;
// load library // load library
{ {
NSBundle * bundle = nil; NSBundle * bundle = nil;
@ -279,7 +264,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
// pre-compiled library found // pre-compiled library found
NSURL * libURL = [NSURL fileURLWithPath:libPath]; NSURL * libURL = [NSURL fileURLWithPath:libPath];
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]); GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
ctx->library = [ctx->device newLibraryWithURL:libURL error:&error]; metal_library = [ctx->device newLibraryWithURL:libURL error:&error];
if (error) { if (error) {
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL; return NULL;
@ -321,7 +306,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
//[options setFastMathEnabled:false]; //[options setFastMathEnabled:false];
ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error]; metal_library = [ctx->device newLibraryWithSource:src options:options error:&error];
if (error) { if (error) {
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL; return NULL;
@ -386,8 +371,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
{ {
NSError * error = nil; NSError * error = nil;
for (int i = 0; i < GGML_METAL_MAX_KERNELS; ++i) { for (int i = 0; i < GGML_METAL_KERNEL_TYPE_COUNT; ++i) {
ctx->kernels[i].function = nil;
ctx->kernels[i].pipeline = nil; ctx->kernels[i].pipeline = nil;
} }
@ -399,13 +383,15 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
#define GGML_METAL_ADD_KERNEL(e, name, supported) \ #define GGML_METAL_ADD_KERNEL(e, name, supported) \
if (supported) { \ if (supported) { \
struct ggml_metal_kernel * kernel = &ctx->kernels[e]; \ struct ggml_metal_kernel * kernel = &ctx->kernels[e]; \
kernel->function = [ctx->library newFunctionWithName:@"kernel_"#name]; \ id<MTLFunction> metal_function = [metal_library newFunctionWithName:@"kernel_"#name]; \
kernel->pipeline = [ctx->device newComputePipelineStateWithFunction:kernel->function error:&error]; \ kernel->pipeline = [ctx->device newComputePipelineStateWithFunction:metal_function error:&error]; \
[metal_function release]; \
GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \ GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \
(int) kernel->pipeline.maxTotalThreadsPerThreadgroup, \ (int) kernel->pipeline.maxTotalThreadsPerThreadgroup, \
(int) kernel->pipeline.threadExecutionWidth); \ (int) kernel->pipeline.threadExecutionWidth); \
if (error) { \ if (error) { \
GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
[metal_library release]; \
return NULL; \ return NULL; \
} \ } \
} else { \ } else { \
@ -522,7 +508,10 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, leaky_relu_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, leaky_relu_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64, flash_attn_ext_f16_h64, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64, flash_attn_ext_f16_h64, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80, flash_attn_ext_f16_h80, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80, flash_attn_ext_f16_h80, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96, flash_attn_ext_f16_h96, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112, flash_attn_ext_f16_h112, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, flash_attn_ext_f16_h128, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, flash_attn_ext_f16_h128, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, flash_attn_ext_f16_h256, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0, cpy_f32_q8_0, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0, cpy_f32_q8_0, true);
@ -537,27 +526,17 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true);
} }
[metal_library release];
return ctx; return ctx;
} }
static void ggml_metal_free(struct ggml_metal_context * ctx) { static void ggml_metal_free(struct ggml_metal_context * ctx) {
GGML_METAL_LOG_INFO("%s: deallocating\n", __func__); GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);
for (int i = 0; i < ctx->n_buffers; ++i) { for (int i = 0; i < GGML_METAL_KERNEL_TYPE_COUNT; ++i) {
[ctx->buffers[i].metal release];
}
for (int i = 0; i < GGML_METAL_MAX_KERNELS; ++i) {
if (ctx->kernels[i].pipeline) {
[ctx->kernels[i].pipeline release]; [ctx->kernels[i].pipeline release];
} }
if (ctx->kernels[i].function) {
[ctx->kernels[i].function release];
}
}
[ctx->library release];
[ctx->queue release]; [ctx->queue release];
[ctx->device release]; [ctx->device release];
@ -589,15 +568,13 @@ struct ggml_backend_metal_buffer_context {
// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
// Metal buffer based on the host memory pointer // Metal buffer based on the host memory pointer
// //
static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) { static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs) {
//GGML_METAL_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); //GGML_METAL_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
const int64_t tsize = ggml_nbytes(t); const int64_t tsize = ggml_nbytes(t);
ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer; ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer;
// compatibility with ggml-backend
if (buffer && buffer->buft == ggml_backend_metal_buffer_type()) {
struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) buffer->context; struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) buffer->context;
// find the view that contains the tensor fully // find the view that contains the tensor fully
@ -619,25 +596,6 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru
return nil; return nil;
} }
// find the view that contains the tensor fully
for (int i = 0; i < ctx->n_buffers; ++i) {
const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;
//GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, ctx->buffers[%d].size = %10ld, name = %s\n", ioffs, tsize, ioffs + tsize, i, ctx->buffers[i].size, ctx->buffers[i].name);
if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
*offs = (size_t) ioffs;
//GGML_METAL_LOG_INFO("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
return ctx->buffers[i].metal;
}
}
GGML_METAL_LOG_ERROR("%s: error: buffer is nil\n", __func__);
return nil;
}
static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const struct ggml_tensor * op) { static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const struct ggml_tensor * op) {
switch (op->op) { switch (op->op) {
case GGML_OP_UNARY: case GGML_OP_UNARY:
@ -681,7 +639,8 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
return true; return true;
case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT:
case GGML_OP_MUL_MAT_ID: case GGML_OP_MUL_MAT_ID:
return ctx->support_simdgroup_reduction; return ctx->support_simdgroup_reduction &&
(op->src[0]->type != GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_F32);
case GGML_OP_CPY: case GGML_OP_CPY:
case GGML_OP_DUP: case GGML_OP_DUP:
case GGML_OP_CONT: case GGML_OP_CONT:
@ -826,9 +785,9 @@ static bool ggml_metal_graph_compute(
const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT; const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT;
id<MTLBuffer> id_src0 = src0 ? ggml_metal_get_buffer(ctx, src0, &offs_src0) : nil; id<MTLBuffer> id_src0 = src0 ? ggml_metal_get_buffer(src0, &offs_src0) : nil;
id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil; id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(src1, &offs_src1) : nil;
id<MTLBuffer> id_dst = dst ? ggml_metal_get_buffer(ctx, dst, &offs_dst) : nil; id<MTLBuffer> id_dst = dst ? ggml_metal_get_buffer(dst, &offs_dst) : nil;
//GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op)); //GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op));
//if (src0) { //if (src0) {
@ -1610,7 +1569,7 @@ static bool ggml_metal_graph_compute(
struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)]; struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
size_t offs_src_cur = 0; size_t offs_src_cur = 0;
id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur); id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(src_cur, &offs_src_cur);
[encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:19 + j]; [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:19 + j];
} }
@ -1755,7 +1714,7 @@ static bool ggml_metal_graph_compute(
struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)]; struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
size_t offs_src_cur = 0; size_t offs_src_cur = 0;
id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur); id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(src_cur, &offs_src_cur);
[encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:23 + j]; [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:23 + j];
} }
@ -2189,9 +2148,9 @@ static bool ggml_metal_graph_compute(
size_t offs_src3 = 0; size_t offs_src3 = 0;
GGML_ASSERT(src2); GGML_ASSERT(src2);
id<MTLBuffer> id_src2 = ggml_metal_get_buffer(ctx, src2, &offs_src2); id<MTLBuffer> id_src2 = ggml_metal_get_buffer(src2, &offs_src2);
id<MTLBuffer> id_src3 = src3 ? ggml_metal_get_buffer(ctx, src3, &offs_src3) : nil; id<MTLBuffer> id_src3 = src3 ? ggml_metal_get_buffer(src3, &offs_src3) : nil;
const int64_t ne30 = src3 ? src3->ne[0] : 0; GGML_UNUSED(ne30); const int64_t ne30 = src3 ? src3->ne[0] : 0; GGML_UNUSED(ne30);
const int64_t ne31 = src3 ? src3->ne[1] : 0; const int64_t ne31 = src3 ? src3->ne[1] : 0;
@ -2213,7 +2172,10 @@ static bool ggml_metal_graph_compute(
switch (ne00) { switch (ne00) {
case 64: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64 ].pipeline; break; case 64: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64 ].pipeline; break;
case 80: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80 ].pipeline; break; case 80: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80 ].pipeline; break;
case 96: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96 ].pipeline; break;
case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112].pipeline; break;
case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128].pipeline; break; case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128].pipeline; break;
case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256].pipeline; break;
default: default:
{ {
GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00); GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00);
@ -2253,14 +2215,17 @@ static bool ggml_metal_graph_compute(
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:26]; [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:26];
[encoder setBytes:&scale length:sizeof( float) atIndex:27]; [encoder setBytes:&scale length:sizeof( float) atIndex:27];
// for small batches use more simdgroups (needs more tests, to confirm if it's worth it)
const int64_t nsg = ne01 < 4 ? 12 : 4; // simdgroups per threadgroup (a.k.a. warps)
const int64_t nqptg = 8; // queries per threadgroup !! sync with kernel template arguments !! const int64_t nqptg = 8; // queries per threadgroup !! sync with kernel template arguments !!
const int64_t ncpsg = 32; // cache values per simdgroup (does not work for other values) const int64_t ncpsg = 32; // cache values per simdgroup !! sync with kernel template arguments !!
//const size_t smem = nqptg*(nhptg*ne00 + nsg*(nhptg*ne00 + 256))*(sizeof(float)/2); GGML_ASSERT(nqptg % 8 == 0);
const size_t smem = nqptg*(ne00 + nsg*(ne00 + 1*ncpsg))*(sizeof(float)/2); GGML_ASSERT(ncpsg % 32 == 0);
// simdgroups per threadgroup (a.k.a. warps)
// for small batches use more simdgroups (needs more tests, to confirm if it's worth it)
const int64_t nsg = ne01 <= nqptg ? MAX(4, MIN(ne11/ncpsg, (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32)) : 4;
const size_t smem = nqptg*(ne00 + nsg*(ncpsg + nqptg))*(sizeof(float)/2);
//printf("smem: %zu, max: %zu\n", smem, ctx->device.maxThreadgroupMemoryLength); //printf("smem: %zu, max: %zu\n", smem, ctx->device.maxThreadgroupMemoryLength);
GGML_ASSERT(smem <= ctx->device.maxThreadgroupMemoryLength); GGML_ASSERT(smem <= ctx->device.maxThreadgroupMemoryLength);
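To make the sizing concrete, a worked example with assumed shapes (head size ne00 = 128, nqptg = 8, ncpsg = 32, and sizeof(float)/2 = 2 bytes per half):

    nsg = 4 (regular batch):        smem = 8*(128 + 4*(32 + 8))*2 = 4608 bytes
    nsg = 8 (small batch, long KV): smem = 8*(128 + 8*(32 + 8))*2 = 7168 bytes

Both fit comfortably within the threadgroup memory limit checked by the assert above (32 KiB on recent Apple GPUs).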
@ -2465,10 +2430,13 @@ GGML_CALL static const char * ggml_backend_metal_buffer_type_get_name(ggml_backe
UNUSED(buft); UNUSED(buft);
} }
static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device) { static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t size_aligned) {
#ifndef GGML_METAL_NDEBUG
#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15) #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
if (@available(macOS 10.12, iOS 16.0, *)) { if (@available(macOS 10.12, iOS 16.0, *)) {
GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)", GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)",
__func__,
size_aligned / 1024.0 / 1024.0,
device.currentAllocatedSize / 1024.0 / 1024.0, device.currentAllocatedSize / 1024.0 / 1024.0,
device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
@ -2478,10 +2446,15 @@ static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device) {
GGML_METAL_LOG_INFO("\n"); GGML_METAL_LOG_INFO("\n");
} }
} else { } else {
GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0); GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f)\n",
__func__,
size_aligned / 1024.0 / 1024.0,
device.currentAllocatedSize / 1024.0 / 1024.0);
} }
#endif
#endif #endif
UNUSED(device); UNUSED(device);
UNUSED(size_aligned);
} }
GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
@ -2515,8 +2488,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buff
return NULL; return NULL;
} }
GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0); ggml_backend_metal_log_allocated_size(device, size_aligned);
ggml_backend_metal_log_allocated_size(device);
return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size); return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size);
} }
@ -2544,6 +2516,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
/* .get_name = */ ggml_backend_metal_buffer_type_get_name, /* .get_name = */ ggml_backend_metal_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_metal_buffer_type_alloc_buffer, /* .alloc_buffer = */ ggml_backend_metal_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment, /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // TODO: return device.maxBufferLength
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend, /* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
/* .is_host = */ ggml_backend_metal_buffer_type_is_host, /* .is_host = */ ggml_backend_metal_buffer_type_is_host,
@ -2592,7 +2565,7 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data,
return false; return false;
} }
GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0); ggml_backend_metal_log_allocated_size(device, size_aligned);
++ctx->n_buffers; ++ctx->n_buffers;
} else { } else {
@ -2615,7 +2588,8 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data,
return false; return false;
} }
GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, offs = %12ld", __func__, size_step_aligned / 1024.0 / 1024.0, i); ggml_backend_metal_log_allocated_size(device, size_step_aligned);
if (i + size_step < size) { if (i + size_step < size) {
GGML_METAL_LOG_INFO("\n"); GGML_METAL_LOG_INFO("\n");
} }
@ -2624,8 +2598,6 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data,
} }
} }
ggml_backend_metal_log_allocated_size(device);
return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size); return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size);
} }

View file

@ -1995,6 +1995,7 @@ typedef void (flash_attn_ext_f16_t)(
uint tiisg[[thread_index_in_simdgroup]], uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]); uint sgitg[[simdgroup_index_in_threadgroup]]);
// ref: https://arxiv.org/pdf/2307.08691.pdf
template<int64_t D, int64_t Q, int64_t C> // head size, queries per threadgroup, cache items per threadgroup template<int64_t D, int64_t Q, int64_t C> // head size, queries per threadgroup, cache items per threadgroup
kernel void kernel_flash_attn_ext_f16( kernel void kernel_flash_attn_ext_f16(
device const char * q, device const char * q,
@ -2038,39 +2039,45 @@ kernel void kernel_flash_attn_ext_f16(
const int64_t iq1 = tgpig[0]*Q; const int64_t iq1 = tgpig[0]*Q;
const int64_t D4 = D/4; const int64_t D4 = D/4;
const int64_t N4 = N_SIMDWIDTH;
const int64_t L4 = (D4 + N4 - 1)/N4;
const int64_t D8 = D/8; const int64_t D8 = D/8;
const int64_t Q8 = Q/8;
const int64_t NW = N_SIMDWIDTH;
const int64_t SH = (C + Q); // shared memory per simdgroup in (half)
const int64_t T = D + nsg*(D + 1*C); // shared memory size per query in half const int64_t T = D + nsg*SH; // shared memory size per query in (half)
const int64_t T4 = T/4; // shared memory size per query in half4 const int64_t T4 = T/4; // shared memory size per query in (half4)
threadgroup half * pq = (threadgroup half *) (shared + 0*D); threadgroup half * sq = (threadgroup half *) (shared + 0*D); // holds the query data
threadgroup half4 * pq4 = (threadgroup half4 *) (shared + 0*D); threadgroup half4 * sq4 = (threadgroup half4 *) (shared + 0*D); // same as above but in half4
threadgroup half * ps = (threadgroup half *) (shared + sgitg*(D + 1*C) + 1*D); threadgroup half * ss = (threadgroup half *) (shared + sgitg*SH + 1*D); // scratch buffer for attention and diagonal matrix
threadgroup half4 * ps4 = (threadgroup half4 *) (shared + sgitg*(D + 1*C) + 1*D);
threadgroup half * ss = (threadgroup half *) (shared + sgitg*(D + 1*C) + 2*D); // store the result for all queries in local memory in 8x8 matrices (the O matrix from the paper)
simdgroup_half8x8 lo[Q8][D8];
for (int64_t i = 0; i < L4; ++i) {
// load heads from Q to shared memory // load heads from Q to shared memory
for (int64_t j = sgitg; j < Q; j += nsg) { for (int64_t j = sgitg; j < Q; j += nsg) {
device const float4 * q4 = (device const float4 *) ((device const char *) q + ((iq1 + j)*nb01 + iq2*nb02 + iq3*nb03)); device const float4 * q4 = (device const float4 *) ((device const char *) q + ((iq1 + j)*nb01 + iq2*nb02 + iq3*nb03));
for (int64_t i = tiisg; i < D4; i += NW) {
if (iq1 + j < ne01) { if (iq1 + j < ne01) {
pq4[j*T4 + N4*i + tiisg] = (half4) q4[N4*i + tiisg]; sq4[j*T4 + i] = (half4) q4[i];
} else { } else {
pq4[j*T4 + N4*i + tiisg] = 0.0h; sq4[j*T4 + i] = 0.0h;
}
} }
} }
// zero out shared memory // zero out lo
for (int64_t j = 0; j < Q; ++j) { for (int64_t j = 0; j < Q8; ++j) {
ps4[j*T4 + N4*i + tiisg] = 0.0h; for (int64_t i = 0; i < D8; ++i) {
lo[j][i] = make_filled_simdgroup_matrix<half, 8>(0.0h);
} }
} }
if (tiisg < C) { // zero out shared memory SH
for (int64_t j = 0; j < Q; ++j) { for (int64_t j = 0; j < Q; ++j) {
ss[j*T + 0 + tiisg] = 0.0h; for (int64_t i = tiisg; i < SH; i += NW) {
ss[j*T + i] = 0.0h;
} }
} }
@ -2103,79 +2110,68 @@ kernel void kernel_flash_attn_ext_f16(
const int64_t iv2 = iq2 / rv2; const int64_t iv2 = iq2 / rv2;
const int64_t iv3 = iq3 / rv3; const int64_t iv3 = iq3 / rv3;
simdgroup_half8x8 mq[D8]; // load the queries from shared memory into local memory
simdgroup_half8x8 mq[Q8][D8];
for (int64_t j = 0; j < Q8; ++j) {
for (int64_t i = 0; i < D8; ++i) { for (int64_t i = 0; i < D8; ++i) {
simdgroup_load(mq[i], pq + i*8, T); simdgroup_load(mq[j][i], sq + 8*j*T + i*8, T);
}
} }
// TODO: this can be improved
device const float * mp[Q];
{
const int64_t ir = iq3*ne02*ne01 + iq2*ne01 + iq1; const int64_t ir = iq3*ne02*ne01 + iq2*ne01 + iq1;
for (int64_t j = 0; j < Q; ++j) { // pointer to the mask
if (iq1 + j < ne01) { device const float * mp = (device const float *) (mask + (ir%ne31)*nb31);
mp[j] = (device const float *) (mask + ((ir + j)%ne31)*nb31);
} else {
mp[j] = nullptr;
}
}
}
// prepare diagonal scale matrix // prepare diagonal scale matrix
simdgroup_half8x8 mscale(scale); simdgroup_float8x8 mscale(scale);
for (int64_t iic = C*sgitg; iic < ne11; iic += C*nsg) {
// skip -INF blocks
// TODO: double-check this
{
float smc = -INFINITY;
for (int64_t j = 0; j < Q; ++j) {
const float mc = mp[j] ? mp[j][iic + tiisg] : -INFINITY;
smc = simd_max(max(smc, mc));
}
if (smc == -INFINITY) {
continue;
}
}
// loop over the KV cache
// each simdgroup handles blocks of Q rows and C columns
for (int64_t ic = C*sgitg; ic < ne11; ic += C*nsg) {
// Q*K^T // Q*K^T
{ {
simdgroup_half8x8 mk;
for (int cc = 0; cc < C/8; ++cc) { for (int cc = 0; cc < C/8; ++cc) {
simdgroup_half8x8 mqk = make_filled_simdgroup_matrix<half, Q>(0.h); simdgroup_half8x8 mqk[Q8];
for (int64_t j = 0; j < Q8; ++j) {
mqk[j] = make_filled_simdgroup_matrix<half, 8>(0.h);
}
device const half * pk = (device const half *) ((device const char *) k + ((iic + 8*cc)*nb11 + ik2*nb12 + ik3*nb13)); device const half * pk = (device const half *) ((device const char *) k + ((ic + 8*cc)*nb11 + ik2*nb12 + ik3*nb13));
for (int64_t i = 0; i < D8; ++i) { for (int64_t i = 0; i < D8; ++i) {
simdgroup_load(mk, pk + i*8, nb11/sizeof(half), 0, true); simdgroup_half8x8 mk;
simdgroup_load(mk, pk + i*8, nb11/sizeof(half), 0, true); // transpose
simdgroup_multiply_accumulate(mqk, mq[i], mk, mqk); for (int64_t j = 0; j < Q8; ++j) {
simdgroup_multiply_accumulate(mqk[j], mq[j][i], mk, mqk[j]);
}
} }
// mqk = mqk*scale + mask // mqk = mqk*scale + mask
for (int64_t j = 0; j < Q8; ++j) {
simdgroup_float8x8 mm; simdgroup_float8x8 mm;
simdgroup_load(mm, mp[0] + iic + 8*cc, nb31/sizeof(float), 0, false); simdgroup_load(mm, mp + 8*j*(nb31/sizeof(float)) + ic + 8*cc, nb31/sizeof(float), 0, false);
simdgroup_multiply_accumulate(mqk, mqk, mscale, mm); simdgroup_multiply_accumulate(mqk[j], mqk[j], mscale, mm);
simdgroup_store(mqk, ss + 8*cc, T, 0, false); simdgroup_store(mqk[j], ss + 8*j*T + 8*cc, T, 0, false);
} }
} }
}
// used to detect blocks full of -INF
half smax = -INFINITY;
// online softmax // online softmax
if (C == 32) {
for (int64_t j = 0; j < Q; ++j) { for (int64_t j = 0; j < Q; ++j) {
const int64_t p = tiisg; const int64_t p = tiisg;
//const half s = ss[j*T + p]*scale + (mp[j][iic + p]); const half m = M[j];
const half s = ss[j*T + p]; const half s = ss[j*T + p];
half m = M[j]; smax = simd_max(max(smax, s));
M[j] = simd_max(max(M[j], s)); M[j] = simd_max(max(M[j], s));
const half ms = m == -INFINITY ? 0.0h : exp(m - M[j]); const half ms = m == -INFINITY ? 0.0h : exp(m - M[j]);
@ -2183,42 +2179,83 @@ kernel void kernel_flash_attn_ext_f16(
S[j] = S[j]*ms + simd_sum(vs); S[j] = S[j]*ms + simd_sum(vs);
for (int64_t i = 0; i < L4; ++i) { // create a QxQ diagonal matrix for rescaling the output
ps4[j*T4 + N4*i + tiisg] *= ms; if (p == j) {
ss[j*T + C + j] = ms;
} }
// the P matrix from the paper (Q rows, C columns)
ss[j*T + p] = vs; ss[j*T + p] = vs;
} }
} else {
for (int64_t j = 0; j < Q; ++j) {
const half m = M[j];
                for (int64_t p = tiisg; p < C; p += NW) {
                    const half s = ss[j*T + p];

                    smax = simd_max(max(smax, s));
                    M[j] = simd_max(max(M[j], s));
                }

                const half ms = m == -INFINITY ? 0.0h : exp(m - M[j]);

                S[j] = S[j]*ms;

                // create a QxQ diagonal matrix for rescaling the output
                if (tiisg == j) {
                    ss[j*T + C + j] = ms;
                }

                for (int64_t p = tiisg; p < C; p += NW) {
                    const half s = ss[j*T + p];

                    const half vs = s == -INFINITY ? 0.0h : exp(s - M[j]);

                    S[j] = S[j] + simd_sum(vs);

                    // the P matrix from the paper (Q rows, C columns)
                    ss[j*T + p] = vs;
                }
            }

            // skip -INF blocks
            if (smax == -INFINITY) {
                continue;
            }

            // O = diag(ms)*O
            for (int64_t j = 0; j < Q8; ++j) {
                simdgroup_half8x8 mm;
                simdgroup_load(mm, ss + 8*j*T + C + 8*j, T, 0, false);

                for (int64_t i = 0; i < D8; ++i) {
                    simdgroup_multiply(lo[j][i], mm, lo[j][i]);
                }
            }

            // O = O + (Q*K^T)*V
            {
                for (int cc = 0; cc < C/8; ++cc) {
                    device const half * pv = (device const half *) ((device const char *) v + ((ic + 8*cc)*nb21 + iv2*nb22 + iv3*nb23));

                    for (int64_t i = 0; i < D8; ++i) {
                        simdgroup_half8x8 mk;
                        simdgroup_load(mk, pv + i*8, nb21/sizeof(half), 0, false);

                        for (int64_t j = 0; j < Q8; ++j) {
                            simdgroup_half8x8 mv;
                            simdgroup_load(mv, ss + 8*j*T + 8*cc, T, 0, false);

                            simdgroup_multiply_accumulate(lo[j][i], mv, mk, lo[j][i]);
                        }
                    }
                }
            }
        }

        // these are needed for reducing the results from the simdgroups (reuse the ss buffer)
        for (int64_t j = 0; j < Q; ++j) {
            if (tiisg == 0) {
                ss[j*T + 0] = S[j];
@ -2227,91 +2264,86 @@ kernel void kernel_flash_attn_ext_f16(
            }
        }

        // reduce the warps sequentially
        for (int64_t sg = 1; sg < nsg; ++sg) {
            half S = { 0.0h };
            half M = { -INFINITY };

            threadgroup_barrier(mem_flags::mem_threadgroup);

            // each simdgroup stores its output to shared memory, reusing sq
            if (sgitg == sg) {
                for (int64_t j = 0; j < Q8; ++j) {
                    for (int64_t i = 0; i < D8; ++i) {
                        simdgroup_store(lo[j][i], sq + 8*j*T + i*8, T, 0, false);
                    }
                }
            }

            threadgroup_barrier(mem_flags::mem_threadgroup);

            // the first simdgroup accumulates the results from the other simdgroups
            if (sgitg == 0) {
                for (int64_t j = 0; j < Q; ++j) {
                    const half S0 = ss[j*T +         0];
                    const half S1 = ss[j*T + sg*SH + 0];

                    const half M0 = ss[j*T +         1];
                    const half M1 = ss[j*T + sg*SH + 1];

                    M = max(M0, M1);

                    const half ms0 = M0 == -INFINITY ? 0.0h : exp(M0 - M);
                    const half ms1 = M1 == -INFINITY ? 0.0h : exp(M1 - M);

                    S = S0*ms0 + S1*ms1;

                    if (tiisg == 0) {
                        ss[j*T + 0] = S;
                        ss[j*T + 1] = M;

                        ss[j*T + C + j        ] = ms0;
                        ss[j*T + C + j + sg*SH] = ms1;
                    }
                }

                // O_0 = diag(ms0)*O_0 + diag(ms1)*O_1
                for (int64_t j = 0; j < Q8; ++j) {
                    simdgroup_half8x8 t;
                    simdgroup_half8x8 ms0;
                    simdgroup_half8x8 ms1;

                    simdgroup_load(ms0, ss + 8*j*T + C + 8*j,         T, 0, false);
                    simdgroup_load(ms1, ss + 8*j*T + C + 8*j + sg*SH, T, 0, false);

                    for (int64_t i = 0; i < D8; ++i) {
                        simdgroup_load    (t, sq + 8*j*T + i*8, T, 0, false);
                        simdgroup_multiply(t, ms1, t);

                        simdgroup_multiply_accumulate(lo[j][i], ms0, lo[j][i], t);
                    }
                }
            }
        }

        // store result to shared memory (reuse sq)
        if (sgitg == 0) {
            for (int64_t j = 0; j < Q8; ++j) {
                for (int64_t i = 0; i < D8; ++i) {
                    simdgroup_store(lo[j][i], sq + 8*j*T + i*8, T, 0, false);
                }
            }
        }

        device float4 * dst4 = (device float4 *) dst;

        // final rescale with 1/S and store to global memory
        if (sgitg == 0) {
            for (int64_t j = 0; j < Q && iq1 + j < ne01; ++j) {
                const half S = ss[j*T + 0];

                for (int64_t i = tiisg; i < D4; i += NW) {
                    dst4[(iq3*ne2*ne1 + iq2 + (iq1 + j)*ne1)*D4 + i] = (float4) sq4[j*T4 + i]/S;
                }
            }
        }
@ -2319,7 +2351,10 @@ kernel void kernel_flash_attn_ext_f16(
template [[host_name("kernel_flash_attn_ext_f16_h64" )]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<64, 8, 32>;
template [[host_name("kernel_flash_attn_ext_f16_h80" )]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<80, 8, 32>;
template [[host_name("kernel_flash_attn_ext_f16_h96" )]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<96, 8, 32>;
template [[host_name("kernel_flash_attn_ext_f16_h112")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<112, 8, 32>;
template [[host_name("kernel_flash_attn_ext_f16_h128")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<128, 8, 32>;
template [[host_name("kernel_flash_attn_ext_f16_h256")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<256, 8, 32>;
kernel void kernel_cpy_f16_f16(
device const half * src0,

@ -714,7 +714,6 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
dst[row] = tmp[0];
}
}
);
@ -784,6 +783,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
dst[row] = tmp[0];
}
}
);
@ -799,6 +799,18 @@ __kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y
}
);
std::string add_template = MULTILINE_QUOTE(
__kernel void add_f32(__global float * x, const int x_offset, __global float * y, const int y_offset, __global float * dst, const int dst_offset, const int ky) {
const int i = get_group_id(0)*get_local_size(0) + get_local_id(0);
if (i >= get_global_size(0)) {
return;
}
dst[dst_offset + i] = x[x_offset + i] + y[y_offset + i%ky];
}
);
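The new add_f32 kernel broadcasts y over x through the `i % ky` index. As a reference only, this is a hedged CPU-side sketch of what one (i02, i03) plane of that computation does; the function name and the per-plane offsets handled by the host code below are the editor's assumptions, not part of this diff:

    // Reference sketch: one plane of the add_f32 broadcast, ky = ne10*ne11.
    static void add_f32_plane_ref(const float * x, const float * y, float * dst,
                                  const int64_t ne00, const int64_t ne01, const int64_t ky) {
        for (int64_t i = 0; i < ne00*ne01; ++i) {
            dst[i] = x[i] + y[i % ky];  // same broadcast rule as the OpenCL kernel
        }
    }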
#define CL_CHECK(err) \
do { \
cl_int err_ = (err); \
@ -878,6 +890,7 @@ static std::string generate_kernels() {
}
src << mul_kernel << '\n';
}
src << add_template << '\n';
return src.str();
}
@ -893,6 +906,7 @@ static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl,
static cl_kernel dequantize_block_q2_k_cl, dequantize_block_q3_k_cl, dequantize_block_q4_k_cl, dequantize_block_q5_k_cl, dequantize_block_q6_k_cl;
static cl_kernel dequantize_mul_mat_vec_q2_K_cl, dequantize_mul_mat_vec_q3_K_cl, dequantize_mul_mat_vec_q4_K_cl, dequantize_mul_mat_vec_q5_K_cl, dequantize_mul_mat_vec_q6_K_cl;
static cl_kernel mul_f32_cl;
static cl_kernel add_f32_cl;
static bool fp16_support;
static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
@ -1100,9 +1114,10 @@ void ggml_cl_init(void) {
char *ext_buffer = (char *)alloca(ext_str_size + 1);
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated
// Disabled due to faulty outputs
// Check if ext_buffer contains cl_khr_fp16
fp16_support = false; // strstr(ext_buffer, "cl_khr_fp16") != NULL;
// fprintf(stderr, "ggml_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false");
cl_context_properties properties[] = {
(intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)platform, 0
@ -1150,6 +1165,8 @@ void ggml_cl_init(void) {
// mul kernel
CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err));
CL_CHECK((add_f32_cl = clCreateKernel(program, "add_f32", &err), err));
}
static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) {
@ -1458,6 +1475,70 @@ void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src
ggml_cl_mul_f32(src0, src1, dst);
}
static void ggml_cl_add_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[3];
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
const int64_t ne12 = src1->ne[2];
const int64_t ne13 = src1->ne[3];
const int nb2 = dst->nb[2];
const int nb3 = dst->nb[3];
size_t x_size;
size_t d_size;
cl_mem d_X = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &x_size); // src0
cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
cl_mem d_D = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &d_size); // dst
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
cl_event ev;
// copy src0 to device
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, &ev));
const int64_t i13 = i03%ne13;
const int64_t i12 = i02%ne12;
const int i1 = i13*ne12*ne11 + i12*ne11;
cl_int x_offset = 0;
cl_int y_offset = i1*ne10;
cl_int d_offset = 0;
size_t global = ne00 * ne01;
cl_int ky = ne10 * ne11;
CL_CHECK(clSetKernelArg(add_f32_cl, 0, sizeof(cl_mem), &d_X));
CL_CHECK(clSetKernelArg(add_f32_cl, 1, sizeof(cl_int), &x_offset));
CL_CHECK(clSetKernelArg(add_f32_cl, 2, sizeof(cl_mem), &d_Y));
CL_CHECK(clSetKernelArg(add_f32_cl, 3, sizeof(cl_int), &y_offset));
CL_CHECK(clSetKernelArg(add_f32_cl, 4, sizeof(cl_mem), &d_D));
CL_CHECK(clSetKernelArg(add_f32_cl, 5, sizeof(cl_int), &d_offset));
CL_CHECK(clSetKernelArg(add_f32_cl, 6, sizeof(cl_int), &ky));
CL_CHECK(clEnqueueNDRangeKernel(queue, add_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
CL_CHECK(clReleaseEvent(ev));
CL_CHECK(clFinish(queue));
// copy dst to host
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * ne00*ne01, d, 0, NULL, NULL));
}
}
ggml_cl_pool_free(d_X, x_size);
ggml_cl_pool_free(d_D, d_size);
}
void ggml_cl_add(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
ggml_cl_add_f32(src0, src1, dst);
}
static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
@ -2055,6 +2136,7 @@ static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = {
/* .get_name = */ ggml_backend_opencl_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_opencl_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_opencl_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // TODO: return from device info
/* .get_alloc_size = */ NULL,
/* .supports_backend = */ ggml_backend_opencl_buffer_type_supports_backend,
/* .is_host = */ NULL,
@ -2111,6 +2193,7 @@ ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type() {
/* .get_name = */ ggml_backend_opencl_host_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_opencl_host_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
/* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,

@ -10,6 +10,7 @@ extern "C" {
GGML_API void ggml_cl_init(void);
GGML_API void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
GGML_API void ggml_cl_add(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst);
GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
GGML_API void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);

ggml-sycl.cpp: new file, 15199 lines (diff suppressed because it is too large)

ggml-sycl.h: new file, 27 lines
@ -0,0 +1,27 @@
/*MIT license
Copyright (C) 2024 Intel Corporation
SPDX-License-Identifier: MIT
*/
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
#define GGML_SYCL_MAX_DEVICES 16
#define GGML_SYCL_NAME "SYCL"
GGML_API void ggml_init_sycl(void);
GGML_API bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
GGML_API void ggml_backend_sycl_print_sycl_devices(void);
#ifdef __cplusplus
}
#endif
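A minimal usage sketch (not part of this diff) of the SYCL backend entry points declared above, combined with the generic ggml-backend helpers; the device index 0 is an arbitrary choice:

    #include "ggml-backend.h"
    #include "ggml-sycl.h"
    #include <stdio.h>

    int main(void) {
        ggml_backend_sycl_print_sycl_devices();              // list the SYCL devices ggml can see
        ggml_backend_t backend = ggml_backend_sycl_init(0);  // initialize device 0
        if (backend == NULL) {
            fprintf(stderr, "failed to initialize the SYCL backend\n");
            return 1;
        }
        // ... allocate tensors via ggml_backend_sycl_buffer_type(0) and run graphs here ...
        ggml_backend_free(backend);
        return 0;
    }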

ggml-vulkan-shaders.hpp: new file, 61420 lines (diff suppressed because it is too large)

ggml-vulkan.cpp: new file, 5176 lines (diff suppressed because it is too large)

ggml-vulkan.h: new file, 34 lines
@ -0,0 +1,34 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
#define GGML_VK_NAME "Vulkan"
GGML_API void ggml_vk_init(void);
GGML_API void ggml_vk_preallocate_buffers_graph(struct ggml_tensor * node);
GGML_API void ggml_vk_preallocate_buffers(void);
GGML_API void ggml_vk_build_graph(struct ggml_tensor * node, bool last_node);
GGML_API bool ggml_vk_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
#ifdef GGML_VULKAN_CHECK_RESULTS
void ggml_vk_check_results_1(struct ggml_compute_params * params, struct ggml_tensor * tensor);
#endif
GGML_API void ggml_vk_graph_cleanup(void);
// backend API
GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(void);
GGML_API GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend);
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(void);
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
#ifdef __cplusplus
}
#endif
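For orientation, a sketch of how the non-backend Vulkan entry points above are driven per graph; it mirrors the call sites added to ggml_graph_compute further down in this diff and is an editor's illustration, not code from the patch:

    // Sketch: prepare a graph for the Vulkan path (see the ggml.c hunks below).
    static void vk_prepare_graph(struct ggml_cgraph * cgraph) {
        for (int i = 0; i < cgraph->n_nodes; i++) {
            ggml_vk_preallocate_buffers_graph(cgraph->nodes[i]);
        }
        ggml_vk_preallocate_buffers();
        for (int i = 0; i < cgraph->n_nodes; i++) {
            ggml_vk_build_graph(cgraph->nodes[i], i == cgraph->n_nodes - 1);
        }
        // ggml_vk_compute_forward() then runs per node during execution, and
        // ggml_vk_graph_cleanup() is called once the whole graph is done.
    }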

ggml.c: 410 changed lines
@ -248,6 +248,10 @@ inline static void * ggml_aligned_malloc(size_t size) {
#include "ggml-cuda.h" #include "ggml-cuda.h"
#elif defined(GGML_USE_CLBLAST) #elif defined(GGML_USE_CLBLAST)
#include "ggml-opencl.h" #include "ggml-opencl.h"
#elif defined(GGML_USE_VULKAN)
#include "ggml-vulkan.h"
#elif defined(GGML_USE_SYCL)
#include "ggml-sycl.h"
#endif #endif
// floating point type used to accumulate sums // floating point type used to accumulate sums
@ -1344,12 +1348,12 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, const
// leftovers
for (int i = np; i < n; ++i) {
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
}
#else
// scalar
for (int i = 0; i < n; ++i) {
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
}
#endif
}
@ -1478,6 +1482,9 @@ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) {
inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
// TODO: optimize performance
inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
static const float GELU_COEF_A = 0.044715f; static const float GELU_COEF_A = 0.044715f;
static const float GELU_QUICK_COEF = -1.702f; static const float GELU_QUICK_COEF = -1.702f;
@ -1838,9 +1845,11 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
"GELU", "GELU",
"GELU_QUICK", "GELU_QUICK",
"SILU", "SILU",
"HARDSWISH",
"HARDSIGMOID",
}; };
static_assert(GGML_UNARY_OP_COUNT == 10, "GGML_UNARY_OP_COUNT != 10"); static_assert(GGML_UNARY_OP_COUNT == 12, "GGML_UNARY_OP_COUNT != 12");
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
@ -2350,6 +2359,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
ggml_init_cublas();
#elif defined(GGML_USE_CLBLAST)
ggml_cl_init();
#elif defined(GGML_USE_VULKAN)
ggml_vk_init();
#elif defined(GGML_USE_SYCL)
ggml_init_sycl();
#endif
ggml_setup_op_has_task_pass();
@ -4007,6 +4020,20 @@ struct ggml_tensor * ggml_silu_back(
return result;
}
// ggml hardswish
struct ggml_tensor * ggml_hardswish(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_unary(ctx, a, GGML_UNARY_OP_HARDSWISH);
}
// ggml hardsigmoid
struct ggml_tensor * ggml_hardsigmoid(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_unary(ctx, a, GGML_UNARY_OP_HARDSIGMOID);
}
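A small, illustrative sketch (not from this diff) of using the two new unary ops through the public API; the tensor size, fill value, and thread count are arbitrary:

    static void hardswish_example(void) {
        struct ggml_init_params ip = { 16*1024*1024, NULL, false };
        struct ggml_context * ctx = ggml_init(ip);

        struct ggml_tensor * x = ggml_set_f32(ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8), 1.0f);
        struct ggml_tensor * y = ggml_hardswish  (ctx, x); // x * relu6(x + 3) / 6
        struct ggml_tensor * z = ggml_hardsigmoid(ctx, x); //     relu6(x + 3) / 6

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, y);
        ggml_build_forward_expand(gf, z);
        ggml_graph_compute_with_ctx(ctx, gf, 1 /* n_threads */);

        ggml_free(ctx);
    }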
// ggml_norm // ggml_norm
static struct ggml_tensor * ggml_norm_impl( static struct ggml_tensor * ggml_norm_impl(
@ -5408,6 +5435,31 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
return result; return result;
} }
// ggml_conv_depthwise
struct ggml_tensor * ggml_conv_depthwise_2d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int s0,
int s1,
int p0,
int p1,
int d0,
int d1) {
struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
struct ggml_tensor * im2col = ggml_im2col(ctx, new_a,
ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
s0, s1, p0, p1, d0, d1, true); // [N * IC, OH, OW, KH * KW]
struct ggml_tensor * result =
ggml_mul_mat(ctx,
ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1), // [OC1, KH, KW] => [1, OC, 1, KH * KW]
ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3])); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW]
return result;
}
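A hedged usage fragment for the new depthwise convolution, assuming an existing ggml_context named ctx; the layout follows the reshapes above (kernel treated as [KW, KH, 1, C], input as [W, H, C, N]) and the concrete sizes are made up for illustration:

    // Depthwise 3x3 over a 32x32 input with 16 channels, batch 1,
    // stride 1, padding 1, dilation 1 -> output stays [32, 32, 16, 1].
    struct ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F16,  3,  3,  1, 16); // kernel
    struct ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 32, 32, 16,  1); // input
    struct ggml_tensor * c = ggml_conv_depthwise_2d(ctx, a, b, 1, 1, 1, 1, 1, 1);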
// ggml_conv_2d // ggml_conv_2d
// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
@ -7278,6 +7330,17 @@ static void ggml_compute_forward_add_f32(
const int ith = params->ith;
const int nth = params->nth;
#ifdef GGML_USE_CLBLAST
if (src1->backend == GGML_BACKEND_GPU) {
// TODO: OpenCL kernel support full broadcast
GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
if (ith == 0) {
ggml_cl_add(src0, src1, dst);
}
return;
}
#endif
const int nr = ggml_nrows(src0);
GGML_TENSOR_BINARY_OP_LOCALS
@ -7558,7 +7621,12 @@ static void ggml_compute_forward_add(
switch (src0->type) {
case GGML_TYPE_F32:
{
if (src1->type == GGML_TYPE_F32) {
ggml_compute_forward_add_f32(params, src0, src1, dst);
}
else {
GGML_ASSERT(false);
}
} break;
case GGML_TYPE_F16:
{
@ -7879,6 +7947,9 @@ static void ggml_compute_forward_acc_f32(
bool inplace = (bool) ((int32_t *) dst->op_params)[4];
if (!inplace && (params->type == GGML_TASK_INIT)) {
if (params->ith != 0) {
return;
}
// memcpy needs to be synchronized across threads to avoid race conditions.
// => do it in INIT phase
memcpy(
@ -8067,7 +8138,7 @@ static void ggml_compute_forward_mul_f32(
const int ith = params->ith;
const int nth = params->nth;
#if defined(GGML_USE_CLBLAST)
if (src1->backend == GGML_BACKEND_GPU) {
// TODO: OpenCL kernel support full broadcast
GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
@ -9448,6 +9519,87 @@ static void ggml_compute_forward_silu_back(
}
}
static void ggml_compute_forward_hardswish_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
assert(params->ith == 0);
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert(dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_hardswish_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
(float *) ((char *) src0->data + i*(src0->nb[1])));
}
}
static void ggml_compute_forward_hardswish(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_hardswish_f32(params, src0, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
static void ggml_compute_forward_hardsigmoid_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
assert(params->ith == 0);
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert(dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_hardsigmoid_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
(float *) ((char *) src0->data + i*(src0->nb[1])));
}
}
static void ggml_compute_forward_hardsigmoid(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_hardsigmoid_f32(params, src0, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_norm // ggml_compute_forward_norm
static void ggml_compute_forward_norm_f32( static void ggml_compute_forward_norm_f32(
@ -9940,11 +10092,30 @@ static void ggml_compute_forward_mul_mat(
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
    if (ggml_compute_forward_mul_mat_use_blas(dst)) {
        const int64_t ne_plane      = ne01*ne00;
        const size_t  desired_wsize = ne13*ne12*ne_plane*sizeof(float);
        UNUSED(desired_wsize);

        if (params->type == GGML_TASK_INIT) {
            if (type != GGML_TYPE_F32) {
                assert(params->wsize >= desired_wsize);
                // parallelize by src0 rows
                for (int64_t i13 = 0; i13 < ne13; i13++) {
                    for (int64_t i12 = 0; i12 < ne12; i12++) {
                        // broadcast src0 into src1 across 2nd,3rd dimension
                        const int64_t i03 = i13/r3;
                        const int64_t i02 = i12/r2;

                        const void            *       x        = (char *)  src0->data + i02*nb02 + i03*nb03;
                              float           * const wdata    = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
                              ggml_to_float_t   const to_float = type_traits[type].to_float;

                        for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
                            to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
                        }
                    }
                }
            }
            return;
        }
@ -9952,9 +10123,14 @@ static void ggml_compute_forward_mul_mat(
            return;
        }

        // perform sgemm, parallelization controlled by blas lib
        if (ith != 0) {
            return;
        }

        //const int64_t tgemm0 = ggml_perf_time_us();
        for (int64_t i13 = 0; i13 < ne13; i13++) {
            for (int64_t i12 = 0; i12 < ne12; i12++) {
                const int64_t i03 = i13/r3;
                const int64_t i02 = i12/r2;
@ -9963,17 +10139,7 @@ static void ggml_compute_forward_mul_mat(
                float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);

                if (type != GGML_TYPE_F32) {
                    x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
                }

                cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
@ -9983,6 +10149,7 @@ static void ggml_compute_forward_mul_mat(
                            0.0f, d, ne01);
            }
        }
        //printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2);

        //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
@ -9991,6 +10158,9 @@ static void ggml_compute_forward_mul_mat(
#endif
if (params->type == GGML_TASK_INIT) {
if (ith != 0) {
return;
}
if (src1->type != vec_dot_type) {
char * wdata = params->wdata;
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@ -10155,6 +10325,9 @@ static void ggml_compute_forward_mul_mat_id(
#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
if (params->type == GGML_TASK_INIT) {
if (ith != 0) {
return;
}
char * wdata = params->wdata;
if (src1->type != vec_dot_type) {
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@ -10340,6 +10513,9 @@ static void ggml_compute_forward_out_prod_f32(
return;
}
#endif
if (ith != 0) {
return;
}
ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
return;
}
@ -10523,6 +10699,9 @@ static void ggml_compute_forward_out_prod_q_f32(
// TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
if (params->type == GGML_TASK_INIT) {
if (ith != 0) {
return;
}
ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
return;
}
@ -10707,6 +10886,9 @@ static void ggml_compute_forward_set_f32(
bool inplace = (bool) ((int32_t *) dst->op_params)[4];
if (!inplace && (params->type == GGML_TASK_INIT)) {
if (params->ith != 0) {
return;
}
// memcpy needs to be synchronized across threads to avoid race conditions.
// => do it in INIT phase
memcpy(
@ -11031,6 +11213,9 @@ static void ggml_compute_forward_get_rows_back_f32_f16(
// ggml_compute_forward_dup_same_cont(params, opt0, dst);
if (params->type == GGML_TASK_INIT) {
if (params->ith != 0) {
return;
}
memset(dst->data, 0, ggml_nbytes(dst));
}
@ -11065,6 +11250,9 @@ static void ggml_compute_forward_get_rows_back_f32(
// ggml_compute_forward_dup_same_cont(params, opt0, dst);
if (params->type == GGML_TASK_INIT) {
if (params->ith != 0) {
return;
}
memset(dst->data, 0, ggml_nbytes(dst));
}
@ -11202,6 +11390,9 @@ static void ggml_compute_forward_diag_mask_f32(
GGML_ASSERT(n_past >= 0);
if (!inplace && (params->type == GGML_TASK_INIT)) {
if (ith != 0) {
return;
}
// memcpy needs to be synchronized across threads to avoid race conditions.
// => do it in INIT phase
GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
@ -12172,6 +12363,9 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
GGML_ASSERT(nb10 == sizeof(float));
if (params->type == GGML_TASK_INIT) {
if (ith != 0) {
return;
}
memset(params->wdata, 0, params->wsize);
// permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
@ -12266,6 +12460,9 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
GGML_ASSERT(nb10 == sizeof(float));
if (params->type == GGML_TASK_INIT) {
if (ith != 0) {
return;
}
memset(params->wdata, 0, params->wsize);
// prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
@ -12464,6 +12661,7 @@ static void ggml_compute_forward_im2col(
}
}
// ggml_compute_forward_conv_transpose_2d
static void ggml_compute_forward_conv_transpose_2d(
@ -12489,6 +12687,9 @@ static void ggml_compute_forward_conv_transpose_2d(
GGML_ASSERT(nb10 == sizeof(float));
if (params->type == GGML_TASK_INIT) {
if (ith != 0) {
return;
}
memset(params->wdata, 0, params->wsize);
// permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout)
@ -13353,11 +13554,9 @@ static void ggml_compute_forward_flash_attn_ext_f16(
const int64_t D = neq0;
const int64_t N = neq1;
GGML_ASSERT(ne0 == D);
GGML_ASSERT(ne2 == N);
GGML_ASSERT(nbq0 == sizeof(float));
GGML_ASSERT(nbk0 == sizeof(ggml_fp16_t));
@ -13368,7 +13567,6 @@ static void ggml_compute_forward_flash_attn_ext_f16(
GGML_ASSERT(nev0 == D);
GGML_ASSERT(neq1 == N);
GGML_ASSERT(nev0 == D);
// dst cannot be transposed or permuted
@ -13407,8 +13605,6 @@ static void ggml_compute_forward_flash_attn_ext_f16(
float scale = 1.0f;
memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
// loop over n_batch and n_head
for (int ir = ir0; ir < ir1; ++ir) {
// q indices
@ -14228,6 +14424,14 @@ static void ggml_compute_forward_unary(
{
ggml_compute_forward_silu(params, src0, dst);
} break;
case GGML_UNARY_OP_HARDSWISH:
{
ggml_compute_forward_hardswish(params, src0, dst);
} break;
case GGML_UNARY_OP_HARDSIGMOID:
{
ggml_compute_forward_hardsigmoid(params, src0, dst);
} break;
default:
{
GGML_ASSERT(false);
@ -14291,6 +14495,9 @@ static void ggml_compute_forward_add_rel_pos_f32(
const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
if (!inplace && params->type == GGML_TASK_INIT) {
if (params->ith != 0) {
return;
}
memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
return;
}
@ -14806,8 +15013,26 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
}
GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
#elif defined(GGML_USE_VULKAN)
const bool skip_cpu = ggml_vk_compute_forward(params, tensor);
#ifdef GGML_VULKAN_CHECK_RESULTS
if (skip_cpu) {
ggml_vk_check_results_1(params, tensor);
}
#endif
if (skip_cpu) {
return;
}
GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
#endif // GGML_USE_CUBLAS
#ifdef GGML_USE_SYCL
bool skip_cpu = ggml_sycl_compute_forward(params, tensor);
if (skip_cpu) {
return;
}
#endif // GGML_USE_SYCL
switch (tensor->op) {
case GGML_OP_DUP:
{
@ -16591,6 +16816,7 @@ struct ggml_compute_state_shared {
// synchronization primitives
atomic_int n_active; // num active threads
atomic_int node_n; // active graph node
atomic_int node_task; // active graph node task phase
bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
void * abort_callback_data;
@ -16646,6 +16872,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_UNARY_OP_TANH:
case GGML_UNARY_OP_ELU:
case GGML_UNARY_OP_RELU:
case GGML_UNARY_OP_HARDSWISH: // to opt for multiple threads
case GGML_UNARY_OP_HARDSIGMOID: // to opt for multiple threads
{
n_tasks = 1;
} break;
@ -16722,7 +16950,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
} break;
case GGML_OP_SOFT_MAX:
{
n_tasks = MIN(n_threads, ggml_nrows(node->src[0]));
} break;
case GGML_OP_CONV_TRANSPOSE_1D:
{
@ -16837,6 +17065,34 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
return n_tasks;
}
static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_compute_state * state, const bool do_yield) {
// wait for other threads to finish
const int last_node_n = * node_n;
while (true) {
if (do_yield) {
sched_yield();
}
* node_n = atomic_load(&state->shared->node_n);
if (* node_n != last_node_n) break;
}
}
static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
// wait for other threads to finish
const int last_task_phase = * task_phase;
while (true) {
if (do_yield) {
sched_yield();
}
* task_phase = atomic_load(&state->shared->node_task);
if (* task_phase != last_task_phase) break;
}
}
static thread_ret_t ggml_graph_compute_thread(void * data) {
struct ggml_compute_state * state = (struct ggml_compute_state *) data;
@ -16848,6 +17104,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
    set_numa_thread_affinity(state->ith, n_threads);

    int node_n     = -1;
    int task_phase = GGML_TASK_FINALIZE;

    while (true) {
        if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
@ -16879,7 +17136,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
            // distribute new work or execute it direct if 1T
            while (++node_n < cgraph->n_nodes) {
                GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
                struct ggml_tensor * node = cgraph->nodes[node_n];
                const int n_tasks = ggml_get_n_tasks(node, n_threads);
@ -16888,13 +17144,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                params.nth = n_tasks;

                /* INIT */
                if (GGML_OP_HAS_INIT[node->op]) {
                    params.type = GGML_TASK_INIT;
                    ggml_compute_forward(&params, node);
                }

                if (n_tasks == 1) {
                    // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
                    // they do something more efficient than spinning (?)
                    params.type = GGML_TASK_COMPUTE;
@ -16915,38 +17171,24 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                }
            }

            task_phase = GGML_TASK_INIT;
            atomic_store(&state->shared->n_active,  n_threads);
            atomic_store(&state->shared->node_n,    node_n);
            atomic_store(&state->shared->node_task, task_phase);
        } else {
            ggml_graph_compute_thread_sync_node(&node_n,     state, false);
            ggml_graph_compute_thread_sync_task(&task_phase, state, false);
        }

        // check if we should stop
        if (node_n >= cgraph->n_nodes) break;

        /* INIT & COMPUTE */
        struct ggml_tensor * node = cgraph->nodes[node_n];
        const int n_tasks = ggml_get_n_tasks(node, n_threads);

        struct ggml_compute_params params = {
            /*.type  =*/ GGML_TASK_INIT,
            /*.ith   =*/ state->ith,
            /*.nth   =*/ n_tasks,
            /*.wsize =*/ cplan->work_size,
@ -16954,10 +17196,41 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
        };

        if (state->ith < n_tasks) {
            if (GGML_OP_HAS_INIT[node->op]) {
                ggml_compute_forward(&params, node);
            }
        }

        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
            task_phase = GGML_TASK_COMPUTE;
            atomic_store(&state->shared->n_active,  n_threads);
            atomic_store(&state->shared->node_task, task_phase);
        }
        else {
            // TODO: this sched_yield can have significant impact on the performance - either positive or negative
            //       depending on the workload and the operating system.
            //       since it is not clear what is the best approach, it should potentially become user-configurable
            //       ref: https://github.com/ggerganov/ggml/issues/291
            // UPD:  adding the do_yield flag seems to resolve the issue universally
            const bool do_yield = node_n < 0 || cgraph->nodes[node_n]->op == GGML_OP_MUL_MAT;
            ggml_graph_compute_thread_sync_task(&task_phase, state, do_yield);
        }

        if (state->ith < n_tasks) {
            params.type = GGML_TASK_COMPUTE;
            ggml_compute_forward(&params, node);
        }

        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
            task_phase = GGML_TASK_FINALIZE;
            atomic_store(&state->shared->n_active,  n_threads);
            atomic_store(&state->shared->node_task, task_phase);
        }
        else {
            ggml_graph_compute_thread_sync_task(&task_phase, state, false);
        }
    }

    return GGML_EXIT_SUCCESS;
}
@ -17012,8 +17285,11 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(node)) {
if (node->src[0]->type != GGML_TYPE_F32) {
// here we need memory for fully dequantized matrix from src0
// take into account that src0 can be broadcasted into src1[2,3]
cur = ggml_type_size(GGML_TYPE_F32)
* node->src[0]->ne[0]*node->src[0]->ne[1]
* node->src[1]->ne[2]*node->src[1]->ne[3];
}
} else
#endif
@ -17163,6 +17439,17 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
}
}
#ifdef GGML_USE_VULKAN
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_vk_preallocate_buffers_graph(cgraph->nodes[i]);
}
ggml_vk_preallocate_buffers();
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_vk_build_graph(cgraph->nodes[i], i == cgraph->n_nodes - 1);
}
#endif
const int n_threads = cplan->n_threads;
struct ggml_compute_state_shared state_shared = {
@ -17173,6 +17460,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
/*.n_threads =*/ n_threads,
/*.n_active =*/ n_threads,
/*.node_n =*/ -1,
/*.node_task =*/ GGML_TASK_FINALIZE,
/*.abort_callback =*/ NULL,
/*.abort_callback_data =*/ NULL,
};
@ -17213,6 +17501,10 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
}
}
#ifdef GGML_USE_VULKAN
ggml_vk_graph_cleanup();
#endif
// performance stats (graph)
{
int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
@ -20347,7 +20639,7 @@ int ggml_cpu_has_wasm_simd(void) {
}
int ggml_cpu_has_blas(void) {
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
return 1;
#else
return 0;
@ -20370,8 +20662,24 @@ int ggml_cpu_has_clblast(void) {
#endif
}
int ggml_cpu_has_vulkan(void) {
#if defined(GGML_USE_VULKAN)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_sycl(void) {
#if defined(GGML_USE_SYCL)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_gpublas(void) {
return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_sycl();
}
int ggml_cpu_has_sse3(void) {

ggml.h: 25 changed lines
@ -490,6 +490,8 @@ extern "C" {
GGML_UNARY_OP_GELU,
GGML_UNARY_OP_GELU_QUICK,
GGML_UNARY_OP_SILU,
GGML_UNARY_OP_HARDSWISH,
GGML_UNARY_OP_HARDSIGMOID,
GGML_UNARY_OP_COUNT,
};
@ -1033,6 +1035,16 @@ extern "C" {
struct ggml_tensor * a,
struct ggml_tensor * b);
// hardswish(x) = x * relu6(x + 3) / 6
GGML_API struct ggml_tensor * ggml_hardswish(
struct ggml_context * ctx,
struct ggml_tensor * a);
// hardsigmoid(x) = relu6(x + 3) / 6
GGML_API struct ggml_tensor * ggml_hardsigmoid(
struct ggml_context * ctx,
struct ggml_tensor * a);
// normalize along rows
GGML_API struct ggml_tensor * ggml_norm(
struct ggml_context * ctx,
@ -1484,6 +1496,17 @@ extern "C" {
int d1,
bool is_2D);
GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int s0,
int s1,
int p0,
int p1,
int d0,
int d1);
GGML_API struct ggml_tensor * ggml_conv_1d(
struct ggml_context * ctx,
struct ggml_tensor * a,
@ -2258,9 +2281,11 @@ extern "C" {
GGML_API int ggml_cpu_has_blas (void);
GGML_API int ggml_cpu_has_cublas (void);
GGML_API int ggml_cpu_has_clblast (void);
GGML_API int ggml_cpu_has_vulkan (void);
GGML_API int ggml_cpu_has_gpublas (void);
GGML_API int ggml_cpu_has_sse3 (void);
GGML_API int ggml_cpu_has_ssse3 (void);
GGML_API int ggml_cpu_has_sycl (void);
GGML_API int ggml_cpu_has_vsx (void);
//

ggml_vk_generate_shaders.py: new file, 2362 lines (diff suppressed because it is too large)
@ -101,6 +101,7 @@ class MODEL_ARCH(IntEnum):
PHI2 = auto()
PLAMO = auto()
CODESHELL = auto()
ORION = auto()
class MODEL_TENSOR(IntEnum):
@ -151,6 +152,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.PHI2: "phi2",
MODEL_ARCH.PLAMO: "plamo",
MODEL_ARCH.CODESHELL: "codeshell",
MODEL_ARCH.ORION: "orion",
}
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@ -427,7 +429,23 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.ORION: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.ATTN_ROT_EMBD,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
# TODO
}
@ -452,6 +470,10 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_ROT_EMBD,
],
MODEL_ARCH.ORION: [
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_ROT_EMBD,
],
}
#

@ -107,7 +107,7 @@ class GGUFReader:
offs, tensors_fields = self._build_tensors_fields(offs, tensor_count)
new_align = self.fields.get('general.alignment')
if new_align is not None:
if new_align.types != [GGUFValueType.UINT32]:
raise ValueError('Bad type for general.alignment field')
self.alignment = new_align.parts[-1][0]
padding = offs % self.alignment

llama.cpp: 943 changed lines (diff suppressed because it is too large)

llama.h: 14 changed lines
@ -6,6 +6,9 @@
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
#elif defined(GGML_USE_SYCL)
#include "ggml-sycl.h"
#define LLAMA_MAX_DEVICES GGML_SYCL_MAX_DEVICES
#else
#define LLAMA_MAX_DEVICES 1
#endif // GGML_USE_CUBLAS
@ -46,7 +49,7 @@
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
#define LLAMA_SESSION_VERSION 4
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
#define LLAMA_SUPPORTS_GPU_OFFLOAD
#endif
@ -107,6 +110,7 @@ extern "C" {
LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
};
@ -774,6 +778,14 @@ extern "C" {
float p,
size_t min_keep);
/// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.
LLAMA_API void llama_sample_entropy(
struct llama_context * ctx,
llama_token_data_array * candidates_p,
float min_temp,
float max_temp,
float exponent_val);
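As a rough sketch of the idea behind llama_sample_entropy (an editor's illustration of the paper's scheme, not the library's exact code): the effective temperature is interpolated between min_temp and max_temp according to the normalized entropy of the candidate distribution, shaped by exponent_val:

    #include <math.h>

    // H(p) / H_max in [0, 1] decides where between min_temp and max_temp we land.
    static float dynatemp_sketch(const float * probs, int n,
                                 float min_temp, float max_temp, float exponent_val) {
        float entropy = 0.0f;
        for (int i = 0; i < n; ++i) {
            if (probs[i] > 0.0f) {
                entropy -= probs[i] * logf(probs[i]);
            }
        }
        const float max_entropy = logf((float) n); // entropy of the uniform distribution
        const float norm = max_entropy > 0.0f ? entropy / max_entropy : 0.0f;
        return min_temp + (max_temp - min_temp) * powf(norm, exponent_val);
    }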
LLAMA_API void llama_sample_temp(
struct llama_context * ctx,
llama_token_data_array * candidates,

@ -4,3 +4,4 @@ allow_untyped_calls = true
allow_untyped_defs = true
allow_incomplete_defs = true
disable_error_code = import-untyped
warn_return_any = false

@ -243,7 +243,6 @@ int main(int argc, char** argv) {
if (useQ4_1) q41.resize(n4);
else q40.resize(n4);
std::vector<block_q8_0> q8(n8);
double sumt = 0, sumt2 = 0, maxt = 0;
double sumqt = 0, sumqt2 = 0, maxqt = 0;
double sum = 0, sumq = 0, exactSum = 0;

scripts/ci-run.sh: new executable file, 50 lines
@ -0,0 +1,50 @@
#!/bin/bash
set -euo pipefail
this=$(realpath "$0"); readonly this
cd "$(dirname "$this")"
shellcheck "$this"
if (( $# != 1 && $# != 2 )); then
cat >&2 <<'EOF'
usage:
ci-run.sh <tmp_dir> [<cache_dir>]
This script wraps ci/run.sh:
* If <tmp_dir> is a ramdisk, you can reduce writes to your SSD. If <tmp_dir> is not a ramdisk, keep in mind that total writes will increase by the size of <cache_dir>.
(openllama_3b_v2: quantized models are about 30GB)
* Persistent model and data files are synced to and from <cache_dir>,
excluding generated .gguf files.
(openllama_3b_v2: persistent files are about 6.6GB)
* <cache_dir> defaults to ~/.cache/llama.cpp
EOF
exit 1
fi
cd .. # => llama.cpp repo root
tmp="$1"
mkdir -p "$tmp"
tmp=$(realpath "$tmp")
echo >&2 "Using tmp=$tmp"
cache="${2-$HOME/.cache/llama.cpp}"
mkdir -p "$cache"
cache=$(realpath "$cache")
echo >&2 "Using cache=$cache"
_sync() {
local from="$1"; shift
local to="$1"; shift
echo >&2 "Syncing from $from to $to"
mkdir -p "$from" "$to"
rsync -a "$from" "$to" --delete-during "$@"
}
_sync "$(realpath .)/" "$tmp/llama.cpp"
_sync "$cache/ci-mnt/models/" "$tmp/llama.cpp/ci-mnt/models/"
cd "$tmp/llama.cpp"
bash ci/run.sh ci-out ci-mnt
_sync 'ci-mnt/models/' "$cache/ci-mnt/models/" --exclude='*.gguf' -P

@ -46,7 +46,7 @@ Formatting considerations:
- To define multiple "reverse_prompt" properties simultaneously the expected format is a list of strings.
- To define a tensor split, pass a list of floats.
"""
usage = "run-with-preset.py [-h] [yaml_files ...] [--<ARG_NAME> <ARG_VALUE> ...]"
epilog = ("  --<ARG_NAME> specify additional CLI ars to be passed to the binary (override all preset files). "
"Unknown args will be ignored.")

@ -1 +1 @@
f2a9472b23cf27e672ed70a2a6eb078f7b060f18

tests/.gitignore: new file (vendored), 3 lines
@ -0,0 +1,3 @@
*
!*.*
test-c.o

@ -1,6 +1,6 @@
function(llama_build_executable source)
get_filename_component(TEST_TARGET ${source} NAME_WE)
add_executable(${TEST_TARGET} ${source} get-model.cpp)
install(TARGETS ${TEST_TARGET} RUNTIME)
target_link_libraries(${TEST_TARGET} PRIVATE common)
endfunction()
@ -8,14 +8,20 @@ endfunction()
function(llama_test_executable name source)
get_filename_component(TEST_TARGET ${source} NAME_WE)
add_test(NAME ${name} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
set_property(TEST ${name} PROPERTY LABELS "main")
endfunction()
function(llama_build_and_test_executable source)
llama_build_and_test_executable_with_label(${source} "main")
endfunction()
function(llama_build_and_test_executable_with_label source label)
get_filename_component(TEST_TARGET ${source} NAME_WE)
add_executable(${TEST_TARGET} ${source} get-model.cpp)
install(TARGETS ${TEST_TARGET} RUNTIME)
target_link_libraries(${TEST_TARGET} PRIVATE common)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
set_property(TEST ${TEST_TARGET} PROPERTY LABELS ${label})
endfunction()
# llama_build_and_test_executable(test-double-float.cpp) # SLOW
@ -49,12 +55,14 @@ llama_build_and_test_executable(test-llama-grammar.cpp)
llama_build_and_test_executable(test-grad0.cpp)
# llama_build_and_test_executable(test-opt.cpp) # SLOW
llama_build_and_test_executable(test-backend-ops.cpp)
llama_build_and_test_executable(test-rope.cpp)
llama_build_executable(test-flash-attention.cpp)
llama_build_and_test_executable_with_label(test-model-load-cancel.cpp "model")
llama_build_and_test_executable_with_label(test-autorelease.cpp "model")
# dummy executable - not installed
get_filename_component(TEST_TARGET test-c.c NAME_WE)
add_executable(${TEST_TARGET} test-c.c)

21
tests/get-model.cpp Normal file
View file

@ -0,0 +1,21 @@
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include "get-model.h"
char * get_model_or_exit(int argc, char *argv[]) {
char * model_path;
if (argc > 1) {
model_path = argv[1];
} else {
model_path = getenv("LLAMACPP_TEST_MODELFILE");
if (!model_path || strlen(model_path) == 0) {
fprintf(stderr, "\033[33mWARNING: No model file provided. Skipping this test. Set LLAMACPP_TEST_MODELFILE=<gguf_model_path> to silence this warning and run this test.\n\033[0m");
exit(EXIT_SUCCESS);
}
}
return model_path;
}
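
Given the helper above, model-gated tests either take the GGUF path as argv[1] or read LLAMACPP_TEST_MODELFILE, and print a warning and pass when neither is set. A hedged sketch of running them through ctest; the model path is hypothetical, and the "model" label comes from the CMakeLists.txt changes above:

# sketch only: the .gguf path is hypothetical
LLAMACPP_TEST_MODELFILE="$HOME/models/openllama-3b-v2-q8_0.gguf" ctest -L model --output-on-failure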

2
tests/get-model.h Normal file
View file

@ -0,0 +1,2 @@
#pragma once
char * get_model_or_exit(int, char*[]);

View file

@ -5,19 +5,15 @@
#include <thread> #include <thread>
#include "llama.h" #include "llama.h"
#include "get-model.h"
// This creates a new context inside a pthread and then tries to exit cleanly. // This creates a new context inside a pthread and then tries to exit cleanly.
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
if (argc < 2) { auto * model_path = get_model_or_exit(argc, argv);
printf("Usage: %s model.gguf\n", argv[0]);
return 0; // intentionally return success
}
const std::string fname = argv[1]; std::thread([&model_path]() {
std::thread([&fname]() {
llama_backend_init(false); llama_backend_init(false);
auto * model = llama_load_model_from_file(fname.c_str(), llama_model_default_params()); auto * model = llama_load_model_from_file(model_path, llama_model_default_params());
auto * ctx = llama_new_context_with_model(model, llama_context_default_params()); auto * ctx = llama_new_context_with_model(model, llama_context_default_params());
llama_free(ctx); llama_free(ctx);
llama_free_model(model); llama_free_model(model);

View file

@ -102,7 +102,6 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
} else if (t->type == GGML_TYPE_I8) { } else if (t->type == GGML_TYPE_I8) {
tv.push_back((float)*(int8_t *) &buf[i]); tv.push_back((float)*(int8_t *) &buf[i]);
} else if (quantized) { } else if (quantized) {
std::vector<float> vq(ggml_blck_size(t->type));
tt.to_float(&buf[i], vq.data(), ggml_blck_size(t->type)); tt.to_float(&buf[i], vq.data(), ggml_blck_size(t->type));
tv.insert(tv.end(), vq.begin(), vq.end()); tv.insert(tv.end(), vq.begin(), vq.end());
} else { } else {
@ -240,10 +239,17 @@ static std::string var_to_str(ggml_type type) {
#define VARS_TO_STR10(a, b, c, d, e, f, g, h, i, j) VAR_TO_STR(a) + "," + VARS_TO_STR9(b, c, d, e, f, g, h, i, j) #define VARS_TO_STR10(a, b, c, d, e, f, g, h, i, j) VAR_TO_STR(a) + "," + VARS_TO_STR9(b, c, d, e, f, g, h, i, j)
#define VARS_TO_STR11(a, b, c, d, e, f, g, h, i, j, k) VAR_TO_STR(a) + "," + VARS_TO_STR10(b, c, d, e, f, g, h, i, j, k) #define VARS_TO_STR11(a, b, c, d, e, f, g, h, i, j, k) VAR_TO_STR(a) + "," + VARS_TO_STR10(b, c, d, e, f, g, h, i, j, k)
#ifdef GGML_USE_SYCL
static bool inline _isinf(float f) {
return (*(uint32_t *)&f & 0x7fffffff) == 0x7f800000;
}
#else
static bool inline _isinf(float f) { return std::isinf(f); }
#endif
// accept FLT_MAX as infinity // accept FLT_MAX as infinity
static bool isinf_or_max(float f) { static bool isinf_or_max(float f) {
return std::isinf(f) || f == FLT_MAX || f == -FLT_MAX; return _isinf(f) || f == FLT_MAX || f == -FLT_MAX;
} }
static bool ggml_is_view_op(enum ggml_op op) { static bool ggml_is_view_op(enum ggml_op op) {
@ -1396,7 +1402,7 @@ struct test_flash_attn_ext : public test_case {
} }
double max_nmse_err() override { double max_nmse_err() override {
return 5e-5; return 5e-4;
} }
test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8) test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8)
@ -1412,6 +1418,48 @@ struct test_flash_attn_ext : public test_case {
} }
}; };
// Attention
struct test_attn : public test_case {
const int64_t hs; // head size
const int64_t nh; // num heads
const int64_t kv; // kv size
const int64_t nb; // batch size
std::string op_desc(ggml_tensor * t) override {
return "ATTN";
GGML_UNUSED(t);
}
std::string vars() override {
return VARS_TO_STR4(hs, nh, kv, nb);
}
double max_nmse_err() override {
return 5e-4;
}
test_attn(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8)
: hs(hs), nh(nh), kv(kv), nb(nb) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, hs, nb, nh, 1);
ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1);
ggml_tensor * v = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, hs, nh, 1); // transposed
ggml_tensor * mask = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, kv, nb, 1, 1);
struct ggml_tensor * cur;
cur = ggml_mul_mat (ctx, k, q);
cur = ggml_soft_max_ext(ctx, cur, mask, 1.0f/sqrtf(hs));
cur = ggml_mul_mat (ctx, v, cur);
cur = ggml_permute (ctx, cur, 0, 2, 1, 3);
cur = ggml_cont_2d (ctx, cur, hs*nh, nb);
return cur;
}
};
// Mixtral MOE // Mixtral MOE
struct test_moe : public test_case { struct test_moe : public test_case {
const int n_experts; const int n_experts;
@ -1678,9 +1726,16 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
test_cases.emplace_back(new test_pad()); test_cases.emplace_back(new test_pad());
test_cases.emplace_back(new test_leaky_relu()); test_cases.emplace_back(new test_leaky_relu());
test_cases.emplace_back(new test_flash_attn_ext(128, 32, 256, 8)); for (int hs : { 64, 80, 96, 112, 128, 256, }) {
test_cases.emplace_back(new test_flash_attn_ext(128, 32, 256, 7)); for (int nh : { 32, }) {
test_cases.emplace_back(new test_flash_attn_ext(128, 32, 256, 1)); for (int kv : { 512, 1024, 2048, 4096, }) {
for (int nb : { 1, 2, 4, 8, 512, 1024, 2048, }) {
test_cases.emplace_back(new test_attn (hs, nh, kv, nb));
test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb));
}
}
}
}
#if !defined(__SANITIZE_THREAD__) #if !defined(__SANITIZE_THREAD__)
// FIXME: these tests use too much memory with thread sanitizer // FIXME: these tests use too much memory with thread sanitizer

View file

@ -190,7 +190,6 @@ int main()
index++; index++;
} }
std::vector<std::vector<const llama_grammar_element *>> next_stacks;
std::vector<llama_grammar_candidate> next_candidates; std::vector<llama_grammar_candidate> next_candidates;
next_candidates.resize(24); next_candidates.resize(24);

View file

@ -0,0 +1,27 @@
#include "llama.h"
#include "get-model.h"
#include <cstdlib>
int main(int argc, char *argv[] ) {
auto * model_path = get_model_or_exit(argc, argv);
auto * file = fopen(model_path, "r");
if (file == nullptr) {
fprintf(stderr, "no model at '%s' found\n", model_path);
return EXIT_FAILURE;
}
fprintf(stderr, "using '%s'\n", model_path);
fclose(file);
llama_backend_init(false);
auto params = llama_model_params{};
params.use_mmap = false;
params.progress_callback = [](float progress, void * ctx){
(void) ctx;
return progress > 0.50;
};
auto * model = llama_load_model_from_file(model_path, params);
llama_backend_free();
return model == nullptr ? EXIT_SUCCESS : EXIT_FAILURE;
}

View file

@ -5,11 +5,10 @@
#undef NDEBUG #undef NDEBUG
#endif #endif
#include <cmath>
#include <numeric>
#include <cassert>
#include <vector>
#include <algorithm> #include <algorithm>
#include <cmath>
#include <string>
#include <vector>
static void dump(const llama_token_data_array * candidates) { static void dump(const llama_token_data_array * candidates) {
for (size_t i = 0; i < candidates->size; i++) { for (size_t i = 0; i < candidates->size; i++) {
@ -20,11 +19,11 @@ static void dump(const llama_token_data_array * candidates) {
#define DUMP(__candidates) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__candidates)); printf("-\n"); } while(0) #define DUMP(__candidates) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__candidates)); printf("-\n"); } while(0)
static void test_top_k(const std::vector<float> & probs, const std::vector<float> & expected_probs, int k) { static void test_top_k(const std::vector<float> & probs, const std::vector<float> & expected_probs, int k) {
size_t n_vocab = probs.size(); const size_t n_vocab = probs.size();
std::vector<llama_token_data> candidates; std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab); candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
float logit = log(probs[token_id]); const float logit = logf(probs[token_id]);
candidates.emplace_back(llama_token_data{token_id, logit, 0.0f}); candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
} }
@ -41,11 +40,11 @@ static void test_top_k(const std::vector<float> & probs, const std::vector<float
} }
static void test_top_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) { static void test_top_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
size_t n_vocab = probs.size(); const size_t n_vocab = probs.size();
std::vector<llama_token_data> candidates; std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab); candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
float logit = log(probs[token_id]); const float logit = logf(probs[token_id]);
candidates.emplace_back(llama_token_data{token_id, logit, 0.0f}); candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
} }
@ -62,11 +61,11 @@ static void test_top_p(const std::vector<float> & probs, const std::vector<float
} }
static void test_tfs(const std::vector<float> & probs, const std::vector<float> & expected_probs, float z) { static void test_tfs(const std::vector<float> & probs, const std::vector<float> & expected_probs, float z) {
size_t n_vocab = probs.size(); const size_t n_vocab = probs.size();
std::vector<llama_token_data> candidates; std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab); candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
float logit = log(probs[token_id]); const float logit = logf(probs[token_id]);
candidates.emplace_back(llama_token_data{token_id, logit, 0.0f}); candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
} }
@ -81,12 +80,33 @@ static void test_tfs(const std::vector<float> & probs, const std::vector<float>
} }
} }
static void test_typical(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) { static void test_min_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
size_t n_vocab = probs.size(); const size_t n_vocab = probs.size();
std::vector<llama_token_data> candidates; std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab); candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
float logit = log(probs[token_id]); const float logit = logf(probs[token_id]);
candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
DUMP(&candidates_p);
llama_sample_min_p(nullptr, &candidates_p, p, 1);
DUMP(&candidates_p);
llama_sample_softmax(nullptr, &candidates_p);
GGML_ASSERT(candidates_p.size == expected_probs.size());
for (size_t i = 0; i < candidates_p.size; i++) {
GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
}
}
static void test_typical(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
const size_t n_vocab = probs.size();
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
const float logit = logf(probs[token_id]);
candidates.emplace_back(llama_token_data{token_id, logit, 0.0f}); candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
} }
@ -107,11 +127,11 @@ static void test_repetition_penalties(
) { ) {
GGML_ASSERT(probs.size() == expected_probs.size()); GGML_ASSERT(probs.size() == expected_probs.size());
size_t n_vocab = probs.size(); const size_t n_vocab = probs.size();
std::vector<llama_token_data> candidates; std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab); candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
float logit = log(probs[token_id]); const float logit = logf(probs[token_id]);
candidates.emplace_back(llama_token_data{token_id, logit, 0.0f}); candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
} }
@ -128,6 +148,88 @@ static void test_repetition_penalties(
} }
} }
static void test_sampler_queue(
const size_t n_vocab, const std::string samplers_sequence, const int top_k, const float top_p, const float min_p
) {
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
const float logit = logf(token_id);
candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
llama_token min_token_id = 0;
const llama_token max_token_id = n_vocab-1;
for (auto s : samplers_sequence) {
switch (s){
case 'k': llama_sample_top_k (nullptr, &candidates_p, top_k, 1); break;
case 'f': GGML_ASSERT(false && "tail_free test not implemented"); break;
case 'y': GGML_ASSERT(false && "typical test not implemented"); break;
case 'p': llama_sample_top_p (nullptr, &candidates_p, top_p, 1); break;
case 'm': llama_sample_min_p (nullptr, &candidates_p, min_p, 1); break;
case 't': GGML_ASSERT(false && "temperature test not implemented"); break;
default : GGML_ASSERT(false && "Unknown sampler"); break;
}
llama_sample_softmax(nullptr, &candidates_p); // make sure tokens are sorted for tests
const int size = candidates_p.size;
if (s == 'k') {
const int expected_size = std::min(size, top_k);
min_token_id = std::max(min_token_id, (llama_token)(n_vocab - top_k));
GGML_ASSERT(size == expected_size);
GGML_ASSERT(candidates_p.data[0].id == max_token_id);
GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id);
} else if (s == 'p') {
const int softmax_divisor = n_vocab * (n_vocab-1) / 2 - min_token_id * (min_token_id-1) / 2;
const int softmax_numerator_target = ceilf(top_p * softmax_divisor);
min_token_id = n_vocab;
int expected_size = 0;
int cumsum = 0;
do { // do-while because always at least one token is sampled
min_token_id--;
expected_size++;
cumsum += min_token_id;
} while (cumsum < softmax_numerator_target);
// token 0 has p == 0, need special consideration for cumsum because top_p immediately returns
if (min_token_id == 1) {
min_token_id--;
expected_size += 1;
}
GGML_ASSERT(size == expected_size);
GGML_ASSERT(candidates_p.data[0].id == max_token_id);
GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id);
} else if (s == 'm') {
int expected_size = ceilf((1.0f-min_p) * n_vocab);
expected_size = std::max(expected_size, 1);
expected_size = std::min(expected_size, size);
min_token_id = floorf(min_p * n_vocab);
min_token_id = std::max(min_token_id, 1);
min_token_id = std::max(min_token_id, (llama_token)(n_vocab - size));
min_token_id = std::min(min_token_id, (llama_token)(n_vocab - 1));
GGML_ASSERT(size == expected_size);
GGML_ASSERT(candidates_p.data[0].id == max_token_id);
GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id);
} else {
GGML_ASSERT(false);
}
}
printf("Sampler queue %3s OK with n_vocab=%05ld top_k=%05d top_p=%f min_p=%f\n",
samplers_sequence.c_str(), n_vocab, top_k, top_p, min_p);
}
int main(void) { int main(void) {
ggml_time_init(); ggml_time_init();
@ -139,6 +241,15 @@ int main(void) {
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f}, 0.8f); test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f}, 0.8f);
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1); test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1);
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/1.0f, 0.3f/1.0f, 0.2f/1.0f, 0.1f/1.0f}, 0.00f);
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/1.0f, 0.3f/1.0f, 0.2f/1.0f, 0.1f/1.0f}, 0.24f);
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.9f, 0.3f/0.9f, 0.2f/0.9f}, 0.26f);
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.9f, 0.3f/0.9f, 0.2f/0.9f}, 0.49f);
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.7f, 0.3f/0.7f}, 0.51f);
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.7f, 0.3f/0.7f}, 0.74f);
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f}, 0.76f);
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f}, 1.00f);
test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f}, 0.25f); test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f}, 0.25f);
test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.75f); test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.75f);
test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.99f); test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.99f);
@ -154,6 +265,34 @@ int main(void) {
test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.499966f, 0.499966f, 0.000023f, 0.000023f, 0.000023f}, 1.0f, 5.0f, 5.0f); test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.499966f, 0.499966f, 0.000023f, 0.000023f, 0.000023f}, 1.0f, 5.0f, 5.0f);
test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 1.0f, 5.0f, 5.0f); test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 1.0f, 5.0f, 5.0f);
test_sampler_queue(10000, "k", 10000, 1.0f, 1.0f);
test_sampler_queue(10000, "k", 1, 1.0f, 1.0f);
test_sampler_queue(10000, "p", 10000, 1.0f, 1.0f);
test_sampler_queue(10000, "p", 10000, 0.0f, 1.0f);
test_sampler_queue(10000, "m", 10000, 1.0f, 1.0f);
test_sampler_queue(10000, "m", 10000, 1.0f, 1e-12);
test_sampler_queue(10000, "k", 100, 1.0000f, 1.0f);
test_sampler_queue(10000, "p", 10000, 0.0002f, 1.0f);
test_sampler_queue(10000, "p", 10000, 0.8000f, 1.0f);
test_sampler_queue(10000, "m", 10000, 1.0000f, 9997.9f/9999.0f);
test_sampler_queue(10000, "m", 10000, 1.0000f, 0.1f);
test_sampler_queue(10000, "kp", 100, 0.8f, 0.1f);
test_sampler_queue(10000, "km", 100, 0.8f, 0.1f);
test_sampler_queue(10000, "pk", 100, 0.8f, 0.1f);
test_sampler_queue(10000, "pm", 100, 0.8f, 0.1f);
test_sampler_queue(10000, "mk", 100, 0.8f, 0.1f);
test_sampler_queue(10000, "mp", 100, 0.8f, 9997.9f/9999.0f);
test_sampler_queue(10000, "mp", 100, 0.8f, 0.1f);
test_sampler_queue(10000, "kpm", 100, 0.8f, 0.1f);
test_sampler_queue(10000, "kmp", 100, 0.8f, 0.1f);
test_sampler_queue(10000, "pkm", 100, 0.8f, 0.1f);
test_sampler_queue(10000, "pmk", 100, 0.8f, 0.1f);
test_sampler_queue(10000, "mkp", 100, 0.8f, 0.1f);
test_sampler_queue(10000, "mpk", 100, 0.8f, 0.1f);
printf("OK\n"); printf("OK\n");
return 0; return 0;

View file

@ -2,8 +2,9 @@
#include <cassert> #include <cassert>
#include <stdexcept> #include <stdexcept>
#include <vector> #include <string>
#include <unordered_map> #include <unordered_map>
#include <vector>
static const std::vector<std::pair<uint32_t, uint32_t>> digit_ranges = { static const std::vector<std::pair<uint32_t, uint32_t>> digit_ranges = {
{0x30, 0x39}, {0xB2, 0xB3}, {0xB9, 0xB9}, {0x660, 0x669}, {0x6F0, 0x6F9}, {0x7C0, 0x7C9}, {0x966, 0x96F}, {0x9E6, 0x9EF}, {0xA66, 0xA6F}, {0xAE6, 0xAEF}, {0xB66, 0xB6F}, {0xBE6, 0xBEF}, {0xC66, 0xC6F}, {0x30, 0x39}, {0xB2, 0xB3}, {0xB9, 0xB9}, {0x660, 0x669}, {0x6F0, 0x6F9}, {0x7C0, 0x7C9}, {0x966, 0x96F}, {0x9E6, 0x9EF}, {0xA66, 0xA6F}, {0xAE6, 0xAEF}, {0xB66, 0xB6F}, {0xBE6, 0xBEF}, {0xC66, 0xC6F},