Merge remote-tracking branch 'origin/master' into clang_avxvnni_branch

commit 75be0087c6
slaren, 2024-12-31 14:40:22 +01:00
99 changed files with 3712 additions and 2116 deletions

.devops/cpu.Dockerfile (new file)
@@ -0,0 +1,81 @@
ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION AS build
RUN apt-get update && \
apt-get install -y build-essential git cmake libcurl4-openssl-dev
WORKDIR /app
COPY . .
RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
cmake --build build -j $(nproc)
RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
&& cp *.py /app/full \
&& cp -r gguf-py /app/full \
&& cp -r requirements /app/full \
&& cp requirements.txt /app/full \
&& cp .devops/tools.sh /app/full/tools.sh
## Base image
FROM ubuntu:$UBUNTU_VERSION AS base
RUN apt-get update \
&& apt-get install -y libgomp1 curl\
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
COPY --from=build /app/lib/ /app
### Full
FROM base AS full
COPY --from=build /app/full /app
WORKDIR /app
RUN apt-get update \
&& apt-get install -y \
git \
python3 \
python3-pip \
&& pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
COPY --from=build /app/full/llama-cli /app
WORKDIR /app
ENTRYPOINT [ "/app/llama-cli" ]
### Server, Server only
FROM base AS server
ENV LLAMA_ARG_HOST=0.0.0.0
COPY --from=build /app/full/llama-server /app
WORKDIR /app
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/app/llama-server" ]
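The full, light and server stages above are alternative terminal targets of one multi-stage build; which one ends up in an image is chosen with Docker's --target flag (the CI workflow further below does the same through the target: field of docker/build-push-action). A minimal local sketch, where the image names and the model path are illustrative assumptions, not anything published by CI:

    docker build -f .devops/cpu.Dockerfile --target light  -t llamacpp-cli-local .
    docker build -f .devops/cpu.Dockerfile --target server -t llamacpp-server-local .
    # llama-server listens on 8080, which is what the HEALTHCHECK above probes
    docker run -p 8080:8080 -v "$PWD/models:/models" llamacpp-server-local -m /models/model.gguf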

.devops/cuda.Dockerfile (new file)
@@ -0,0 +1,94 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=12.6.0
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
# CUDA architecture to build for (defaults to all supported archs)
ARG CUDA_DOCKER_ARCH=default
RUN apt-get update && \
apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
WORKDIR /app
COPY . .
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
&& cp *.py /app/full \
&& cp -r gguf-py /app/full \
&& cp -r requirements /app/full \
&& cp requirements.txt /app/full \
&& cp .devops/tools.sh /app/full/tools.sh
## Base image
FROM ${BASE_CUDA_RUN_CONTAINER} AS base
RUN apt-get update \
&& apt-get install -y libgomp1 curl\
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
COPY --from=build /app/lib/ /app
### Full
FROM base AS full
COPY --from=build /app/full /app
WORKDIR /app
RUN apt-get update \
&& apt-get install -y \
git \
python3 \
python3-pip \
&& pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
COPY --from=build /app/full/llama-cli /app
WORKDIR /app
ENTRYPOINT [ "/app/llama-cli" ]
### Server, Server only
FROM base AS server
ENV LLAMA_ARG_HOST=0.0.0.0
COPY --from=build /app/full/llama-server /app
WORKDIR /app
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/app/llama-server" ]
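CUDA_DOCKER_ARCH is forwarded to CMAKE_CUDA_ARCHITECTURES, so the image can be narrowed to a single GPU generation instead of the default build for all supported archs. A sketch, assuming compute capability 8.6 (RTX 30xx class) and a hypothetical model path:

    docker build -f .devops/cuda.Dockerfile --build-arg CUDA_DOCKER_ARCH=86 --target server -t llamacpp-server-cuda-local .
    docker run --gpus all -p 8080:8080 -v "$PWD/models:/models" llamacpp-server-cuda-local -m /models/model.gguf -ngl 99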

.devops/full-cuda.Dockerfile (deleted file)
@@ -1,33 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=12.6.0
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
# CUDA architecture to build for (defaults to all supported archs)
ARG CUDA_DOCKER_ARCH=default
RUN apt-get update && \
apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
COPY requirements.txt requirements.txt
COPY requirements requirements
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
WORKDIR /app
COPY . .
# Use the default CUDA archs if not specified
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release -j$(nproc) && \
cp build/bin/* .
ENTRYPOINT ["/app/.devops/tools.sh"]

.devops/full-musa.Dockerfile (deleted file)
@@ -1,33 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG MUSA_VERSION=rc3.1.0
# Target the MUSA build image
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
FROM ${BASE_MUSA_DEV_CONTAINER} AS build
# MUSA architecture to build for (defaults to all supported archs)
ARG MUSA_DOCKER_ARCH=default
RUN apt-get update && \
apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
COPY requirements.txt requirements.txt
COPY requirements requirements
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
WORKDIR /app
COPY . .
# Use the default MUSA archs if not specified
RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release -j$(nproc) && \
cp build/bin/* .
ENTRYPOINT ["/app/.devops/tools.sh"]

.devops/full-rocm.Dockerfile (deleted file)
@@ -1,50 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6
# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH="\
gfx803 \
gfx900 \
gfx906 \
gfx908 \
gfx90a \
gfx1010 \
gfx1030 \
gfx1100 \
gfx1101 \
gfx1102"
COPY requirements.txt requirements.txt
COPY requirements requirements
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
WORKDIR /app
COPY . .
# Set nvcc architecture
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV GGML_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++
# Enable cURL
ENV LLAMA_CURL=1
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev
RUN make -j$(nproc)
ENTRYPOINT ["/app/.devops/tools.sh"]

.devops/full.Dockerfile (deleted file)
@@ -1,38 +0,0 @@
ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION AS build
RUN apt-get update && \
apt-get install -y build-essential git cmake libcurl4-openssl-dev
WORKDIR /app
COPY . .
RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
cmake --build build -j $(nproc) && \
mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib/ \;
FROM ubuntu:$UBUNTU_VERSION as runtime
WORKDIR /app
RUN apt-get update && \
apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
COPY requirements.txt /app/requirements.txt
COPY requirements /app/requirements
COPY .devops/tools.sh /app/tools.sh
RUN pip install --upgrade pip setuptools wheel && \
pip install -r /app/requirements.txt
COPY --from=build /app/build/bin/ /app/
COPY --from=build /app/lib/ /app/
COPY --from=build /app/convert_hf_to_gguf.py /app/
COPY --from=build /app/gguf-py /app/gguf-py
ENV LC_ALL=C.utf8
ENTRYPOINT ["/app/tools.sh"]

.devops/intel.Dockerfile (new file)
@@ -0,0 +1,91 @@
ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04
## Build Image
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
ARG GGML_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git libcurl4-openssl-dev
WORKDIR /app
COPY . .
RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
echo "GGML_SYCL_F16 is set" \
&& export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
fi && \
echo "Building with dynamic libs" && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
&& cp *.py /app/full \
&& cp -r gguf-py /app/full \
&& cp -r requirements /app/full \
&& cp requirements.txt /app/full \
&& cp .devops/tools.sh /app/full/tools.sh
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS base
RUN apt-get update \
&& apt-get install -y libgomp1 curl\
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
### Full
FROM base AS full
COPY --from=build /app/lib/ /app
COPY --from=build /app/full /app
WORKDIR /app
RUN apt-get update \
&& apt-get install -y \
git \
python3 \
python3-pip \
&& pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
COPY --from=build /app/lib/ /app
COPY --from=build /app/full/llama-cli /app
WORKDIR /app
ENTRYPOINT [ "/app/llama-cli" ]
### Server, Server only
FROM base AS server
ENV LLAMA_ARG_HOST=0.0.0.0
COPY --from=build /app/lib/ /app
COPY --from=build /app/full/llama-server /app
WORKDIR /app
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/app/llama-server" ]

.devops/llama-cli-cuda.Dockerfile (deleted file)
@@ -1,38 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=12.6.0
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the CUDA runtime image
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
# CUDA architecture to build for (defaults to all supported archs)
ARG CUDA_DOCKER_ARCH=default
RUN apt-get update && \
apt-get install -y build-essential git cmake
WORKDIR /app
COPY . .
# Use the default CUDA archs if not specified
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release --target llama-cli -j$(nproc) && \
mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
RUN apt-get update && \
apt-get install -y libgomp1
COPY --from=build /app/lib/ /
COPY --from=build /app/build/bin/llama-cli /
ENTRYPOINT [ "/llama-cli" ]

.devops/llama-cli-intel.Dockerfile (deleted file)
@@ -1,28 +0,0 @@
ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
ARG GGML_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git
WORKDIR /app
COPY . .
RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
echo "GGML_SYCL_F16 is set" && \
export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
fi && \
echo "Building with static libs" && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \
${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \
cmake --build build --config Release --target llama-cli
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
COPY --from=build /app/build/bin/llama-cli /llama-cli
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/llama-cli" ]

.devops/llama-cli-musa.Dockerfile (deleted file)
@@ -1,38 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG MUSA_VERSION=rc3.1.0
# Target the MUSA build image
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the MUSA runtime image
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
FROM ${BASE_MUSA_DEV_CONTAINER} AS build
# MUSA architecture to build for (defaults to all supported archs)
ARG MUSA_DOCKER_ARCH=default
RUN apt-get update && \
apt-get install -y build-essential git cmake
WORKDIR /app
COPY . .
# Use the default MUSA archs if not specified
RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release --target llama-cli -j$(nproc) && \
mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime
RUN apt-get update && \
apt-get install -y libgomp1
COPY --from=build /app/lib/ /
COPY --from=build /app/build/bin/llama-cli /llama-cli
ENTRYPOINT [ "/llama-cli" ]

.devops/llama-cli-rocm.Dockerfile (deleted file)
@@ -1,45 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6
# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH="\
gfx803 \
gfx900 \
gfx906 \
gfx908 \
gfx90a \
gfx1010 \
gfx1030 \
gfx1100 \
gfx1101 \
gfx1102"
COPY requirements.txt requirements.txt
COPY requirements requirements
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
WORKDIR /app
COPY . .
# Set nvcc architecture
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV GGML_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++
RUN make -j$(nproc) llama-cli
ENTRYPOINT [ "/app/llama-cli" ]

.devops/llama-cli-vulkan.Dockerfile (deleted file)
@@ -1,27 +0,0 @@
ARG UBUNTU_VERSION=jammy
FROM ubuntu:$UBUNTU_VERSION AS build
# Install build tools
RUN apt update && apt install -y git build-essential cmake wget libgomp1
# Install Vulkan SDK
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
apt update -y && \
apt-get install -y vulkan-sdk
# Build it
WORKDIR /app
COPY . .
RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 && \
cmake --build build --config Release --target llama-cli
# Clean up
WORKDIR /
RUN cp /app/build/bin/llama-cli /llama-cli && \
rm -rf /app
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/llama-cli" ]

.devops/llama-cli.Dockerfile (deleted file)
@@ -1,29 +0,0 @@
ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION AS build
RUN apt-get update && \
apt-get install -y build-essential git cmake libcurl4-openssl-dev
WORKDIR /app
COPY . .
RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
cmake --build build -j $(nproc) && \
mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib/ \;
FROM ubuntu:$UBUNTU_VERSION AS runtime
WORKDIR /app
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev libgomp1 curl
COPY --from=build /app/build/bin/llama-cli /app/
COPY --from=build /app/lib/ /app/
ENV LC_ALL=C.utf8
ENTRYPOINT [ "/app/llama-cli" ]

.devops/llama-server-cuda.Dockerfile (deleted file)
@@ -1,43 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=12.6.0
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the CUDA runtime image
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
# CUDA architecture to build for (defaults to all supported archs)
ARG CUDA_DOCKER_ARCH=default
RUN apt-get update && \
apt-get install -y build-essential git cmake libcurl4-openssl-dev
WORKDIR /app
COPY . .
# Use the default CUDA archs if not specified
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release --target llama-server -j$(nproc) && \
mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev libgomp1 curl
COPY --from=build /app/lib/ /
COPY --from=build /app/build/bin/llama-server /llama-server
# Must be set to 0.0.0.0 so it can listen to requests from host machine
ENV LLAMA_ARG_HOST=0.0.0.0
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/llama-server" ]

.devops/llama-server-intel.Dockerfile (deleted file)
@@ -1,34 +0,0 @@
ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
ARG GGML_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git libcurl4-openssl-dev
WORKDIR /app
COPY . .
RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
echo "GGML_SYCL_F16 is set" && \
export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
fi && \
echo "Building with dynamic libs" && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
cmake --build build --config Release --target llama-server
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev curl
COPY --from=build /app/build/bin/llama-server /llama-server
ENV LC_ALL=C.utf8
# Must be set to 0.0.0.0 so it can listen to requests from host machine
ENV LLAMA_ARG_HOST=0.0.0.0
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/llama-server" ]

.devops/llama-server-musa.Dockerfile (deleted file)
@@ -1,43 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG MUSA_VERSION=rc3.1.0
# Target the MUSA build image
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the MUSA runtime image
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
FROM ${BASE_MUSA_DEV_CONTAINER} AS build
# MUSA architecture to build for (defaults to all supported archs)
ARG MUSA_DOCKER_ARCH=default
RUN apt-get update && \
apt-get install -y build-essential git cmake libcurl4-openssl-dev
WORKDIR /app
COPY . .
# Use the default MUSA archs if not specified
RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release --target llama-server -j$(nproc) && \
mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev libgomp1 curl
COPY --from=build /app/lib/ /
COPY --from=build /app/build/bin/llama-server /llama-server
# Must be set to 0.0.0.0 so it can listen to requests from host machine
ENV LLAMA_ARG_HOST=0.0.0.0
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/llama-server" ]

.devops/llama-server-rocm.Dockerfile (deleted file)
@@ -1,54 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6
# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH="\
gfx803 \
gfx900 \
gfx906 \
gfx908 \
gfx90a \
gfx1010 \
gfx1030 \
gfx1100 \
gfx1101 \
gfx1102"
COPY requirements.txt requirements.txt
COPY requirements requirements
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
WORKDIR /app
COPY . .
# Set nvcc architecture
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV GGML_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++
# Must be set to 0.0.0.0 so it can listen to requests from host machine
ENV LLAMA_ARG_HOST=0.0.0.0
# Enable cURL
ENV LLAMA_CURL=1
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev curl
RUN make -j$(nproc) llama-server
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/app/llama-server" ]

.devops/llama-server-vulkan.Dockerfile (deleted file)
@@ -1,31 +0,0 @@
ARG UBUNTU_VERSION=jammy
FROM ubuntu:$UBUNTU_VERSION AS build
# Install build tools
RUN apt update && apt install -y git build-essential cmake wget
# Install Vulkan SDK and cURL
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
apt update -y && \
apt-get install -y vulkan-sdk libcurl4-openssl-dev curl
# Build it
WORKDIR /app
COPY . .
RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
cmake --build build --config Release --target llama-server
# Clean up
WORKDIR /
RUN cp /app/build/bin/llama-server /llama-server && \
rm -rf /app
ENV LC_ALL=C.utf8
# Must be set to 0.0.0.0 so it can listen to requests from host machine
ENV LLAMA_ARG_HOST=0.0.0.0
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/llama-server" ]

.devops/llama-server.Dockerfile (deleted file)
@@ -1,33 +0,0 @@
ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION AS build
RUN apt-get update && \
apt-get install -y build-essential git cmake libcurl4-openssl-dev
WORKDIR /app
COPY . .
RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \
cmake --build build -j $(nproc) && \
mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib/ \;
FROM ubuntu:$UBUNTU_VERSION AS runtime
WORKDIR /app
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev libgomp1 curl
COPY --from=build /app/build/bin/llama-server /app/
COPY --from=build /app/lib/ /app/
ENV LC_ALL=C.utf8
# Must be set to 0.0.0.0 so it can listen to requests from host machine
ENV LLAMA_ARG_HOST=0.0.0.0
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/app/llama-server" ]

.devops/musa.Dockerfile (new file)
@@ -0,0 +1,108 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG MUSA_VERSION=rc3.1.0
# Target the MUSA build image
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
FROM ${BASE_MUSA_DEV_CONTAINER} AS build
# MUSA architecture to build for (defaults to all supported archs)
ARG MUSA_DOCKER_ARCH=default
RUN apt-get update && \
apt-get install -y \
build-essential \
cmake \
python3 \
python3-pip \
git \
libcurl4-openssl-dev \
libgomp1
COPY requirements.txt requirements.txt
COPY requirements requirements
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
WORKDIR /app
COPY . .
# Use the default MUSA archs if not specified
RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
&& cp *.py /app/full \
&& cp -r gguf-py /app/full \
&& cp -r requirements /app/full \
&& cp requirements.txt /app/full \
&& cp .devops/tools.sh /app/full/tools.sh
## Base image
FROM ${BASE_MUSA_RUN_CONTAINER} AS base
RUN apt-get update \
&& apt-get install -y libgomp1 curl\
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
COPY --from=build /app/lib/ /app
### Full
FROM base AS full
COPY --from=build /app/full /app
WORKDIR /app
RUN apt-get update \
&& apt-get install -y \
git \
python3 \
python3-pip \
&& pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
COPY --from=build /app/full/llama-cli /app
WORKDIR /app
ENTRYPOINT [ "/app/llama-cli" ]
### Server, Server only
FROM base AS server
ENV LLAMA_ARG_HOST=0.0.0.0
COPY --from=build /app/full/llama-server /app
WORKDIR /app
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/app/llama-server" ]

.devops/rocm.Dockerfile (new file)
@@ -0,0 +1,113 @@
ARG UBUNTU_VERSION=24.04
# This needs to generally match the container host's environment.
ARG ROCM_VERSION=6.3
ARG AMDGPU_VERSION=6.3
# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
### Build image
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
# gfx803, gfx900, gfx1032, gfx1101, gfx1102: not officially supported
# gfx906 is deprecated
# check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.2.4/reference/system-requirements.html
#ARG ROCM_DOCKER_ARCH='gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102'
ARG ROCM_DOCKER_ARCH=gfx1100
# Set nvcc architecture
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
# ENV CC=/opt/rocm/llvm/bin/clang
# ENV CXX=/opt/rocm/llvm/bin/clang++
RUN apt-get update \
&& apt-get install -y \
build-essential \
cmake \
git \
libcurl4-openssl-dev \
curl \
libgomp1
WORKDIR /app
COPY . .
RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON \
&& cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib \
&& find build -name "*.so" -exec cp {} /app/lib \;
RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
&& cp *.py /app/full \
&& cp -r gguf-py /app/full \
&& cp -r requirements /app/full \
&& cp requirements.txt /app/full \
&& cp .devops/tools.sh /app/full/tools.sh
## Base image
FROM ${BASE_ROCM_DEV_CONTAINER} AS base
RUN apt-get update \
&& apt-get install -y libgomp1 curl\
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
COPY --from=build /app/lib/ /app
### Full
FROM base AS full
COPY --from=build /app/full /app
WORKDIR /app
RUN apt-get update \
&& apt-get install -y \
git \
python3-pip \
python3 \
python3-wheel\
&& pip install --break-system-packages --upgrade setuptools \
&& pip install --break-system-packages -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
COPY --from=build /app/full/llama-cli /app
WORKDIR /app
ENTRYPOINT [ "/app/llama-cli" ]
### Server, Server only
FROM base AS server
ENV LLAMA_ARG_HOST=0.0.0.0
COPY --from=build /app/full/llama-server /app
WORKDIR /app
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/app/llama-server" ]
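ROCM_DOCKER_ARCH now defaults to a single target (gfx1100) rather than the fat list kept in the comment above, so other GPUs need the arch overridden at build time. A sketch, assuming a gfx90a (MI200 class) device; the image tag and device flags are the usual ROCm container conventions, not taken from this file:

    docker build -f .devops/rocm.Dockerfile --build-arg ROCM_DOCKER_ARCH=gfx90a --target light -t llamacpp-cli-rocm-local .
    docker run --device=/dev/kfd --device=/dev/dri -v "$PWD/models:/models" llamacpp-cli-rocm-local -m /models/model.gguf -ngl 99 -p "Hello"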

.devops/vulkan.Dockerfile (new file)
@@ -0,0 +1,88 @@
ARG UBUNTU_VERSION=jammy
FROM ubuntu:$UBUNTU_VERSION AS build
# Install build tools
RUN apt update && apt install -y git build-essential cmake wget
# Install Vulkan SDK and cURL
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
apt update -y && \
apt-get install -y vulkan-sdk libcurl4-openssl-dev curl
# Build it
WORKDIR /app
COPY . .
RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
&& cp *.py /app/full \
&& cp -r gguf-py /app/full \
&& cp -r requirements /app/full \
&& cp requirements.txt /app/full \
&& cp .devops/tools.sh /app/full/tools.sh
## Base image
FROM ubuntu:$UBUNTU_VERSION AS base
RUN apt-get update \
&& apt-get install -y libgomp1 curl\
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
COPY --from=build /app/lib/ /app
### Full
FROM base AS full
COPY --from=build /app/full /app
WORKDIR /app
RUN apt-get update \
&& apt-get install -y \
git \
python3 \
python3-pip \
&& pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
COPY --from=build /app/full/llama-cli /app
WORKDIR /app
ENTRYPOINT [ "/app/llama-cli" ]
### Server, Server only
FROM base AS server
ENV LLAMA_ARG_HOST=0.0.0.0
COPY --from=build /app/full/llama-server /app
WORKDIR /app
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/app/llama-server" ]

.github/workflows/docker.yml (modified)
@@ -34,21 +34,14 @@ jobs:
    strategy:
      matrix:
        config:
-          - { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
-          - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
-          - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
-          - { tag: "light-musa", dockerfile: ".devops/llama-cli-musa.Dockerfile", platforms: "linux/amd64" }
-          - { tag: "server-musa", dockerfile: ".devops/llama-server-musa.Dockerfile", platforms: "linux/amd64" }
-          - { tag: "full-musa", dockerfile: ".devops/full-musa.Dockerfile", platforms: "linux/amd64" }
+          # Multi-stage build
+          - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: false}
+          - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
+          - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
+          - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
+          - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
          # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
-          #- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          #- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          #- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
-          - { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
+          #- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: true }

    steps:
      - name: Check out the repo
        uses: actions/checkout@v4
@@ -56,10 +49,10 @@ jobs:
          fetch-depth: 0 # preserve git history, so we can determine the build number

      - name: Set up QEMU
-        uses: docker/setup-qemu-action@v2
+        uses: docker/setup-qemu-action@v3

      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v2
+        uses: docker/setup-buildx-action@v3

      - name: Log in to Docker Hub
        uses: docker/login-action@v2
@@ -79,25 +72,34 @@ jobs:
          # determine tag name postfix (build number, commit hash)
          if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then
-            TAG_POSTFIX="b${BUILD_NUMBER}"
+            TAG_POSTFIX="-b${BUILD_NUMBER}"
          else
            SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-')
-            TAG_POSTFIX="${SAFE_NAME}-${SHORT_HASH}"
+            TAG_POSTFIX="-${SAFE_NAME}-${SHORT_HASH}"
          fi
          # list all tags possible
-          TAGS=""
-          TAGS="${TAGS}ghcr.io/${REPO_OWNER}/${REPO_NAME}:${{ matrix.config.tag }},"
-          TAGS="${TAGS}ghcr.io/${REPO_OWNER}/${REPO_NAME}:${{ matrix.config.tag }}-${TAG_POSTFIX}"
-
-          echo "output_tags=$TAGS" >> $GITHUB_OUTPUT
-          echo "output_tags=$TAGS" # print out for debugging
+          if [[ "${{ matrix.config.tag }}" == "cpu" ]]; then
+            TYPE=""
+          else
+            TYPE="-${{ matrix.config.tag }}"
+          fi
+          PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:"
+          FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}${TAG_POSTFIX}"
+          LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}${TAG_POSTFIX}"
+          SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}${TAG_POSTFIX}"
+          echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT
+          echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT
+          echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT
+          echo "full_output_tags=$FULLTAGS" # print out for debugging
+          echo "light_output_tags=$LIGHTTAGS" # print out for debugging
+          echo "server_output_tags=$SERVERTAGS" # print out for debugging
        env:
          GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'

      # https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example
      - name: Free Disk Space (Ubuntu)
+        if: ${{ matrix.config.free_disk_space == true }}
        uses: jlumbroso/free-disk-space@main
        with:
          # this might remove tools that are actually needed,
@@ -113,13 +115,59 @@ jobs:
          docker-images: true
          swap-storage: true

-      - name: Build and push Docker image (tagged + versioned)
-        if: ${{ github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
+      - name: Build and push Full Docker image (tagged + versioned)
+        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.full == true }}
        uses: docker/build-push-action@v6
        with:
          context: .
          push: true
          platforms: ${{ matrix.config.platforms }}
          # tag list is generated from step above
-          tags: ${{ steps.tag.outputs.output_tags }}
+          tags: ${{ steps.tag.outputs.full_output_tags }}
          file: ${{ matrix.config.dockerfile }}
+          target: full
+          provenance: false
+          # using github experimental cache
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+          # return to this if the experimental github cache is having issues
+          #cache-to: type=local,dest=/tmp/.buildx-cache
+          #cache-from: type=local,src=/tmp/.buildx-cache
+
+      - name: Build and push Light Docker image (tagged + versioned)
+        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }}
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          push: true
+          platforms: ${{ matrix.config.platforms }}
+          # tag list is generated from step above
+          tags: ${{ steps.tag.outputs.light_output_tags }}
+          file: ${{ matrix.config.dockerfile }}
+          target: light
+          provenance: false
+          # using github experimental cache
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+          # return to this if the experimental github cache is having issues
+          #cache-to: type=local,dest=/tmp/.buildx-cache
+          #cache-from: type=local,src=/tmp/.buildx-cache
+
+      - name: Build and push Server Docker image (tagged + versioned)
+        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }}
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          push: true
+          platforms: ${{ matrix.config.platforms }}
+          # tag list is generated from step above
+          tags: ${{ steps.tag.outputs.server_output_tags }}
+          file: ${{ matrix.config.dockerfile }}
+          target: server
+          provenance: false
+          # using github experimental cache
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+          # return to this if the experimental github cache is having issues
+          #cache-to: type=local,dest=/tmp/.buildx-cache
+          #cache-from: type=local,src=/tmp/.buildx-cache
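Worked example of the new tag scheme, assuming the repository is ggerganov/llama.cpp and a hypothetical build number 4399 on master: the cpu entry keeps TYPE empty, so its server image is tagged ghcr.io/ggerganov/llama.cpp:server plus :server-b4399, while the cuda entry gets :server-cuda plus :server-cuda-b4399. The same expansion as a standalone shell sketch:

    PREFIX="ghcr.io/ggerganov/llama.cpp:"
    TYPE="-cuda"
    TAG_POSTFIX="-b4399"
    SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}${TAG_POSTFIX}"
    echo "$SERVERTAGS"  # ghcr.io/ggerganov/llama.cpp:server-cuda,ghcr.io/ggerganov/llama.cpp:server-cuda-b4399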

common/arg.cpp (modified)
@@ -626,7 +626,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params) {
            params.ctx_shift = false;
        }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
    add_opt(common_arg(
        {"--chunks"}, "N",
        string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
@@ -2206,5 +2206,17 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));

+    // model-specific
+    add_opt(common_arg(
+        {"--tts-oute-default"},
+        string_format("use default OuteTTS models (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF";
+            params.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf";
+            params.vocoder.hf_repo = "ggml-org/WavTokenizer";
+            params.vocoder.hf_file = "WavTokenizer-Large-75-F16.gguf";
+        }
+    ).set_examples({LLAMA_EXAMPLE_TTS}));
+
    return ctx_arg;
}
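Usage sketch for the new flag, assuming the llama-tts example binary built from this tree: it pulls the OuteTTS and WavTokenizer weights listed above from Hugging Face on first use and writes the synthesized audio to a WAV file in the working directory.

    llama-tts --tts-oute-default -p "Hello from llama.cpp"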

common/common.cpp (modified)
@@ -18,6 +18,7 @@
 #include <cstdarg>
 #include <cstring>
 #include <ctime>
+#include <filesystem>
 #include <fstream>
 #include <iostream>
 #include <iterator>
@@ -62,7 +63,9 @@
 #ifdef __linux__
 #include <linux/limits.h>
 #elif defined(_WIN32)
-#define PATH_MAX MAX_PATH
+# if !defined(PATH_MAX)
+# define PATH_MAX MAX_PATH
+# endif
 #else
 #include <sys/syslimits.h>
 #endif
@@ -1148,8 +1151,7 @@ static bool common_download_file(const std::string & url, const std::string & pa
 #endif

    // Check if the file already exists locally
-    struct stat model_file_info;
-    auto file_exists = (stat(path.c_str(), &model_file_info) == 0);
+    auto file_exists = std::filesystem::exists(path);

    // If the file exists, check its JSON metadata companion file.
    std::string metadata_path = path + ".json";

convert_hf_to_gguf.py (modified)
@@ -529,9 +529,19 @@ class Model:
            else:
                token: str = reverse_vocab[i]
                if token in added_vocab:
+                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
+                    # To avoid unexpected issues - we make sure to normalize non-normalized tokens
+                    if not tokenizer.added_tokens_decoder[i].normalized:
+                        previous_token = token
+                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+                        if previous_token != token:
+                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
+
                    if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
                        toktypes.append(gguf.TokenType.CONTROL)
                    else:
+                        # NOTE: this was added for Gemma.
+                        # Encoding and decoding the tokens above isn't sufficient for this case.
                        token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces
                        toktypes.append(gguf.TokenType.USER_DEFINED)
                else:
@@ -575,6 +585,9 @@ class Model:
        if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
            # ref: https://huggingface.co/tiiuae/falcon-7b
            res = "falcon"
+        if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
+            # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base
+            res = "falcon3"
        if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
            # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
            res = "bert-bge"
@@ -671,6 +684,9 @@ class Model:
        if chkhsh == "ad851be1dba641f2e3711822f816db2c265f788b37c63b4e1aeacb9ee92de8eb":
            # ref: https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct
            res = "gigachat"
+        if chkhsh == "d4c8f286ea6b520b3d495c4455483cfa2302c0cfcd4be05d781b6a8a0a7cdaf1":
+            # ref: https://huggingface.co/Infinigence/Megrez-3B-Instruct
+            res = "megrez"

        if res is None:
            logger.warning("\n")
@@ -1679,6 +1695,178 @@ class LlamaModel(Model):
                raise ValueError(f"Unprocessed experts: {experts}")


+@Model.register("DeciLMForCausalLM")
+class DeciModel(Model):
+    model_arch = gguf.MODEL_ARCH.DECI
+
+    @staticmethod
+    def _ffn_mult_to_intermediate_size(ffn_mult: float, n_embd: int) -> int:
+        # DeciLM-specific code
+        intermediate_size = int(2 * ffn_mult * n_embd / 3)
+        return DeciModel._find_multiple(intermediate_size, 256)
+
+    @staticmethod
+    def _find_multiple(n: int, k: int) -> int:
+        # DeciLM-specific code
+        if n % k == 0:
+            return n
+        return n + k - (n % k)
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B
+            _block_configs: list[dict[str,Any]] = self.hparams["block_configs"]
+            assert self.block_count == len(_block_configs)
+            self._num_kv_heads = list()
+            self._num_heads = list()
+            _ffn_multipliers = list()
+            # ***linear attention layer***
+            # if n_heads_in_group is None and replace_with_linear is True
+            # then _num_kv_heads[il] is 0 and _num_heads[il] is num_attention_heads
+            # ***attention-free layer***
+            # if n_heads_in_group is None and replace_with_linear is False
+            # then _num_kv_heads[il] is 0 and _num_heads[il] is 0
+            # ***normal attention-layer***
+            # if n_heads_in_group is not None, then
+            # _num_kv_heads[il] is num_attention_head // n_heads_in_group and
+            # _num_heads[il] is num_attention_head
+            for il in range(len(_block_configs)):
+                if _block_configs[il]["attention"]["n_heads_in_group"] is None:
+                    if _block_configs[il]["attention"]["replace_with_linear"] is True:
+                        self._num_kv_heads.append(0)
+                        self._num_heads.append(self.hparams["num_attention_heads"])
+                    else:
+                        self._num_kv_heads.append(0)
+                        self._num_heads.append(0)
+                else:
+                    self._num_kv_heads.append(self.hparams["num_attention_heads"] // _block_configs[il]["attention"]["n_heads_in_group"])
+                    self._num_heads.append(self.hparams["num_attention_heads"])
+                _ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"])
+            assert self.block_count == len(self._num_kv_heads)
+            assert self.block_count == len(self._num_heads)
+            assert self.block_count == len(_ffn_multipliers)
+            assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int)
+            assert isinstance(self._num_heads, list) and isinstance(self._num_heads[0], int)
+            assert isinstance(_ffn_multipliers, list) and isinstance(_ffn_multipliers[0], float)
+            self._ffn_dims: list[int] = [
+                DeciModel._ffn_mult_to_intermediate_size(multiplier, self.hparams["hidden_size"])
+                for multiplier in _ffn_multipliers
+            ]
+
+    def set_vocab(self):
+        # Please change tokenizer_config.json of Llama-3_1-Nemotron-51B's
+        # eos_token from '|eot_id|' to '|end_of_text|'
+        if self.hparams.get("vocab_size", 128256) == 128256:
+            tokens, toktypes, tokpre = self.get_vocab_base()
+            self.gguf_writer.add_tokenizer_model("gpt2")
+            self.gguf_writer.add_tokenizer_pre(tokpre)
+            self.gguf_writer.add_token_list(tokens)
+            self.gguf_writer.add_token_types(toktypes)
+
+            special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+            special_vocab.add_to_gguf(self.gguf_writer)
+        else:
+            # DeciLM-7B
+            self._set_vocab_llama_hf()
+
+    def set_gguf_parameters(self):
+        if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B
+            assert self.block_count == len(self._num_kv_heads)
+            assert self.block_count == len(self._num_heads)
+            assert self.block_count == len(self._ffn_dims)
+            if (rope_theta := self.hparams.get("rope_theta")) is not None:
+                self.gguf_writer.add_rope_freq_base(rope_theta)
+            self.gguf_writer.add_head_count_kv(self._num_kv_heads)
+            self.gguf_writer.add_head_count(self._num_heads)
+            self.gguf_writer.add_feed_forward_length(self._ffn_dims)
+            self.gguf_writer.add_block_count(self.block_count)
+            self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
+            self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
+            self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+            self.gguf_writer.add_key_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+            self.gguf_writer.add_value_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+            self.gguf_writer.add_file_type(self.ftype)
+        else: # DeciLM-7B
+            super().set_gguf_parameters()
+            if "num_key_value_heads_per_layer" in self.hparams: # DeciLM-7B
+                self._num_kv_heads: list[int] = self.hparams["num_key_value_heads_per_layer"]
+                assert self.block_count == len(self._num_kv_heads)
+                self.gguf_writer.add_head_count_kv(self._num_kv_heads)
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+        if "head_dim" in hparams:
+            rope_dim = hparams["head_dim"]
+        else:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
+
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+
+    @staticmethod
+    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
+        if n_head_kv is not None and n_head != n_head_kv:
+            n_head = n_head_kv
+        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+                .swapaxes(1, 2)
+                .reshape(weights.shape))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        if bid is not None:
+            if "num_key_value_heads_per_layer" in self.hparams:
+                n_kv_head = self.hparams["num_key_value_heads_per_layer"][bid]
+            elif "block_configs" in self.hparams:
+                n_kv_head = self._num_kv_heads[bid]
+                n_head = self._num_heads[bid]
+            else:
+                n_kv_head = self.hparams.get("num_key_value_heads")
+        else:
+            n_kv_head = self.hparams.get("num_key_value_heads")
+
+        if name.endswith(("q_proj.weight", "q_proj.bias")):
+            data_torch = DeciModel.permute(data_torch, n_head, n_head)
+        if name.endswith(("k_proj.weight", "k_proj.bias")):
+            data_torch = DeciModel.permute(data_torch, n_head, n_kv_head)
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
+            if rope_scaling.get("rope_type", '').lower() == "llama3":
+                base = self.hparams.get("rope_theta", 10000.0)
+                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+
+                factor = rope_scaling.get("factor", 8.0)
+                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+
+                low_freq_wavelen = old_context_len / low_freq_factor
+                high_freq_wavelen = old_context_len / high_freq_factor
+                assert low_freq_wavelen != high_freq_wavelen
+
+                rope_factors = []
+                for freq in freqs:
+                    wavelen = 2 * math.pi / freq
+                    if wavelen < high_freq_wavelen:
+                        rope_factors.append(1)
+                    elif wavelen > low_freq_wavelen:
+                        rope_factors.append(factor)
+                    else:
+                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))
+
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+
 @Model.register("BitnetForCausalLM")
 class BitnetModel(Model):
     model_arch = gguf.MODEL_ARCH.BITNET
@@ -2200,6 +2388,15 @@ class Phi3MiniModel(Model):
    model_arch = gguf.MODEL_ARCH.PHI3

    def set_vocab(self):
+        # Phi-4 model uses GPT2Tokenizer
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                tokenizer_class = tokenizer_config_json['tokenizer_class']
+                if tokenizer_class == 'GPT2Tokenizer':
+                    return self._set_vocab_gpt2()
+
        from sentencepiece import SentencePieceProcessor

        tokenizer_path = self.dir_model / 'tokenizer.model'
@@ -2316,7 +2513,11 @@ class Phi3MiniModel(Model):
        self.gguf_writer.add_rope_dimension_count(rope_dims)
        self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
        self.gguf_writer.add_file_type(self.ftype)
-        self.gguf_writer.add_sliding_window(self.find_hparam(["sliding_window"]))
+        sliding_window = self.hparams.get("sliding_window")
+        # use zero value of sliding_window to distinguish Phi-4 from other PHI3 models
+        if sliding_window is None:
+            sliding_window = 0
+        self.gguf_writer.add_sliding_window(sliding_window)

    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
        n_embd = self.find_hparam(["hidden_size", "n_embd"])
@@ -2615,7 +2816,7 @@ class InternLM2Model(Model):
        return [(self.map_tensor_name(name), data_torch)]


-@Model.register("BertModel", "CamembertModel", "RobertaModel")
+@Model.register("BertModel", "BertForMaskedLM", "CamembertModel")
 class BertModel(Model):
     model_arch = gguf.MODEL_ARCH.BERT
@@ -2681,13 +2882,73 @@ class BertModel(Model):
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        del bid # unused

+        if name.startswith("bert."):
+            name = name[5:]
+
+        if name.endswith(".gamma"):
+            name = name[:-6] + ".weight"
+
+        if name.endswith(".beta"):
+            name = name[:-5] + ".bias"
+
        # we are only using BERT for embeddings so we don't need the pooling layer
        if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"):
            return [] # we don't need these

+        if name.startswith("cls.predictions"):
+            return []
+
+        if name.startswith("cls.seq_relationship"):
+            return []
+
        return [(self.map_tensor_name(name), data_torch)]


+@Model.register("RobertaModel")
+class RobertaModel(BertModel):
+    model_arch = gguf.MODEL_ARCH.BERT
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # we need the pad_token_id to know how to chop down position_embd matrix
+        if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
+            self._position_offset = 1 + pad_token_id
+            if "max_position_embeddings" in self.hparams:
+                self.hparams["max_position_embeddings"] -= self._position_offset
+        else:
+            self._position_offset = None
+
+    def set_vocab(self):
+        """Support BPE tokenizers for roberta models"""
+        bpe_tok_path = self.dir_model / "tokenizer.json"
+        if bpe_tok_path.exists():
+            self._set_vocab_gpt2()
+            self.gguf_writer.add_add_bos_token(True)
+            self.gguf_writer.add_add_eos_token(True)
+
+            # we need this to validate the size of the token_type embeddings
+            # though currently we are passing all zeros to the token_type embeddings
+            # "Sequence A" or "Sequence B"
+            self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
+
+        else:
+            return super().set_vocab()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # if name starts with "roberta.", remove the prefix
+        # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
+        if name.startswith("roberta."):
+            name = name[8:]
+
+        # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
+        if name == "embeddings.position_embeddings.weight":
+            if self._position_offset is not None:
+                data_torch = data_torch[self._position_offset:,:]
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @Model.register("NomicBertModel")
 class NomicBertModel(BertModel):
     model_arch = gguf.MODEL_ARCH.NOMIC_BERT
@ -3007,6 +3268,9 @@ class Rwkv6Model(Model):
if new_name.endswith("time_mix_w2.weight"): if new_name.endswith("time_mix_w2.weight"):
data_torch = data_torch.permute(0, 2, 1) data_torch = data_torch.permute(0, 2, 1)
if new_name.endswith("time_mix_decay.weight") or "lerp" in new_name:
data_torch = data_torch.squeeze()
rescale_every_n_layers = self.hparams["rescale_every"] rescale_every_n_layers = self.hparams["rescale_every"]
if rescale_every_n_layers > 0: if rescale_every_n_layers > 0:
if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"): if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"):


@@ -72,6 +72,7 @@ models = [
    {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
    {"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
    {"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
    {"name": "falcon3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon3-7B-Base", },
    {"name": "bert-bge-large", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/BAAI/bge-large-zh-v1.5", },
    {"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
    {"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
@@ -105,6 +106,7 @@ models = [
    {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", },
    {"name": "roberta-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"},
    {"name": "gigachat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct"},
    {"name": "megrez", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"},
]


@ -15,7 +15,7 @@ static void run(
for (size_t il = 0; il < v_input.size(); ++il) { for (size_t il = 0; il < v_input.size(); ++il) {
// prepare output vector // prepare output vector
struct ggml_tensor * ctrl_out = v_output[il]; struct ggml_tensor * ctrl_out = v_output[il];
ggml_format_name(ctrl_out, "direction.%ld", il+1); ggml_format_name(ctrl_out, "direction.%zu", il+1);
// calculate mean vector // calculate mean vector
struct ggml_tensor * t_layer = v_input[il]; struct ggml_tensor * t_layer = v_input[il];


@ -302,7 +302,7 @@ static void run_pca(
// prepare output vector // prepare output vector
struct ggml_tensor * ctrl_out = v_output[il]; struct ggml_tensor * ctrl_out = v_output[il];
ggml_format_name(ctrl_out, "direction.%ld", il+1); ggml_format_name(ctrl_out, "direction.%zu", il+1);
// run power_iteration // run power_iteration
params.i_layer = il; params.i_layer = il;


@ -265,8 +265,8 @@ struct lora_merge_ctx {
fout.write((const char *)data.data(), data.size()); fout.write((const char *)data.data(), data.size());
} }
printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged); printf("%s : merged %zu tensors with lora adapters\n", __func__, n_merged);
printf("%s : wrote %ld tensors to output file\n", __func__, trans.size()); printf("%s : wrote %zu tensors to output file\n", __func__, trans.size());
} }
void copy_tensor(struct ggml_tensor * base) { void copy_tensor(struct ggml_tensor * base) {
@ -352,7 +352,7 @@ struct lora_merge_ctx {
const float scale = alpha ? adapters[i]->scale * alpha / rank : adapters[i]->scale; const float scale = alpha ? adapters[i]->scale * alpha / rank : adapters[i]->scale;
delta = ggml_scale(ctx0, delta, scale); delta = ggml_scale(ctx0, delta, scale);
cur = ggml_add(ctx0, delta, cur); cur = ggml_add(ctx0, delta, cur);
printf("%s : + merging from adapter[%ld] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type)); printf("%s : + merging from adapter[%zu] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type));
printf("%s : input_scale=%f calculated_scale=%f rank=%d\n", __func__, adapters[i]->scale, scale, (int) inp_b[i]->ne[0]); printf("%s : input_scale=%f calculated_scale=%f rank=%d\n", __func__, adapters[i]->scale, scale, (int) inp_b[i]->ne[0]);
} }
cur = ggml_cast(ctx0, cur, out->type); cur = ggml_cast(ctx0, cur, out->type);


@ -11,19 +11,15 @@
static bool llama_grammar_validate(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) { static bool llama_grammar_validate(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) {
const auto cpts = unicode_cpts_from_utf8(input_str); const auto cpts = unicode_cpts_from_utf8(input_str);
const llama_grammar_rules & rules = llama_grammar_get_rules (grammar); auto & stacks_cur = llama_grammar_get_stacks(grammar);
llama_grammar_stacks & stacks_cur = llama_grammar_get_stacks(grammar);
size_t pos = 0; size_t pos = 0;
for (const auto & cpt : cpts) { for (const auto & cpt : cpts) {
const llama_grammar_stacks stacks_prev = llama_grammar_get_stacks(grammar); // copy llama_grammar_accept(grammar, cpt);
llama_grammar_accept(rules, stacks_prev, cpt, stacks_cur);
if (stacks_cur.empty()) { if (stacks_cur.empty()) {
error_pos = pos; error_pos = pos;
error_msg = "Unexpected character '" + unicode_cpt_to_utf8(cpt) + "'"; error_msg = "Unexpected character '" + unicode_cpt_to_utf8(cpt) + "'";
stacks_cur = stacks_prev;
return false; return false;
} }
++pos; ++pos;
@ -82,7 +78,8 @@ int main(int argc, char** argv) {
llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root"); llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root");
if (grammar == nullptr) { if (grammar == nullptr) {
throw std::runtime_error("Failed to initialize llama_grammar"); fprintf(stdout, "Failed to initialize llama_grammar\n");
return 1;
} }
// Read the input file // Read the input file
std::string input_str; std::string input_str;


@ -305,7 +305,9 @@ Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens,
extern "C" extern "C"
JNIEXPORT void JNICALL JNIEXPORT void JNICALL
Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) { Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer)); //llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
delete batch;
} }
extern "C" extern "C"


@ -8,25 +8,25 @@
#include "ggml-alloc.h" #include "ggml-alloc.h"
#include "ggml-backend.h" #include "ggml-backend.h"
#ifdef GGML_USE_CUDA //#ifdef GGML_USE_CUDA
#include "ggml-cuda.h" //#include "ggml-cuda.h"
#endif //#endif
//
#ifdef GGML_USE_SYCL //#ifdef GGML_USE_SYCL
#include "ggml-sycl.h" //#include "ggml-sycl.h"
#endif //#endif
//
#ifdef GGML_USE_METAL //#ifdef GGML_USE_METAL
#include "ggml-metal.h" //#include "ggml-metal.h"
#endif //#endif
//
#ifdef GGML_USE_CANN //#ifdef GGML_USE_CANN
#include "ggml-cann.h" //#include "ggml-cann.h"
#endif //#endif
//
#ifdef GGML_USE_VULKAN //#ifdef GGML_USE_VULKAN
#include "ggml-vulkan.h" //#include "ggml-vulkan.h"
#endif //#endif
#define STB_IMAGE_IMPLEMENTATION #define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h" #include "stb_image.h"
@ -1222,30 +1222,30 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
} }
} }
#ifdef GGML_USE_CUDA //#ifdef GGML_USE_CUDA
new_clip->backend = ggml_backend_cuda_init(0); // new_clip->backend = ggml_backend_cuda_init(0);
LOG_INF("%s: CLIP using CUDA backend\n", __func__); // LOG_INF("%s: CLIP using CUDA backend\n", __func__);
#endif //#endif
//
#ifdef GGML_USE_METAL //#ifdef GGML_USE_METAL
new_clip->backend = ggml_backend_metal_init(); // new_clip->backend = ggml_backend_metal_init();
LOG_INF("%s: CLIP using Metal backend\n", __func__); // LOG_INF("%s: CLIP using Metal backend\n", __func__);
#endif //#endif
//
#ifdef GGML_USE_CANN //#ifdef GGML_USE_CANN
new_clip->backend = ggml_backend_cann_init(0); // new_clip->backend = ggml_backend_cann_init(0);
LOG_INF("%s: CLIP using CANN backend\n", __func__); // LOG_INF("%s: CLIP using CANN backend\n", __func__);
#endif //#endif
//
#ifdef GGML_USE_VULKAN //#ifdef GGML_USE_VULKAN
new_clip->backend = ggml_backend_vk_init(0); // new_clip->backend = ggml_backend_vk_init(0);
LOG_INF("%s: CLIP using Vulkan backend\n", __func__); // LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
#endif //#endif
//
#ifdef GGML_USE_SYCL //#ifdef GGML_USE_SYCL
new_clip->backend = ggml_backend_sycl_init(0); // new_clip->backend = ggml_backend_sycl_init(0);
LOG_INF("%s: CLIP using SYCL backend\n", __func__); // LOG_INF("%s: CLIP using SYCL backend\n", __func__);
#endif //#endif
if (!new_clip->backend) { if (!new_clip->backend) {
new_clip->backend = ggml_backend_cpu_init(); new_clip->backend = ggml_backend_cpu_init();


@ -12,6 +12,10 @@
#include "ggml-vulkan.h" #include "ggml-vulkan.h"
#endif #endif
#ifdef GGML_USE_SYCL
#include "ggml-sycl.h"
#endif
#include "ggml-rpc.h" #include "ggml-rpc.h"
#ifdef _WIN32 #ifdef _WIN32
# include <windows.h> # include <windows.h>
@ -91,6 +95,12 @@ static ggml_backend_t create_backend() {
if (!backend) { if (!backend) {
fprintf(stderr, "%s: ggml_backend_vulkan_init() failed\n", __func__); fprintf(stderr, "%s: ggml_backend_vulkan_init() failed\n", __func__);
} }
#elif GGML_USE_SYCL
fprintf(stderr, "%s: using SYCL backend\n", __func__);
backend = ggml_backend_sycl_init(0); // init device 0
if (!backend) {
fprintf(stderr, "%s: ggml_backend_sycl_init() failed\n", __func__);
}
#endif #endif
// if there aren't GPU Backends fallback to CPU backend // if there aren't GPU Backends fallback to CPU backend
@ -106,6 +116,8 @@ static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
ggml_backend_cuda_get_device_memory(0, free_mem, total_mem); ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
#elif GGML_USE_VULKAN #elif GGML_USE_VULKAN
ggml_backend_vk_get_device_memory(0, free_mem, total_mem); ggml_backend_vk_get_device_memory(0, free_mem, total_mem);
#elif GGML_USE_SYCL
ggml_backend_sycl_get_device_memory(0, free_mem, total_mem);
#else #else
#ifdef _WIN32 #ifdef _WIN32
MEMORYSTATUSEX status; MEMORYSTATUSEX status;


@@ -19,6 +19,8 @@ Options:
      Context size (default: 2048)
  -n, --ngl <value>
      Number of GPU layers (default: 0)
  --temp <value>
      Temperature (default: 0.8)
  -v, --verbose, --log-verbose
      Set verbosity level to infinity (i.e. log all messages, useful for debugging)
  -h, --help


@ -1,5 +1,6 @@
#if defined(_WIN32) #if defined(_WIN32)
# include <windows.h> # include <windows.h>
# include <io.h>
#else #else
# include <sys/file.h> # include <sys/file.h>
# include <sys/ioctl.h> # include <sys/ioctl.h>
@ -55,29 +56,51 @@ static int printe(const char * fmt, ...) {
class Opt { class Opt {
public: public:
int init(int argc, const char ** argv) { int init(int argc, const char ** argv) {
ctx_params = llama_context_default_params();
model_params = llama_model_default_params();
context_size_default = ctx_params.n_batch;
ngl_default = model_params.n_gpu_layers;
common_params_sampling sampling;
temperature_default = sampling.temp;
if (argc < 2) {
printe("Error: No arguments provided.\n");
print_help();
return 1;
}
// Parse arguments // Parse arguments
if (parse(argc, argv)) { if (parse(argc, argv)) {
printe("Error: Failed to parse arguments.\n"); printe("Error: Failed to parse arguments.\n");
help(); print_help();
return 1; return 1;
} }
// If help is requested, show help and exit // If help is requested, show help and exit
if (help_) { if (help) {
help(); print_help();
return 2; return 2;
} }
ctx_params.n_batch = context_size >= 0 ? context_size : context_size_default;
model_params.n_gpu_layers = ngl >= 0 ? ngl : ngl_default;
temperature = temperature >= 0 ? temperature : temperature_default;
return 0; // Success return 0; // Success
} }
llama_context_params ctx_params;
llama_model_params model_params;
std::string model_; std::string model_;
std::string user_; std::string user;
int context_size_ = -1, ngl_ = -1; int context_size = -1, ngl = -1;
bool verbose_ = false; float temperature = -1;
bool verbose = false;
private: private:
bool help_ = false; int context_size_default = -1, ngl_default = -1;
float temperature_default = -1;
bool help = false;
bool parse_flag(const char ** argv, int i, const char * short_opt, const char * long_opt) { bool parse_flag(const char ** argv, int i, const char * short_opt, const char * long_opt) {
return strcmp(argv[i], short_opt) == 0 || strcmp(argv[i], long_opt) == 0; return strcmp(argv[i], short_opt) == 0 || strcmp(argv[i], long_opt) == 0;
@ -89,6 +112,17 @@ class Opt {
} }
option_value = std::atoi(argv[++i]); option_value = std::atoi(argv[++i]);
return 0;
}
int handle_option_with_value(int argc, const char ** argv, int & i, float & option_value) {
if (i + 1 >= argc) {
return 1;
}
option_value = std::atof(argv[++i]);
return 0; return 0;
} }
@ -96,18 +130,22 @@ class Opt {
bool options_parsing = true; bool options_parsing = true;
for (int i = 1, positional_args_i = 0; i < argc; ++i) { for (int i = 1, positional_args_i = 0; i < argc; ++i) {
if (options_parsing && (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0)) { if (options_parsing && (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0)) {
if (handle_option_with_value(argc, argv, i, context_size_) == 1) { if (handle_option_with_value(argc, argv, i, context_size) == 1) {
return 1; return 1;
} }
} else if (options_parsing && (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "--ngl") == 0)) { } else if (options_parsing && (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "--ngl") == 0)) {
if (handle_option_with_value(argc, argv, i, ngl_) == 1) { if (handle_option_with_value(argc, argv, i, ngl) == 1) {
return 1;
}
} else if (options_parsing && strcmp(argv[i], "--temp") == 0) {
if (handle_option_with_value(argc, argv, i, temperature) == 1) {
return 1; return 1;
} }
} else if (options_parsing && } else if (options_parsing &&
(parse_flag(argv, i, "-v", "--verbose") || parse_flag(argv, i, "-v", "--log-verbose"))) { (parse_flag(argv, i, "-v", "--verbose") || parse_flag(argv, i, "-v", "--log-verbose"))) {
verbose_ = true; verbose = true;
} else if (options_parsing && parse_flag(argv, i, "-h", "--help")) { } else if (options_parsing && parse_flag(argv, i, "-h", "--help")) {
help_ = true; help = true;
return 0; return 0;
} else if (options_parsing && strcmp(argv[i], "--") == 0) { } else if (options_parsing && strcmp(argv[i], "--") == 0) {
options_parsing = false; options_parsing = false;
@ -120,16 +158,16 @@ class Opt {
model_ = argv[i]; model_ = argv[i];
} else if (positional_args_i == 1) { } else if (positional_args_i == 1) {
++positional_args_i; ++positional_args_i;
user_ = argv[i]; user = argv[i];
} else { } else {
user_ += " " + std::string(argv[i]); user += " " + std::string(argv[i]);
} }
} }
return 0; return 0;
} }
void help() const { void print_help() const {
printf( printf(
"Description:\n" "Description:\n"
" Runs a llm\n" " Runs a llm\n"
@ -142,6 +180,8 @@ class Opt {
" Context size (default: %d)\n" " Context size (default: %d)\n"
" -n, --ngl <value>\n" " -n, --ngl <value>\n"
" Number of GPU layers (default: %d)\n" " Number of GPU layers (default: %d)\n"
" --temp <value>\n"
" Temperature (default: %.1f)\n"
" -v, --verbose, --log-verbose\n" " -v, --verbose, --log-verbose\n"
" Set verbosity level to infinity (i.e. log all messages, useful for debugging)\n" " Set verbosity level to infinity (i.e. log all messages, useful for debugging)\n"
" -h, --help\n" " -h, --help\n"
@ -170,7 +210,7 @@ class Opt {
" llama-run file://some-file3.gguf\n" " llama-run file://some-file3.gguf\n"
" llama-run --ngl 999 some-file4.gguf\n" " llama-run --ngl 999 some-file4.gguf\n"
" llama-run --ngl 999 some-file5.gguf Hello World\n", " llama-run --ngl 999 some-file5.gguf Hello World\n",
llama_context_default_params().n_batch, llama_model_default_params().n_gpu_layers); context_size_default, ngl_default, temperature_default);
} }
}; };
@ -214,7 +254,7 @@ class File {
return 1; return 1;
} }
OVERLAPPED overlapped = { 0 }; OVERLAPPED overlapped = {};
if (!LockFileEx(hFile, LOCKFILE_EXCLUSIVE_LOCK | LOCKFILE_FAIL_IMMEDIATELY, 0, MAXDWORD, MAXDWORD, if (!LockFileEx(hFile, LOCKFILE_EXCLUSIVE_LOCK | LOCKFILE_FAIL_IMMEDIATELY, 0, MAXDWORD, MAXDWORD,
&overlapped)) { &overlapped)) {
fd = -1; fd = -1;
@ -238,7 +278,7 @@ class File {
if (fd >= 0) { if (fd >= 0) {
# ifdef _WIN32 # ifdef _WIN32
if (hFile != INVALID_HANDLE_VALUE) { if (hFile != INVALID_HANDLE_VALUE) {
OVERLAPPED overlapped = { 0 }; OVERLAPPED overlapped = {};
UnlockFileEx(hFile, 0, MAXDWORD, MAXDWORD, &overlapped); UnlockFileEx(hFile, 0, MAXDWORD, MAXDWORD, &overlapped);
} }
# else # else
@ -254,7 +294,7 @@ class File {
private: private:
int fd = -1; int fd = -1;
# ifdef _WIN32 # ifdef _WIN32
HANDLE hFile; HANDLE hFile = nullptr;
# endif # endif
}; };
@ -425,7 +465,7 @@ class HttpClient {
return (now_downloaded_plus_file_size * 100) / total_to_download; return (now_downloaded_plus_file_size * 100) / total_to_download;
} }
static std::string generate_progress_prefix(curl_off_t percentage) { return fmt("%3ld%% |", percentage); } static std::string generate_progress_prefix(curl_off_t percentage) { return fmt("%3ld%% |", static_cast<long int>(percentage)); }
static double calculate_speed(curl_off_t now_downloaded, const std::chrono::steady_clock::time_point & start_time) { static double calculate_speed(curl_off_t now_downloaded, const std::chrono::steady_clock::time_point & start_time) {
const auto now = std::chrono::steady_clock::now(); const auto now = std::chrono::steady_clock::now();
@ -495,12 +535,12 @@ class LlamaData {
return 1; return 1;
} }
context = initialize_context(model, opt.context_size_); context = initialize_context(model, opt);
if (!context) { if (!context) {
return 1; return 1;
} }
sampler = initialize_sampler(); sampler = initialize_sampler(opt);
return 0; return 0;
} }
@ -619,14 +659,12 @@ class LlamaData {
// Initializes the model and returns a unique pointer to it // Initializes the model and returns a unique pointer to it
llama_model_ptr initialize_model(Opt & opt) { llama_model_ptr initialize_model(Opt & opt) {
ggml_backend_load_all(); ggml_backend_load_all();
llama_model_params model_params = llama_model_default_params();
model_params.n_gpu_layers = opt.ngl_ >= 0 ? opt.ngl_ : model_params.n_gpu_layers;
resolve_model(opt.model_); resolve_model(opt.model_);
printe( printe(
"\r%*s" "\r%*s"
"\rLoading model", "\rLoading model",
get_terminal_width(), " "); get_terminal_width(), " ");
llama_model_ptr model(llama_load_model_from_file(opt.model_.c_str(), model_params)); llama_model_ptr model(llama_load_model_from_file(opt.model_.c_str(), opt.model_params));
if (!model) { if (!model) {
printe("%s: error: unable to load model from file: %s\n", __func__, opt.model_.c_str()); printe("%s: error: unable to load model from file: %s\n", __func__, opt.model_.c_str());
} }
@ -636,10 +674,8 @@ class LlamaData {
} }
// Initializes the context with the specified parameters // Initializes the context with the specified parameters
llama_context_ptr initialize_context(const llama_model_ptr & model, const int n_ctx) { llama_context_ptr initialize_context(const llama_model_ptr & model, const Opt & opt) {
llama_context_params ctx_params = llama_context_default_params(); llama_context_ptr context(llama_new_context_with_model(model.get(), opt.ctx_params));
ctx_params.n_ctx = ctx_params.n_batch = n_ctx >= 0 ? n_ctx : ctx_params.n_batch;
llama_context_ptr context(llama_new_context_with_model(model.get(), ctx_params));
if (!context) { if (!context) {
printe("%s: error: failed to create the llama_context\n", __func__); printe("%s: error: failed to create the llama_context\n", __func__);
} }
@ -648,10 +684,10 @@ class LlamaData {
} }
// Initializes and configures the sampler // Initializes and configures the sampler
llama_sampler_ptr initialize_sampler() { llama_sampler_ptr initialize_sampler(const Opt & opt) {
llama_sampler_ptr sampler(llama_sampler_chain_init(llama_sampler_chain_default_params())); llama_sampler_ptr sampler(llama_sampler_chain_init(llama_sampler_chain_default_params()));
llama_sampler_chain_add(sampler.get(), llama_sampler_init_min_p(0.05f, 1)); llama_sampler_chain_add(sampler.get(), llama_sampler_init_min_p(0.05f, 1));
llama_sampler_chain_add(sampler.get(), llama_sampler_init_temp(0.8f)); llama_sampler_chain_add(sampler.get(), llama_sampler_init_temp(opt.temperature));
llama_sampler_chain_add(sampler.get(), llama_sampler_init_dist(LLAMA_DEFAULT_SEED)); llama_sampler_chain_add(sampler.get(), llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
return sampler; return sampler;
@ -798,9 +834,9 @@ static int apply_chat_template_with_error_handling(LlamaData & llama_data, const
} }
// Helper function to handle user input // Helper function to handle user input
static int handle_user_input(std::string & user_input, const std::string & user_) { static int handle_user_input(std::string & user_input, const std::string & user) {
if (!user_.empty()) { if (!user.empty()) {
user_input = user_; user_input = user;
return 0; // No need for interactive input return 0; // No need for interactive input
} }
@ -832,17 +868,17 @@ static bool is_stdout_a_terminal() {
} }
// Function to tokenize the prompt // Function to tokenize the prompt
static int chat_loop(LlamaData & llama_data, const std::string & user_) { static int chat_loop(LlamaData & llama_data, const std::string & user) {
int prev_len = 0; int prev_len = 0;
llama_data.fmtted.resize(llama_n_ctx(llama_data.context.get())); llama_data.fmtted.resize(llama_n_ctx(llama_data.context.get()));
static const bool stdout_a_terminal = is_stdout_a_terminal(); static const bool stdout_a_terminal = is_stdout_a_terminal();
while (true) { while (true) {
// Get user input // Get user input
std::string user_input; std::string user_input;
while (handle_user_input(user_input, user_)) { while (handle_user_input(user_input, user)) {
} }
add_message("user", user_.empty() ? user_input : user_, llama_data); add_message("user", user.empty() ? user_input : user, llama_data);
int new_len; int new_len;
if (apply_chat_template_with_error_handling(llama_data, true, new_len) < 0) { if (apply_chat_template_with_error_handling(llama_data, true, new_len) < 0) {
return 1; return 1;
@ -854,7 +890,7 @@ static int chat_loop(LlamaData & llama_data, const std::string & user_) {
return 1; return 1;
} }
if (!user_.empty()) { if (!user.empty()) {
break; break;
} }
@ -869,7 +905,7 @@ static int chat_loop(LlamaData & llama_data, const std::string & user_) {
static void log_callback(const enum ggml_log_level level, const char * text, void * p) { static void log_callback(const enum ggml_log_level level, const char * text, void * p) {
const Opt * opt = static_cast<Opt *>(p); const Opt * opt = static_cast<Opt *>(p);
if (opt->verbose_ || level == GGML_LOG_LEVEL_ERROR) { if (opt->verbose || level == GGML_LOG_LEVEL_ERROR) {
printe("%s", text); printe("%s", text);
} }
} }
@ -890,11 +926,11 @@ int main(int argc, const char ** argv) {
} }
if (!is_stdin_a_terminal()) { if (!is_stdin_a_terminal()) {
if (!opt.user_.empty()) { if (!opt.user.empty()) {
opt.user_ += "\n\n"; opt.user += "\n\n";
} }
opt.user_ += read_pipe_data(); opt.user += read_pipe_data();
} }
llama_log_set(log_callback, &opt); llama_log_set(log_callback, &opt);
@ -903,7 +939,7 @@ int main(int argc, const char ** argv) {
return 1; return 1;
} }
if (chat_loop(llama_data, opt.user_)) { if (chat_loop(llama_data, opt.user)) {
return 1; return 1;
} }


@@ -34,6 +34,7 @@ endforeach()
add_executable(${TARGET} ${TARGET_SRCS})
install(TARGETS ${TARGET} RUNTIME)

target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})

if (LLAMA_SERVER_SSL)


@@ -343,6 +343,10 @@ node index.js

### POST `/completion`: Given a `prompt`, it returns the predicted completion.

> [!IMPORTANT]
>
> This endpoint is **not** OAI-compatible. For an OAI-compatible client, use `/v1/completions` instead.

*Options:*

`prompt`: Provide the prompt for this completion as a string or as an array of strings or numbers representing tokens. Internally, if `cache_prompt` is `true`, the prompt is compared to the previous completion and only the "unseen" suffix is evaluated. A `BOS` token is inserted at the start, if all of the following conditions are true:

@@ -444,38 +448,70 @@ These words will not be included in the completion, so make sure to add them to

`timings_per_token`: Include prompt processing and text generation speed information in each response. Default: `false`

`post_sampling_probs`: Returns the probabilities of the top `n_probs` tokens after applying the sampling chain.

`response_fields`: A list of response fields, for example: `"response_fields": ["content", "generation_settings/n_predict"]`. If the specified field is missing, it will simply be omitted from the response without triggering an error. Note that fields with a slash will be unnested; for example, `generation_settings/n_predict` will move the field `n_predict` from the `generation_settings` object to the root of the response and give it a new name.
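As a quick illustration, a request that trims the response down to a couple of fields could look like the sketch below (this assumes a server on the default `http://localhost:8080`; the prompt and field list are arbitrary examples, not part of the original docs):

```shell
curl http://localhost:8080/completion \
    -H "Content-Type: application/json" \
    -d '{
        "prompt": "Building a website can be done in 10 simple steps:",
        "n_predict": 16,
        "response_fields": ["content", "generation_settings/n_predict"]
    }'
```

With `generation_settings/n_predict` unnested, the response should then contain only `content` plus a top-level `n_predict`.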
**Response format**

- Note: In streaming mode (`stream`), only `content`, `tokens` and `stop` will be returned until end of completion. Responses are sent using the [Server-sent events](https://html.spec.whatwg.org/multipage/server-sent-events.html) standard. Note: the browser's `EventSource` interface cannot be used due to its lack of `POST` request support.

- `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has a nested array `top_logprobs`. It contains at **maximum** `n_probs` elements:

```json
{
  "content": "<the generated completion text>",
  "tokens": [ generated token ids if requested ],
  ...
  "probs": [
    {
      "id": <token id>,
      "logprob": float,
      "token": "<most likely token>",
      "bytes": [int, int, ...],
      "top_logprobs": [
        {
          "id": <token id>,
          "logprob": float,
          "token": "<token text>",
          "bytes": [int, int, ...],
        },
        {
          "id": <token id>,
          "logprob": float,
          "token": "<token text>",
          "bytes": [int, int, ...],
        },
        ...
      ]
    },
    {
      "id": <token id>,
      "logprob": float,
      "token": "<most likely token>",
      "bytes": [int, int, ...],
      "top_logprobs": [
        ...
      ]
    },
    ...
  ]
},
```
Please note that if `post_sampling_probs` is set to `true`:
- `logprob` will be replaced with `prob`, with a value between 0.0 and 1.0
- `top_logprobs` will be replaced with `top_probs`. Each element contains:
  - `id`: token ID
  - `token`: token as a string
  - `bytes`: token as raw bytes
  - `prob`: token probability, with a value between 0.0 and 1.0
- The number of elements in `top_probs` may be less than `n_probs`
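For example, a request that asks for post-sampling probabilities of the top 2 candidates per generated token might look like the following sketch (server address, prompt, and values are placeholders):

```shell
curl http://localhost:8080/completion \
    -H "Content-Type: application/json" \
    -d '{
        "prompt": "The capital of France is",
        "n_predict": 4,
        "n_probs": 2,
        "post_sampling_probs": true
    }'
```

Each entry in `completion_probabilities` should then report `prob` and `top_probs` instead of `logprob` and `top_logprobs`.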
- `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
- `tokens`: Same as `content` but represented as raw token ids. Only populated if `"return_tokens": true` or `"stream": true` in the request.
- `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options)
- `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`. These options may differ from the original ones in some way (e.g. bad values filtered out, strings converted to tokens, etc.).
- `model`: The model alias (for model path, please use `/props` endpoint)
- `prompt`: The processed `prompt` (special tokens may be added)
- `stop_type`: Indicating whether the completion has stopped. Possible values are:
  - `none`: Generating (not stopped)
  - `eos`: Stopped because it encountered the EOS token
@@ -487,6 +523,7 @@ Notice that each `probs` is an array of length `n_probs`.
- `tokens_evaluated`: Number of tokens evaluated in total from the prompt
- `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`)
### POST `/tokenize`: Tokenize a given text

*Options:*

@@ -538,6 +575,10 @@ With input 'á' (utf8 hex: C3 A1) on tinyllama/stories260k

### POST `/embedding`: Generate embedding of a given text

> [!IMPORTANT]
>
> This endpoint is **not** OAI-compatible. For an OAI-compatible client, use `/v1/embeddings` instead.

The same as [the embedding example](../embedding) does.

*Options:*

@@ -690,7 +731,8 @@ This endpoint is public (no API key check). By default, it is read-only. To make

  },
  "total_slots": 1,
  "model_path": "../models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf",
  "chat_template": "...",
  "build_info": "b(build number)-(build commit hash)"
}
```
@@ -707,96 +749,6 @@ To use this endpoint with POST method, you need to start server with `--props`

- None yet
### POST `/v1/chat/completions`: OpenAI-compatible Chat Completions API
Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.
*Options:*
See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such as `mirostat` are supported.
The `response_format` parameter supports both plain JSON output (e.g. `{"type": "json_object"}`) and schema-constrained JSON (e.g. `{"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}` or `{"type": "json_schema", "schema": {"properties": { "name": { "title": "Name", "type": "string" }, "date": { "title": "Date", "type": "string" }, "participants": { "items": {"type: "string" }, "title": "Participants", "type": "string" } } } }`), similar to other OpenAI-inspired API providers.
*Examples:*
You can use either Python `openai` library with appropriate checkpoints:
```python
import openai
client = openai.OpenAI(
base_url="http://localhost:8080/v1", # "http://<Your api-server IP>:port"
api_key = "sk-no-key-required"
)
completion = client.chat.completions.create(
model="gpt-3.5-turbo",
messages=[
{"role": "system", "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."},
{"role": "user", "content": "Write a limerick about python exceptions"}
]
)
print(completion.choices[0].message)
```
... or raw HTTP requests:
```shell
curl http://localhost:8080/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer no-key" \
-d '{
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "system",
"content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."
},
{
"role": "user",
"content": "Write a limerick about python exceptions"
}
]
}'
```
### POST `/v1/embeddings`: OpenAI-compatible embeddings API
This endpoint requires that the model uses a pooling different than type `none`. The embeddings are normalized using the Eucledian norm.
*Options:*
See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-reference/embeddings).
*Examples:*
- input as string
```shell
curl http://localhost:8080/v1/embeddings \
-H "Content-Type: application/json" \
-H "Authorization: Bearer no-key" \
-d '{
"input": "hello",
"model":"GPT-4",
"encoding_format": "float"
}'
```
- `input` as string array
```shell
curl http://localhost:8080/v1/embeddings \
-H "Content-Type: application/json" \
-H "Authorization: Bearer no-key" \
-d '{
"input": ["hello", "world"],
"model":"GPT-4",
"encoding_format": "float"
}'
```
### POST `/embeddings`: non-OpenAI-compatible embeddings API ### POST `/embeddings`: non-OpenAI-compatible embeddings API
This endpoint supports all poolings, including `--pooling none`. When the pooling is `none`, the responses will contain the *unnormalized* embeddings for *all* input tokens. For all other pooling types, only the pooled embeddings are returned, normalized using Euclidian norm. This endpoint supports all poolings, including `--pooling none`. When the pooling is `none`, the responses will contain the *unnormalized* embeddings for *all* input tokens. For all other pooling types, only the pooled embeddings are returned, normalized using Euclidian norm.
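A minimal sketch of a request, assuming this endpoint accepts the same `content` field as `/embedding` and the server runs on the default port:

```shell
curl http://localhost:8080/embeddings \
    -H "Content-Type: application/json" \
    -d '{
        "content": "hello world"
    }'
```

With `--pooling none`, the reply is expected to contain one embedding vector per input token rather than a single pooled vector.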
@@ -1027,6 +979,161 @@ To know the `id` of the adapter, use GET `/lora-adapters`

]
```
## OpenAI-compatible API Endpoints
### GET `/v1/models`: OpenAI-compatible Model Info API
Returns information about the loaded model. See [OpenAI Models API documentation](https://platform.openai.com/docs/api-reference/models).
The returned list always has a single element.

By default, the model `id` field is the path to the model file, as specified via `-m`. You can set a custom value for the `id` field via the `--alias` argument, for example `--alias gpt-4o-mini`.
Example:
```json
{
"object": "list",
"data": [
{
"id": "../models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf",
"object": "model",
"created": 1735142223,
"owned_by": "llamacpp",
"meta": {
"vocab_type": 2,
"n_vocab": 128256,
"n_ctx_train": 131072,
"n_embd": 4096,
"n_params": 8030261312,
"size": 4912898304
}
}
]
}
```
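For reference, the endpoint can be queried with a plain GET request, e.g.:

```shell
curl http://localhost:8080/v1/models
```

If the server was started with an API key, add a matching `Authorization: Bearer <key>` header.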
### POST `/v1/completions`: OpenAI-compatible Completions API
Given an input `prompt`, it returns the predicted completion. Streaming mode is also supported. While no strong claims of compatibility with the OpenAI API spec are being made, in our experience it suffices to support many apps.
*Options:*
See [OpenAI Completions API documentation](https://platform.openai.com/docs/api-reference/completions).
llama.cpp `/completion`-specific features such as `mirostat` are supported.
*Examples:*
Example usage with `openai` python library:
```python
import openai
client = openai.OpenAI(
base_url="http://localhost:8080/v1", # "http://<Your api-server IP>:port"
api_key = "sk-no-key-required"
)
completion = client.completions.create(
model="davinci-002",
prompt="I believe the meaning of life is",
max_tokens=8
)
print(completion.choices[0].text)
```
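The same call can also be made as a raw HTTP request, in the same style as the chat completions example below (the `model` value is only a placeholder):

```shell
curl http://localhost:8080/v1/completions \
    -H "Content-Type: application/json" \
    -H "Authorization: Bearer no-key" \
    -d '{
        "model": "davinci-002",
        "prompt": "I believe the meaning of life is",
        "max_tokens": 8
    }'
```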
### POST `/v1/chat/completions`: OpenAI-compatible Chat Completions API
Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming modes are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with the OpenAI API spec are being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.
*Options:*
See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such as `mirostat` are supported.
The `response_format` parameter supports both plain JSON output (e.g. `{"type": "json_object"}`) and schema-constrained JSON (e.g. `{"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}` or `{"type": "json_schema", "schema": {"properties": { "name": { "title": "Name", "type": "string" }, "date": { "title": "Date", "type": "string" }, "participants": { "items": {"type": "string" }, "title": "Participants", "type": "string" } } } }`), similar to other OpenAI-inspired API providers.
*Examples:*
You can use either Python `openai` library with appropriate checkpoints:
```python
import openai
client = openai.OpenAI(
base_url="http://localhost:8080/v1", # "http://<Your api-server IP>:port"
api_key = "sk-no-key-required"
)
completion = client.chat.completions.create(
model="gpt-3.5-turbo",
messages=[
{"role": "system", "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."},
{"role": "user", "content": "Write a limerick about python exceptions"}
]
)
print(completion.choices[0].message)
```
... or raw HTTP requests:
```shell
curl http://localhost:8080/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer no-key" \
-d '{
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "system",
"content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."
},
{
"role": "user",
"content": "Write a limerick about python exceptions"
}
]
}'
```
### POST `/v1/embeddings`: OpenAI-compatible embeddings API
This endpoint requires that the model uses a pooling type other than `none`. The embeddings are normalized using the Euclidean norm.
*Options:*
See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-reference/embeddings).
*Examples:*
- input as string
```shell
curl http://localhost:8080/v1/embeddings \
-H "Content-Type: application/json" \
-H "Authorization: Bearer no-key" \
-d '{
"input": "hello",
"model":"GPT-4",
"encoding_format": "float"
}'
```
- `input` as string array
```shell
curl http://localhost:8080/v1/embeddings \
-H "Content-Type: application/json" \
-H "Authorization: Bearer no-key" \
-d '{
"input": ["hello", "world"],
"model":"GPT-4",
"encoding_format": "float"
}'
```
## More examples

### Interactive mode



@ -67,6 +67,13 @@ enum server_task_type {
SERVER_TASK_TYPE_SET_LORA, SERVER_TASK_TYPE_SET_LORA,
}; };
enum oaicompat_type {
OAICOMPAT_TYPE_NONE,
OAICOMPAT_TYPE_CHAT,
OAICOMPAT_TYPE_COMPLETION,
OAICOMPAT_TYPE_EMBEDDING,
};
// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11 // https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
enum error_type { enum error_type {
ERROR_TYPE_INVALID_REQUEST, ERROR_TYPE_INVALID_REQUEST,
@ -92,7 +99,9 @@ struct slot_params {
int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
std::vector<std::string> antiprompt; std::vector<std::string> antiprompt;
std::vector<std::string> response_fields;
bool timings_per_token = false; bool timings_per_token = false;
bool post_sampling_probs = false;
bool ignore_eos = false; bool ignore_eos = false;
struct common_params_sampling sampling; struct common_params_sampling sampling;
@ -100,8 +109,7 @@ struct slot_params {
// OAI-compat fields // OAI-compat fields
bool verbose = false; bool verbose = false;
bool oaicompat = false; oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
bool oaicompat_chat = true;
std::string oaicompat_model; std::string oaicompat_model;
std::string oaicompat_cmpl_id; std::string oaicompat_cmpl_id;
@ -151,6 +159,7 @@ struct slot_params {
{"speculative.n_min", speculative.n_min}, {"speculative.n_min", speculative.n_min},
{"speculative.p_min", speculative.p_min}, {"speculative.p_min", speculative.p_min},
{"timings_per_token", timings_per_token}, {"timings_per_token", timings_per_token},
{"post_sampling_probs", post_sampling_probs},
}; };
} }
}; };
@ -207,6 +216,7 @@ struct server_task {
params.n_discard = json_value(data, "n_discard", defaults.n_discard); params.n_discard = json_value(data, "n_discard", defaults.n_discard);
//params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", defaults.t_max_prompt_ms); // TODO: implement //params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", defaults.t_max_prompt_ms); // TODO: implement
params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms); params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms);
params.response_fields = json_value(data, "response_fields", std::vector<std::string>());
params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k); params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k);
params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p); params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p);
@ -231,6 +241,7 @@ struct server_task {
params.sampling.seed = json_value(data, "seed", defaults.sampling.seed); params.sampling.seed = json_value(data, "seed", defaults.sampling.seed);
params.sampling.n_probs = json_value(data, "n_probs", defaults.sampling.n_probs); params.sampling.n_probs = json_value(data, "n_probs", defaults.sampling.n_probs);
params.sampling.min_keep = json_value(data, "min_keep", defaults.sampling.min_keep); params.sampling.min_keep = json_value(data, "min_keep", defaults.sampling.min_keep);
params.post_sampling_probs = json_value(data, "post_sampling_probs", defaults.post_sampling_probs);
params.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min); params.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min);
params.speculative.n_max = json_value(data, "speculative.n_max", defaults.speculative.n_max); params.speculative.n_max = json_value(data, "speculative.n_max", defaults.speculative.n_max);
@ -436,36 +447,67 @@ inline std::string stop_type_to_str(stop_type type) {
struct completion_token_output { struct completion_token_output {
llama_token tok; llama_token tok;
float prob;
std::string text_to_send; std::string text_to_send;
struct token_prob { struct prob_info {
llama_token tok; llama_token tok;
std::string tok_str; std::string txt;
float prob; float prob;
}; };
std::vector<token_prob> probs; std::vector<prob_info> probs;
json to_json() const { json to_json(bool post_sampling_probs) const {
json probs_for_token = json::array(); json probs_for_token = json::array();
for (const auto & p : probs) { for (const auto & p : probs) {
std::string txt(p.txt);
txt.resize(validate_utf8(txt));
probs_for_token.push_back(json { probs_for_token.push_back(json {
{"tok_str", p.tok_str}, {"id", p.tok},
{"prob", p.prob}, {"token", txt},
{"bytes", str_to_bytes(p.txt)},
{
post_sampling_probs ? "prob" : "logprob",
post_sampling_probs ? p.prob : logarithm(p.prob)
},
}); });
} }
return probs_for_token; return probs_for_token;
} }
static json probs_vector_to_json(const std::vector<completion_token_output> & probs) { static json probs_vector_to_json(const std::vector<completion_token_output> & probs, bool post_sampling_probs) {
json out = json::array(); json out = json::array();
for (const auto & prob : probs) { for (const auto & p : probs) {
const std::string tok_str = prob.text_to_send; std::string txt(p.text_to_send);
txt.resize(validate_utf8(txt));
out.push_back(json { out.push_back(json {
{"content", tok_str}, {"id", p.tok},
{"probs", prob.to_json()}, {"token", txt},
{"bytes", str_to_bytes(p.text_to_send)},
{
post_sampling_probs ? "prob" : "logprob",
post_sampling_probs ? p.prob : logarithm(p.prob)
},
{
post_sampling_probs ? "top_probs" : "top_logprobs",
p.to_json(post_sampling_probs)
},
}); });
} }
return out; return out;
} }
static float logarithm(float x) {
// nlohmann::json converts -inf to null, so we need to prevent that
return x == 0.0f ? std::numeric_limits<float>::lowest() : std::log(x);
}
static std::vector<unsigned char> str_to_bytes(const std::string & str) {
std::vector<unsigned char> bytes;
for (unsigned char c : str) {
bytes.push_back(c);
}
return bytes;
}
}; };
struct server_task_result_cmpl_final : server_task_result { struct server_task_result_cmpl_final : server_task_result {
@ -486,14 +528,15 @@ struct server_task_result_cmpl_final : server_task_result {
std::string stopping_word; std::string stopping_word;
stop_type stop = STOP_TYPE_NONE; stop_type stop = STOP_TYPE_NONE;
bool post_sampling_probs;
std::vector<completion_token_output> probs_output; std::vector<completion_token_output> probs_output;
std::vector<std::string> response_fields;
slot_params generation_params; slot_params generation_params;
// OAI-compat fields // OAI-compat fields
bool verbose = false; bool verbose = false;
bool oaicompat = false; oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
bool oaicompat_chat = true; // TODO: support oaicompat for non-chat
std::string oaicompat_model; std::string oaicompat_model;
std::string oaicompat_cmpl_id; std::string oaicompat_cmpl_id;
@ -506,9 +549,16 @@ struct server_task_result_cmpl_final : server_task_result {
} }
virtual json to_json() override { virtual json to_json() override {
return oaicompat switch (oaicompat) {
? (stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat()) case OAICOMPAT_TYPE_NONE:
: to_json_non_oaicompat(); return to_json_non_oaicompat();
case OAICOMPAT_TYPE_COMPLETION:
return to_json_oaicompat();
case OAICOMPAT_TYPE_CHAT:
return stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat();
default:
GGML_ASSERT(false && "Invalid oaicompat_type");
}
} }
json to_json_non_oaicompat() { json to_json_non_oaicompat() {
@ -530,9 +580,53 @@ struct server_task_result_cmpl_final : server_task_result {
{"tokens_cached", n_tokens_cached}, {"tokens_cached", n_tokens_cached},
{"timings", timings.to_json()}, {"timings", timings.to_json()},
}; };
if (!probs_output.empty()) { if (!stream && !probs_output.empty()) {
res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output); res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs);
} }
return response_fields.empty() ? res : json_get_nested_values(response_fields, res);
}
json to_json_oaicompat() {
std::time_t t = std::time(0);
json logprobs = json(nullptr); // OAI default to null
if (!stream && probs_output.size() > 0) {
logprobs = json{
{"content", completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs)},
};
}
json finish_reason = "length";
if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
finish_reason = "stop";
}
json res = json {
{"choices", json::array({
json{
{"text", stream ? "" : content}, // in stream mode, content is already in last partial chunk
{"index", index},
{"logprobs", logprobs},
{"finish_reason", finish_reason},
}
})},
{"created", t},
{"model", oaicompat_model},
{"system_fingerprint", build_info},
{"object", "text_completion"},
{"usage", json {
{"completion_tokens", n_decoded},
{"prompt_tokens", n_prompt_tokens},
{"total_tokens", n_decoded + n_prompt_tokens}
}},
{"id", oaicompat_cmpl_id}
};
// extra fields for debugging purposes
if (verbose) {
res["__verbose"] = to_json_non_oaicompat();
}
if (timings.prompt_n >= 0) {
res.push_back({"timings", timings.to_json()});
}
return res; return res;
} }
@ -542,21 +636,28 @@ struct server_task_result_cmpl_final : server_task_result {
finish_reason = "stop"; finish_reason = "stop";
} }
json choices = json::array({json{ json choice = json{
{"finish_reason", finish_reason}, {"finish_reason", finish_reason},
{"index", 0}, {"index", 0},
{"message", json { {"message", json {
{"content", content}, {"content", content},
{"role", "assistant"} {"role", "assistant"}
} }
}}}); }};
if (!stream && probs_output.size() > 0) {
choice["logprobs"] = json{
{"content", completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs)},
};
}
std::time_t t = std::time(0); std::time_t t = std::time(0);
json res = json { json res = json {
{"choices", choices}, {"choices", json::array({choice})},
{"created", t}, {"created", t},
{"model", oaicompat_model}, {"model", oaicompat_model},
{"system_fingerprint", build_info},
{"object", "chat.completion"}, {"object", "chat.completion"},
{"usage", json { {"usage", json {
{"completion_tokens", n_decoded}, {"completion_tokens", n_decoded},
@ -584,15 +685,18 @@ struct server_task_result_cmpl_final : server_task_result {
finish_reason = "stop"; finish_reason = "stop";
} }
json choices = json::array({json{{"finish_reason", finish_reason}, json choice = json{
{"finish_reason", finish_reason},
{"index", 0}, {"index", 0},
{"delta", json::object()}}}); {"delta", json::object()}
};
json ret = json { json ret = json {
{"choices", choices}, {"choices", json::array({choice})},
{"created", t}, {"created", t},
{"id", oaicompat_cmpl_id}, {"id", oaicompat_cmpl_id},
{"model", oaicompat_model}, {"model", oaicompat_model},
{"system_fingerprint", build_info},
{"object", "chat.completion.chunk"}, {"object", "chat.completion.chunk"},
{"usage", json { {"usage", json {
{"completion_tokens", n_decoded}, {"completion_tokens", n_decoded},
@ -618,13 +722,13 @@ struct server_task_result_cmpl_partial : server_task_result {
int32_t n_decoded; int32_t n_decoded;
int32_t n_prompt_tokens; int32_t n_prompt_tokens;
std::vector<completion_token_output> probs_output; bool post_sampling_probs;
completion_token_output prob_output;
result_timings timings; result_timings timings;
// OAI-compat fields // OAI-compat fields
bool verbose = false; bool verbose = false;
bool oaicompat = false; oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
bool oaicompat_chat = true; // TODO: support oaicompat for non-chat
std::string oaicompat_model; std::string oaicompat_model;
std::string oaicompat_cmpl_id; std::string oaicompat_cmpl_id;
@ -637,7 +741,16 @@ struct server_task_result_cmpl_partial : server_task_result {
} }
virtual json to_json() override { virtual json to_json() override {
return oaicompat ? to_json_oaicompat() : to_json_non_oaicompat(); switch (oaicompat) {
case OAICOMPAT_TYPE_NONE:
return to_json_non_oaicompat();
case OAICOMPAT_TYPE_COMPLETION:
return to_json_oaicompat();
case OAICOMPAT_TYPE_CHAT:
return to_json_oaicompat_chat();
default:
GGML_ASSERT(false && "Invalid oaicompat_type");
}
} }
json to_json_non_oaicompat() { json to_json_non_oaicompat() {
@ -655,13 +768,48 @@ struct server_task_result_cmpl_partial : server_task_result {
if (timings.prompt_n > 0) { if (timings.prompt_n > 0) {
res.push_back({"timings", timings.to_json()}); res.push_back({"timings", timings.to_json()});
} }
if (!probs_output.empty()) { if (!prob_output.probs.empty()) {
res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output); res["completion_probabilities"] = completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs);
} }
return res; return res;
} }
json to_json_oaicompat() { json to_json_oaicompat() {
std::time_t t = std::time(0);
json logprobs = json(nullptr); // OAI default to null
if (prob_output.probs.size() > 0) {
logprobs = json{
{"content", completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs)},
};
}
json res = json {
{"choices", json::array({
json{
{"text", content},
{"index", index},
{"logprobs", logprobs},
{"finish_reason", nullptr},
}
})},
{"created", t},
{"model", oaicompat_model},
{"system_fingerprint", build_info},
{"object", "text_completion"},
{"id", oaicompat_cmpl_id}
};
// extra fields for debugging purposes
if (verbose) {
res["__verbose"] = to_json_non_oaicompat();
}
if (timings.prompt_n >= 0) {
res.push_back({"timings", timings.to_json()});
}
return res;
}
json to_json_oaicompat_chat() {
bool first = n_decoded == 0; bool first = n_decoded == 0;
std::time_t t = std::time(0); std::time_t t = std::time(0);
json choices; json choices;
@ -708,11 +856,20 @@ struct server_task_result_cmpl_partial : server_task_result {
}}); }});
} }
GGML_ASSERT(choices.size() >= 1);
if (prob_output.probs.size() > 0) {
choices[0]["logprobs"] = json{
{"content", completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs)},
};
}
json ret = json { json ret = json {
{"choices", choices}, {"choices", choices},
{"created", t}, {"created", t},
{"id", oaicompat_cmpl_id}, {"id", oaicompat_cmpl_id},
{"model", oaicompat_model}, {"model", oaicompat_model},
{"system_fingerprint", build_info},
{"object", "chat.completion.chunk"} {"object", "chat.completion.chunk"}
}; };
@ -731,14 +888,16 @@ struct server_task_result_embd : server_task_result {
int32_t n_tokens; int32_t n_tokens;
// OAI-compat fields // OAI-compat fields
bool oaicompat = false; oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
virtual int get_index() override { virtual int get_index() override {
return index; return index;
} }
virtual json to_json() override { virtual json to_json() override {
return oaicompat ? to_json_oaicompat() : to_json_non_oaicompat(); return oaicompat == OAICOMPAT_TYPE_EMBEDDING
? to_json_oaicompat()
: to_json_non_oaicompat();
} }
json to_json_non_oaicompat() { json to_json_non_oaicompat() {
@ -1001,7 +1160,6 @@ struct server_slot {
// stats // stats
size_t n_sent_text = 0; // number of sent text character size_t n_sent_text = 0; // number of sent text character
size_t n_sent_token_probs = 0;
int64_t t_start_process_prompt; int64_t t_start_process_prompt;
int64_t t_start_generation; int64_t t_start_generation;
@ -1023,7 +1181,6 @@ struct server_slot {
stopping_word = ""; stopping_word = "";
n_past = 0; n_past = 0;
n_sent_text = 0; n_sent_text = 0;
n_sent_token_probs = 0;
task_type = SERVER_TASK_TYPE_COMPLETION; task_type = SERVER_TASK_TYPE_COMPLETION;
generated_tokens.clear(); generated_tokens.clear();
@ -1764,7 +1921,7 @@ struct server_context {
bool process_token(completion_token_output & result, server_slot & slot) { bool process_token(completion_token_output & result, server_slot & slot) {
// remember which tokens were sampled - used for repetition penalties during sampling // remember which tokens were sampled - used for repetition penalties during sampling
const std::string token_str = common_token_to_piece(ctx, result.tok, params_base.special); const std::string token_str = result.text_to_send;
slot.sampled = result.tok; slot.sampled = result.tok;
slot.generated_text += token_str; slot.generated_text += token_str;
@ -1774,26 +1931,7 @@ struct server_context {
slot.has_next_token = true; slot.has_next_token = true;
// check if there is incomplete UTF-8 character at the end // check if there is incomplete UTF-8 character at the end
bool incomplete = false; bool incomplete = validate_utf8(slot.generated_text) < slot.generated_text.size();
for (unsigned i = 1; i < 5 && i <= slot.generated_text.size(); ++i) {
unsigned char c = slot.generated_text[slot.generated_text.size() - i];
if ((c & 0xC0) == 0x80) {
// continuation byte: 10xxxxxx
continue;
}
if ((c & 0xE0) == 0xC0) {
// 2-byte character: 110xxxxx ...
incomplete = i < 2;
} else if ((c & 0xF0) == 0xE0) {
// 3-byte character: 1110xxxx ...
incomplete = i < 3;
} else if ((c & 0xF8) == 0xF0) {
// 4-byte character: 11110xxx ...
incomplete = i < 4;
}
// else 1-byte character or invalid byte
break;
}
// search stop word and delete it // search stop word and delete it
if (!incomplete) { if (!incomplete) {
@ -1819,6 +1957,8 @@ struct server_context {
result.text_to_send = slot.generated_text.substr(pos, std::string::npos); result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
slot.n_sent_text += result.text_to_send.size(); slot.n_sent_text += result.text_to_send.size();
// add the token to slot queue and cache // add the token to slot queue and cache
} else {
result.text_to_send = "";
} }
slot.add_token(result); slot.add_token(result);
@ -1923,6 +2063,55 @@ struct server_context {
return slot.has_next_token; // continue return slot.has_next_token; // continue
} }
void populate_token_probs(const server_slot & slot, completion_token_output & result, bool post_sampling, bool special, int idx) {
size_t n_probs = slot.params.sampling.n_probs;
size_t n_vocab = llama_n_vocab(llama_get_model(ctx));
if (post_sampling) {
const auto * cur_p = common_sampler_get_candidates(slot.smpl);
const size_t max_probs = cur_p->size;
// set probability for sampled token
for (size_t i = 0; i < max_probs; i++) {
if (cur_p->data[i].id == result.tok) {
result.prob = cur_p->data[i].p;
break;
}
}
// set probability for top n_probs tokens
result.probs.reserve(max_probs);
for (size_t i = 0; i < std::min(max_probs, n_probs); i++) {
result.probs.push_back({
cur_p->data[i].id,
common_detokenize(ctx, {cur_p->data[i].id}, special),
cur_p->data[i].p
});
}
} else {
// TODO: optimize this with min-p optimization
std::vector<llama_token_data> cur = get_token_probabilities(ctx, idx);
// set probability for sampled token
for (size_t i = 0; i < n_vocab; i++) {
// set probability for sampled token
if (cur[i].id == result.tok) {
result.prob = cur[i].p;
break;
}
}
// set probability for top n_probs tokens
result.probs.reserve(n_probs);
for (size_t i = 0; i < std::min(n_vocab, n_probs); i++) {
result.probs.push_back({
cur[i].id,
common_detokenize(ctx, {cur[i].id}, special),
cur[i].p
});
}
}
}
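
(For context: a minimal client-side sketch of what populate_token_probs ends up exposing, assuming a llama-server built from this branch is listening on http://localhost:8080; the endpoint and field names come from the handlers and tests later in this diff, the prompt and parameter values are illustrative only.)

# Request per-token probabilities from a running llama-server (sketch, stdlib only).
import json
import urllib.request

payload = {
    "prompt": "I believe the meaning of life is",
    "n_predict": 5,
    "n_probs": 10,               # top-10 candidates per generated token
    "post_sampling_probs": True, # report post-sampling probabilities instead of raw logprobs
}
req = urllib.request.Request(
    "http://localhost:8080/completion",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    body = json.load(resp)

for tok in body["completion_probabilities"]:
    # with post_sampling_probs the per-token fields are "prob"/"top_probs",
    # otherwise they are "logprob"/"top_logprobs"
    print(tok["token"], tok["prob"], [p["token"] for p in tok["top_probs"]])
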
void send_error(const server_task & task, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) { void send_error(const server_task & task, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) {
send_error(task.id, error, type); send_error(task.id, error, type);
} }
@ -1952,26 +2141,16 @@ struct server_context {
res->n_decoded = slot.n_decoded; res->n_decoded = slot.n_decoded;
res->n_prompt_tokens = slot.n_prompt_tokens; res->n_prompt_tokens = slot.n_prompt_tokens;
res->post_sampling_probs = slot.params.post_sampling_probs;
res->verbose = slot.params.verbose; res->verbose = slot.params.verbose;
res->oaicompat = slot.params.oaicompat; res->oaicompat = slot.params.oaicompat;
res->oaicompat_chat = slot.params.oaicompat_chat;
res->oaicompat_model = slot.params.oaicompat_model; res->oaicompat_model = slot.params.oaicompat_model;
res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id; res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id;
// populate res.probs_output // populate res.probs_output
if (slot.params.sampling.n_probs > 0) { if (slot.params.sampling.n_probs > 0) {
const llama_tokens to_send_toks = common_tokenize(ctx, tkn.text_to_send, false); res->prob_output = tkn; // copy the token probs
const size_t probs_pos = std::min(slot.n_sent_token_probs, slot.generated_token_probs.size());
const size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size());
std::vector<completion_token_output> probs_output;
if (probs_pos < probs_stop_pos) {
res->probs_output = std::vector<completion_token_output>(
slot.generated_token_probs.begin() + probs_pos,
slot.generated_token_probs.begin() + probs_stop_pos);
}
} }
// populate timings if this is final response or timings_per_token is enabled // populate timings if this is final response or timings_per_token is enabled
@ -1992,6 +2171,7 @@ struct server_context {
res->tokens = slot.generated_tokens; res->tokens = slot.generated_tokens;
res->timings = slot.get_timings(); res->timings = slot.get_timings();
res->prompt = common_detokenize(ctx, slot.prompt_tokens, true); res->prompt = common_detokenize(ctx, slot.prompt_tokens, true);
res->response_fields = slot.params.response_fields;
res->truncated = slot.truncated; res->truncated = slot.truncated;
res->n_decoded = slot.n_decoded; res->n_decoded = slot.n_decoded;
@ -2000,11 +2180,11 @@ struct server_context {
res->has_new_line = slot.has_new_line; res->has_new_line = slot.has_new_line;
res->stopping_word = slot.stopping_word; res->stopping_word = slot.stopping_word;
res->stop = slot.stop; res->stop = slot.stop;
res->post_sampling_probs = slot.params.post_sampling_probs;
res->verbose = slot.params.verbose; res->verbose = slot.params.verbose;
res->stream = slot.params.stream; res->stream = slot.params.stream;
res->oaicompat = slot.params.oaicompat; res->oaicompat = slot.params.oaicompat;
res->oaicompat_chat = slot.params.oaicompat_chat;
res->oaicompat_model = slot.params.oaicompat_model; res->oaicompat_model = slot.params.oaicompat_model;
res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id; res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id;
@ -2796,7 +2976,9 @@ struct server_context {
continue; // continue loop of slots continue; // continue loop of slots
} }
llama_token id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i); const int tok_idx = slot.i_batch - i;
llama_token id = common_sampler_sample(slot.smpl, ctx, tok_idx);
slot.i_batch = -1; slot.i_batch = -1;
@ -2816,16 +2998,11 @@ struct server_context {
completion_token_output result; completion_token_output result;
result.tok = id; result.tok = id;
result.text_to_send = common_token_to_piece(ctx, result.tok, params_base.special);
result.prob = 1.0f; // TODO: set it here instead of doing inside populate_token_probs
const auto * cur_p = common_sampler_get_candidates(slot.smpl); if (slot.params.sampling.n_probs > 0) {
populate_token_probs(slot, result, slot.params.post_sampling_probs, params_base.special, tok_idx);
for (size_t i = 0; i < (size_t) slot.params.sampling.n_probs; ++i) {
auto tok_id = cur_p->data[i].id;
result.probs.push_back({
tok_id,
tokens_to_output_formatted_string(ctx, tok_id),
i >= cur_p->size ? 0.0f : cur_p->data[i].p,
});
} }
if (!process_token(result, slot)) { if (!process_token(result, slot)) {
@ -2910,6 +3087,10 @@ struct server_context {
completion_token_output result; completion_token_output result;
result.tok = ids[i]; result.tok = ids[i];
result.text_to_send = common_token_to_piece(ctx, result.tok, params_base.special);
result.prob = 1.0f; // set later
// TODO: set result.probs
if (!process_token(result, slot)) { if (!process_token(result, slot)) {
// release slot because of stop condition // release slot because of stop condition
@ -3403,6 +3584,7 @@ int main(int argc, char ** argv) {
{ "total_slots", ctx_server.params_base.n_parallel }, { "total_slots", ctx_server.params_base.n_parallel },
{ "model_path", ctx_server.params_base.model }, { "model_path", ctx_server.params_base.model },
{ "chat_template", llama_get_chat_template(ctx_server.model) }, { "chat_template", llama_get_chat_template(ctx_server.model) },
{ "build_info", build_info },
}; };
res_ok(res, data); res_ok(res, data);
@ -3423,12 +3605,11 @@ int main(int argc, char ** argv) {
// handle completion-like requests (completion, chat, infill) // handle completion-like requests (completion, chat, infill)
// we can optionally provide a custom format for partial results and final results // we can optionally provide a custom format for partial results and final results
const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok]( const auto handle_completions_impl = [&ctx_server, &res_error, &res_ok](
server_task_type type, server_task_type type,
json & data, json & data,
httplib::Response & res, httplib::Response & res,
bool oaicompat = false, oaicompat_type oaicompat) {
bool oaicompat_chat = false) {
GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL); GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL);
if (ctx_server.params_base.embedding) { if (ctx_server.params_base.embedding) {
@ -3454,7 +3635,6 @@ int main(int argc, char ** argv) {
// OAI-compat // OAI-compat
task.params.oaicompat = oaicompat; task.params.oaicompat = oaicompat;
task.params.oaicompat_chat = oaicompat_chat;
task.params.oaicompat_cmpl_id = completion_id; task.params.oaicompat_cmpl_id = completion_id;
// oaicompat_model is already populated by params_from_json_cmpl // oaicompat_model is already populated by params_from_json_cmpl
@ -3506,7 +3686,7 @@ int main(int argc, char ** argv) {
}, [&](const json & error_data) { }, [&](const json & error_data) {
server_sent_event(sink, "error", error_data); server_sent_event(sink, "error", error_data);
}); });
if (oaicompat) { if (oaicompat != OAICOMPAT_TYPE_NONE) {
static const std::string ev_done = "data: [DONE]\n\n"; static const std::string ev_done = "data: [DONE]\n\n";
sink.write(ev_done.data(), ev_done.size()); sink.write(ev_done.data(), ev_done.size());
} }
@ -3522,17 +3702,25 @@ int main(int argc, char ** argv) {
} }
}; };
const auto handle_completions = [&handle_completions_generic](const httplib::Request & req, httplib::Response & res) { const auto handle_completions = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
json data = json::parse(req.body); json data = json::parse(req.body);
return handle_completions_generic( return handle_completions_impl(
SERVER_TASK_TYPE_COMPLETION, SERVER_TASK_TYPE_COMPLETION,
data, data,
res, res,
/* oaicompat */ false, OAICOMPAT_TYPE_NONE);
/* oaicompat_chat */ false);
}; };
const auto handle_infill = [&ctx_server, &res_error, &handle_completions_generic](const httplib::Request & req, httplib::Response & res) { const auto handle_completions_oai = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
json data = oaicompat_completion_params_parse(json::parse(req.body));
return handle_completions_impl(
SERVER_TASK_TYPE_COMPLETION,
data,
res,
OAICOMPAT_TYPE_COMPLETION);
};
const auto handle_infill = [&ctx_server, &res_error, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
// check model compatibility // check model compatibility
std::string err; std::string err;
if (llama_token_fim_pre(ctx_server.model) == LLAMA_TOKEN_NULL) { if (llama_token_fim_pre(ctx_server.model) == LLAMA_TOKEN_NULL) {
@ -3601,22 +3789,25 @@ int main(int argc, char ** argv) {
tokenized_prompts[0] tokenized_prompts[0]
); );
return handle_completions_generic(SERVER_TASK_TYPE_INFILL, data, res); return handle_completions_impl(
SERVER_TASK_TYPE_INFILL,
data,
res,
OAICOMPAT_TYPE_NONE); // infill is not OAI compatible
}; };
const auto handle_chat_completions = [&ctx_server, &params, &res_error, &handle_completions_generic](const httplib::Request & req, httplib::Response & res) { const auto handle_chat_completions = [&ctx_server, &params, &res_error, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
if (ctx_server.params_base.embedding) { if (ctx_server.params_base.embedding) {
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
return; return;
} }
json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template); json data = oaicompat_chat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template);
return handle_completions_generic( return handle_completions_impl(
SERVER_TASK_TYPE_COMPLETION, SERVER_TASK_TYPE_COMPLETION,
data, data,
res, res,
/* oaicompat */ true, OAICOMPAT_TYPE_CHAT);
/* oaicompat_chat */ true);
}; };
const auto handle_models = [&params, &ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) { const auto handle_models = [&params, &ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
@ -3624,7 +3815,7 @@ int main(int argc, char ** argv) {
{"object", "list"}, {"object", "list"},
{"data", { {"data", {
{ {
{"id", params.model_alias}, {"id", params.model_alias.empty() ? params.model : params.model_alias},
{"object", "model"}, {"object", "model"},
{"created", std::time(0)}, {"created", std::time(0)},
{"owned_by", "llamacpp"}, {"owned_by", "llamacpp"},
@ -3689,10 +3880,10 @@ int main(int argc, char ** argv) {
res_ok(res, data); res_ok(res, data);
}; };
const auto handle_embeddings_impl = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res, bool oaicompat) { const auto handle_embeddings_impl = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res, oaicompat_type oaicompat) {
const json body = json::parse(req.body); const json body = json::parse(req.body);
if (oaicompat && llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) { if (oaicompat != OAICOMPAT_TYPE_NONE && llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) {
res_error(res, format_error_response("Pooling type 'none' is not OAI compatible. Please use a different pooling type", ERROR_TYPE_INVALID_REQUEST)); res_error(res, format_error_response("Pooling type 'none' is not OAI compatible. Please use a different pooling type", ERROR_TYPE_INVALID_REQUEST));
return; return;
} }
@ -3702,13 +3893,24 @@ int main(int argc, char ** argv) {
if (body.count("input") != 0) { if (body.count("input") != 0) {
prompt = body.at("input"); prompt = body.at("input");
} else if (body.contains("content")) { } else if (body.contains("content")) {
oaicompat = false; oaicompat = OAICOMPAT_TYPE_NONE; // "content" field is not OAI compatible
prompt = body.at("content"); prompt = body.at("content");
} else { } else {
res_error(res, format_error_response("\"input\" or \"content\" must be provided", ERROR_TYPE_INVALID_REQUEST)); res_error(res, format_error_response("\"input\" or \"content\" must be provided", ERROR_TYPE_INVALID_REQUEST));
return; return;
} }
bool use_base64 = false;
if (body.count("encoding_format") != 0) {
const std::string& format = body.at("encoding_format");
if (format == "base64") {
use_base64 = true;
} else if (format != "float") {
res_error(res, format_error_response("The format to return the embeddings in. Can be either float or base64", ERROR_TYPE_INVALID_REQUEST));
return;
}
}
std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, true, true); std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, true, true);
for (const auto & tokens : tokenized_prompts) { for (const auto & tokens : tokenized_prompts) {
// this check is necessary for models that do not add BOS token to the input // this check is necessary for models that do not add BOS token to the input
@ -3760,16 +3962,18 @@ int main(int argc, char ** argv) {
} }
// write JSON response // write JSON response
json root = oaicompat ? format_embeddings_response_oaicompat(body, responses) : json(responses); json root = oaicompat == OAICOMPAT_TYPE_EMBEDDING
? format_embeddings_response_oaicompat(body, responses, use_base64)
: json(responses);
res_ok(res, root); res_ok(res, root);
}; };
const auto handle_embeddings = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) { const auto handle_embeddings = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) {
handle_embeddings_impl(req, res, false); handle_embeddings_impl(req, res, OAICOMPAT_TYPE_NONE);
}; };
const auto handle_embeddings_oai = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) { const auto handle_embeddings_oai = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) {
handle_embeddings_impl(req, res, true); handle_embeddings_impl(req, res, OAICOMPAT_TYPE_EMBEDDING);
}; };
const auto handle_rerank = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) { const auto handle_rerank = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
@ -3939,7 +4143,7 @@ int main(int argc, char ** argv) {
svr->Get ("/v1/models", handle_models); // public endpoint (no API key check) svr->Get ("/v1/models", handle_models); // public endpoint (no API key check)
svr->Post("/completion", handle_completions); // legacy svr->Post("/completion", handle_completions); // legacy
svr->Post("/completions", handle_completions); svr->Post("/completions", handle_completions);
svr->Post("/v1/completions", handle_completions); svr->Post("/v1/completions", handle_completions_oai);
svr->Post("/chat/completions", handle_chat_completions); svr->Post("/chat/completions", handle_chat_completions);
svr->Post("/v1/chat/completions", handle_chat_completions); svr->Post("/v1/chat/completions", handle_chat_completions);
svr->Post("/infill", handle_infill); svr->Post("/infill", handle_infill);


@ -31,6 +31,7 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte
}) })
assert res.status_code == 200 assert res.status_code == 200
assert "cmpl" in res.body["id"] # make sure the completion id has the expected format assert "cmpl" in res.body["id"] # make sure the completion id has the expected format
assert res.body["system_fingerprint"].startswith("b")
assert res.body["model"] == model if model is not None else server.model_alias assert res.body["model"] == model if model is not None else server.model_alias
assert res.body["usage"]["prompt_tokens"] == n_prompt assert res.body["usage"]["prompt_tokens"] == n_prompt
assert res.body["usage"]["completion_tokens"] == n_predicted assert res.body["usage"]["completion_tokens"] == n_predicted
@ -63,6 +64,7 @@ def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_conte
last_cmpl_id = None last_cmpl_id = None
for data in res: for data in res:
choice = data["choices"][0] choice = data["choices"][0]
assert data["system_fingerprint"].startswith("b")
assert "gpt-3.5" in data["model"] # DEFAULT_OAICOMPAT_MODEL, maybe changed in the future assert "gpt-3.5" in data["model"] # DEFAULT_OAICOMPAT_MODEL, maybe changed in the future
if last_cmpl_id is None: if last_cmpl_id is None:
last_cmpl_id = data["id"] last_cmpl_id = data["id"]
@ -81,7 +83,7 @@ def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_conte
def test_chat_completion_with_openai_library(): def test_chat_completion_with_openai_library():
global server global server
server.start() server.start()
client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}") client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
res = client.chat.completions.create( res = client.chat.completions.create(
model="gpt-3.5-turbo-instruct", model="gpt-3.5-turbo-instruct",
messages=[ messages=[
@ -92,7 +94,7 @@ def test_chat_completion_with_openai_library():
seed=42, seed=42,
temperature=0.8, temperature=0.8,
) )
print(res) assert res.system_fingerprint is not None and res.system_fingerprint.startswith("b")
assert res.choices[0].finish_reason == "length" assert res.choices[0].finish_reason == "length"
assert res.choices[0].message.content is not None assert res.choices[0].message.content is not None
assert match_regex("(Suddenly)+", res.choices[0].message.content) assert match_regex("(Suddenly)+", res.choices[0].message.content)
@ -163,3 +165,64 @@ def test_chat_completion_with_timings_per_token():
assert "predicted_per_second" in data["timings"] assert "predicted_per_second" in data["timings"]
assert "predicted_n" in data["timings"] assert "predicted_n" in data["timings"]
assert data["timings"]["predicted_n"] <= 10 assert data["timings"]["predicted_n"] <= 10
def test_logprobs():
global server
server.start()
client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
res = client.chat.completions.create(
model="gpt-3.5-turbo-instruct",
temperature=0.0,
messages=[
{"role": "system", "content": "Book"},
{"role": "user", "content": "What is the best book"},
],
max_tokens=5,
logprobs=True,
top_logprobs=10,
)
output_text = res.choices[0].message.content
aggregated_text = ''
assert res.choices[0].logprobs is not None
assert res.choices[0].logprobs.content is not None
for token in res.choices[0].logprobs.content:
aggregated_text += token.token
assert token.logprob <= 0.0
assert token.bytes is not None
assert len(token.top_logprobs) > 0
assert aggregated_text == output_text
def test_logprobs_stream():
global server
server.start()
client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
res = client.chat.completions.create(
model="gpt-3.5-turbo-instruct",
temperature=0.0,
messages=[
{"role": "system", "content": "Book"},
{"role": "user", "content": "What is the best book"},
],
max_tokens=5,
logprobs=True,
top_logprobs=10,
stream=True,
)
output_text = ''
aggregated_text = ''
for data in res:
choice = data.choices[0]
if choice.finish_reason is None:
if choice.delta.content:
output_text += choice.delta.content
assert choice.logprobs is not None
assert choice.logprobs.content is not None
for token in choice.logprobs.content:
aggregated_text += token.token
assert token.logprob <= 0.0
assert token.bytes is not None
assert token.top_logprobs is not None
assert len(token.top_logprobs) > 0
assert aggregated_text == output_text


@ -1,5 +1,6 @@
import pytest import pytest
import time import time
from openai import OpenAI
from utils import * from utils import *
server = ServerPreset.tinyllama2() server = ServerPreset.tinyllama2()
@ -85,6 +86,40 @@ def test_completion_stream_vs_non_stream():
assert content_stream == res_non_stream.body["content"] assert content_stream == res_non_stream.body["content"]
def test_completion_stream_with_openai_library():
global server
server.start()
client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
res = client.completions.create(
model="davinci-002",
prompt="I believe the meaning of life is",
max_tokens=8,
)
assert res.system_fingerprint is not None and res.system_fingerprint.startswith("b")
assert res.choices[0].finish_reason == "length"
assert res.choices[0].text is not None
assert match_regex("(going|bed)+", res.choices[0].text)
def test_completion_with_openai_library():
global server
server.start()
client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
res = client.completions.create(
model="davinci-002",
prompt="I believe the meaning of life is",
max_tokens=8,
stream=True,
)
output_text = ''
for data in res:
choice = data.choices[0]
if choice.finish_reason is None:
assert choice.text is not None
output_text += choice.text
assert match_regex("(going|bed)+", output_text)
@pytest.mark.parametrize("n_slots", [1, 2]) @pytest.mark.parametrize("n_slots", [1, 2])
def test_consistent_result_same_seed(n_slots: int): def test_consistent_result_same_seed(n_slots: int):
global server global server
@ -95,7 +130,7 @@ def test_consistent_result_same_seed(n_slots: int):
res = server.make_request("POST", "/completion", data={ res = server.make_request("POST", "/completion", data={
"prompt": "I believe the meaning of life is", "prompt": "I believe the meaning of life is",
"seed": 42, "seed": 42,
"temperature": 1.0, "temperature": 0.0,
"cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed "cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed
}) })
if last_res is not None: if last_res is not None:
@ -120,9 +155,10 @@ def test_different_result_different_seed(n_slots: int):
assert res.body["content"] != last_res.body["content"] assert res.body["content"] != last_res.body["content"]
last_res = res last_res = res
# TODO: figure out why this does not work with temperature = 1
# @pytest.mark.parametrize("temperature", [0.0, 1.0])
@pytest.mark.parametrize("n_batch", [16, 32]) @pytest.mark.parametrize("n_batch", [16, 32])
@pytest.mark.parametrize("temperature", [0.0, 1.0]) @pytest.mark.parametrize("temperature", [0.0])
def test_consistent_result_different_batch_size(n_batch: int, temperature: float): def test_consistent_result_different_batch_size(n_batch: int, temperature: float):
global server global server
server.n_batch = n_batch server.n_batch = n_batch
@ -257,6 +293,40 @@ def test_completion_parallel_slots(n_slots: int, n_requests: int):
# assert match_regex(re_content, res.body["content"]) # assert match_regex(re_content, res.body["content"])
@pytest.mark.parametrize(
"prompt,n_predict,response_fields",
[
("I believe the meaning of life is", 8, []),
("I believe the meaning of life is", 32, ["content", "generation_settings/n_predict", "prompt"]),
],
)
def test_completion_response_fields(
prompt: str, n_predict: int, response_fields: list[str]
):
global server
server.start()
res = server.make_request(
"POST",
"/completion",
data={
"n_predict": n_predict,
"prompt": prompt,
"response_fields": response_fields,
},
)
assert res.status_code == 200
assert "content" in res.body
assert len(res.body["content"])
if len(response_fields):
assert res.body["generation_settings/n_predict"] == n_predict
assert res.body["prompt"] == "<s> " + prompt
assert isinstance(res.body["content"], str)
assert len(res.body) == len(response_fields)
else:
assert len(res.body)
assert "generation_settings" in res.body
def test_n_probs(): def test_n_probs():
global server global server
server.start() server.start()
@ -270,9 +340,68 @@ def test_n_probs():
assert "completion_probabilities" in res.body assert "completion_probabilities" in res.body
assert len(res.body["completion_probabilities"]) == 5 assert len(res.body["completion_probabilities"]) == 5
for tok in res.body["completion_probabilities"]: for tok in res.body["completion_probabilities"]:
assert "probs" in tok assert "id" in tok and tok["id"] > 0
assert len(tok["probs"]) == 10 assert "token" in tok and type(tok["token"]) == str
for prob in tok["probs"]: assert "logprob" in tok and tok["logprob"] <= 0.0
assert "prob" in prob assert "bytes" in tok and type(tok["bytes"]) == list
assert "tok_str" in prob assert len(tok["top_logprobs"]) == 10
assert 0.0 <= prob["prob"] <= 1.0 for prob in tok["top_logprobs"]:
assert "id" in prob and prob["id"] > 0
assert "token" in prob and type(prob["token"]) == str
assert "logprob" in prob and prob["logprob"] <= 0.0
assert "bytes" in prob and type(prob["bytes"]) == list
def test_n_probs_stream():
global server
server.start()
res = server.make_stream_request("POST", "/completion", data={
"prompt": "I believe the meaning of life is",
"n_probs": 10,
"temperature": 0.0,
"n_predict": 5,
"stream": True,
})
for data in res:
if data["stop"] == False:
assert "completion_probabilities" in data
assert len(data["completion_probabilities"]) == 1
for tok in data["completion_probabilities"]:
assert "id" in tok and tok["id"] > 0
assert "token" in tok and type(tok["token"]) == str
assert "logprob" in tok and tok["logprob"] <= 0.0
assert "bytes" in tok and type(tok["bytes"]) == list
assert len(tok["top_logprobs"]) == 10
for prob in tok["top_logprobs"]:
assert "id" in prob and prob["id"] > 0
assert "token" in prob and type(prob["token"]) == str
assert "logprob" in prob and prob["logprob"] <= 0.0
assert "bytes" in prob and type(prob["bytes"]) == list
def test_n_probs_post_sampling():
global server
server.start()
res = server.make_request("POST", "/completion", data={
"prompt": "I believe the meaning of life is",
"n_probs": 10,
"temperature": 0.0,
"n_predict": 5,
"post_sampling_probs": True,
})
assert res.status_code == 200
assert "completion_probabilities" in res.body
assert len(res.body["completion_probabilities"]) == 5
for tok in res.body["completion_probabilities"]:
assert "id" in tok and tok["id"] > 0
assert "token" in tok and type(tok["token"]) == str
assert "prob" in tok and 0.0 < tok["prob"] <= 1.0
assert "bytes" in tok and type(tok["bytes"]) == list
assert len(tok["top_probs"]) == 10
for prob in tok["top_probs"]:
assert "id" in prob and prob["id"] > 0
assert "token" in prob and type(prob["token"]) == str
assert "prob" in prob and 0.0 <= prob["prob"] <= 1.0
assert "bytes" in prob and type(prob["bytes"]) == list
# because the test model usually outputs tokens with either 100% or 0% probability, we need to check all the top_probs
assert any(prob["prob"] == 1.0 for prob in tok["top_probs"])


@ -1,3 +1,5 @@
import base64
import struct
import pytest import pytest
from openai import OpenAI from openai import OpenAI
from utils import * from utils import *
@ -50,6 +52,8 @@ def test_embedding_multiple():
@pytest.mark.parametrize( @pytest.mark.parametrize(
"input,is_multi_prompt", "input,is_multi_prompt",
[ [
# do not crash on empty input
("", False),
# single prompt # single prompt
("string", False), ("string", False),
([12, 34, 56], False), ([12, 34, 56], False),
@ -103,6 +107,7 @@ def test_embedding_pooling_none_oai():
# /v1/embeddings does not support pooling type 'none' # /v1/embeddings does not support pooling type 'none'
assert res.status_code == 400 assert res.status_code == 400
assert "error" in res.body
def test_embedding_openai_library_single(): def test_embedding_openai_library_single():
@ -191,3 +196,42 @@ def test_embedding_usage_multiple():
assert res.status_code == 200 assert res.status_code == 200
assert res.body['usage']['prompt_tokens'] == res.body['usage']['total_tokens'] assert res.body['usage']['prompt_tokens'] == res.body['usage']['total_tokens']
assert res.body['usage']['prompt_tokens'] == 2 * 9 assert res.body['usage']['prompt_tokens'] == 2 * 9
def test_embedding_openai_library_base64():
server.start()
test_input = "Test base64 embedding output"
# get embedding in default format
res = server.make_request("POST", "/v1/embeddings", data={
"input": test_input
})
assert res.status_code == 200
vec0 = res.body["data"][0]["embedding"]
# get embedding in base64 format
res = server.make_request("POST", "/v1/embeddings", data={
"input": test_input,
"encoding_format": "base64"
})
assert res.status_code == 200
assert "data" in res.body
assert len(res.body["data"]) == 1
embedding_data = res.body["data"][0]
assert "embedding" in embedding_data
assert isinstance(embedding_data["embedding"], str)
# Verify embedding is valid base64
decoded = base64.b64decode(embedding_data["embedding"])
# Verify decoded data can be converted back to float array
float_count = len(decoded) // 4 # 4 bytes per float
floats = struct.unpack(f'{float_count}f', decoded)
assert len(floats) > 0
assert all(isinstance(x, float) for x in floats)
assert len(floats) == len(vec0)
# make sure the decoded data is the same as the original
for x, y in zip(floats, vec0):
assert abs(x - y) < EPSILON
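
(The same round trip outside the test harness, as a sketch assuming a llama-server started with --embeddings is reachable at http://localhost:8080; the stdlib array module stands in for numpy and the input string is illustrative.)

import array
import base64
import json
import urllib.request

payload = {"input": "Test base64 embedding output", "encoding_format": "base64"}
req = urllib.request.Request(
    "http://localhost:8080/v1/embeddings",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    body = json.load(resp)

raw = base64.b64decode(body["data"][0]["embedding"])  # packed native float32 values
vec = array.array("f", raw)
print(len(vec), list(vec[:4]))
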


@ -3,6 +3,7 @@
#include "common.h" #include "common.h"
#include "log.h" #include "log.h"
#include "llama.h" #include "llama.h"
#include "common/base64.hpp"
#ifndef NDEBUG #ifndef NDEBUG
// crash the server in debug mode, otherwise send an http 500 error // crash the server in debug mode, otherwise send an http 500 error
@ -56,6 +57,8 @@ static T json_value(const json & body, const std::string & key, const T & defaul
} }
} }
const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
// //
// tokenizer and input processing utils // tokenizer and input processing utils
// //
@ -88,6 +91,28 @@ static bool json_is_array_of_mixed_numbers_strings(const json & data) {
return false; return false;
} }
// get a value by a nested path, e.g. "key1/key2"
static json json_get_nested_values(const std::vector<std::string> & paths, const json & js) {
json result = json::object();
for (const std::string & path : paths) {
json current = js;
const auto keys = string_split<std::string>(path, /*separator*/ '/');
bool valid_path = true;
for (const std::string & k : keys) {
if (valid_path && current.is_object() && current.contains(k)) {
current = current[k];
} else {
valid_path = false;
}
}
if (valid_path) {
result[path] = current;
}
}
return result;
}
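
(In Python terms the helper behaves roughly like the sketch below, standalone and with made-up input; this is what the new "response_fields" request parameter uses to pick entries such as "generation_settings/n_predict" out of the full result.)

def json_get_nested_values(paths, js):
    # collect values addressed by "a/b/c"-style paths; silently skip missing ones
    result = {}
    for path in paths:
        current = js
        for key in path.split("/"):
            if isinstance(current, dict) and key in current:
                current = current[key]
            else:
                break
        else:
            result[path] = current
    return result

full = {"content": "hello", "generation_settings": {"n_predict": 32}, "prompt": "<s> hi"}
print(json_get_nested_values(["content", "generation_settings/n_predict", "missing/x"], full))
# {'content': 'hello', 'generation_settings/n_predict': 32}
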
/** /**
* this handles 2 cases: * this handles 2 cases:
* - only string, example: "string" * - only string, example: "string"
@ -171,6 +196,36 @@ static std::vector<llama_tokens> tokenize_input_prompts(llama_context * ctx, con
return result; return result;
} }
// return the length of the longest prefix of `text` that does not end in the middle of a multi-byte UTF-8 character
// if the last character is potentially cut in half, return the index just before the cut
// if validate_utf8(text) == text.size(), then no character is cut off at the end of the text
static size_t validate_utf8(const std::string& text) {
size_t len = text.size();
if (len == 0) return 0;
// Check the last few bytes to see if a multi-byte character is cut off
for (size_t i = 1; i <= 4 && i <= len; ++i) {
unsigned char c = text[len - i];
// Check for start of a multi-byte sequence from the end
if ((c & 0xE0) == 0xC0) {
// 2-byte character start: 110xxxxx
// Needs at least 2 bytes
if (i < 2) return len - i;
} else if ((c & 0xF0) == 0xE0) {
// 3-byte character start: 1110xxxx
// Needs at least 3 bytes
if (i < 3) return len - i;
} else if ((c & 0xF8) == 0xF0) {
// 4-byte character start: 11110xxx
// Needs at least 4 bytes
if (i < 4) return len - i;
}
}
// If no cut-off multi-byte character is found, return full length
return len;
}
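
(A rough Python mirror of this boundary check, handy for reasoning about the streaming path; the byte masks are the same UTF-8 lead-byte patterns as in the C++ above.)

def validate_utf8(data: bytes) -> int:
    # length of the longest prefix that is not cut inside a multi-byte character
    n = len(data)
    for i in range(1, min(4, n) + 1):
        c = data[n - i]
        if (c & 0xE0) == 0xC0 and i < 2:   # 110xxxxx starts a 2-byte character
            return n - i
        if (c & 0xF0) == 0xE0 and i < 3:   # 1110xxxx starts a 3-byte character
            return n - i
        if (c & 0xF8) == 0xF0 and i < 4:   # 11110xxx starts a 4-byte character
            return n - i
    return n

assert validate_utf8("héllo".encode()) == 6
assert validate_utf8("héllo".encode()[:2]) == 1   # the 'é' is cut in half
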
// //
// template utils // template utils
// //
@ -494,7 +549,46 @@ static bool server_sent_event(httplib::DataSink & sink, const char * event, cons
// OAI utils // OAI utils
// //
static json oaicompat_completion_params_parse( static json oaicompat_completion_params_parse(const json & body) {
json llama_params;
if (!body.contains("prompt")) {
throw std::runtime_error("\"prompt\" is required");
}
// Handle "stop" field
if (body.contains("stop") && body.at("stop").is_string()) {
llama_params["stop"] = json::array({body.at("stop").get<std::string>()});
} else {
llama_params["stop"] = json_value(body, "stop", json::array());
}
// Handle "n" field
int n_choices = json_value(body, "n", 1);
if (n_choices != 1) {
throw std::runtime_error("Only one completion choice is allowed");
}
// Params supported by OAI but unsupported by llama.cpp
static const std::vector<std::string> unsupported_params { "best_of", "echo", "suffix" };
for (const auto & param : unsupported_params) {
if (body.contains(param)) {
throw std::runtime_error("Unsupported param: " + param);
}
}
// Copy remaining properties to llama_params
for (const auto & item : body.items()) {
// Exception: if "n_predict" is present, we overwrite the value specified earlier by "max_tokens"
if (!llama_params.contains(item.key()) || item.key() == "n_predict") {
llama_params[item.key()] = item.value();
}
}
return llama_params;
}
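
(Reduced to its decision points: the parser requires a prompt, normalizes "stop" to a list, rejects multi-choice requests and OAI-only parameters, and forwards everything else untouched. A hedged Python paraphrase of those rules, not the server code itself:)

def parse_oai_completion_body(body: dict) -> dict:
    if "prompt" not in body:
        raise ValueError('"prompt" is required')
    params = {}
    stop = body.get("stop", [])
    params["stop"] = [stop] if isinstance(stop, str) else stop    # always a list
    if body.get("n", 1) != 1:
        raise ValueError("Only one completion choice is allowed")
    for p in ("best_of", "echo", "suffix"):                       # supported by OAI, not here
        if p in body:
            raise ValueError("Unsupported param: " + p)
    for k, v in body.items():                                     # copy the rest as-is
        if k not in params or k == "n_predict":
            params[k] = v
    return params

print(parse_oai_completion_body({"prompt": "hello", "stop": "\n", "max_tokens": 8}))
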
static json oaicompat_chat_completion_params_parse(
const struct llama_model * model, const struct llama_model * model,
const json & body, /* openai api json semantics */ const json & body, /* openai api json semantics */
const std::string & chat_template) { const std::string & chat_template) {
@ -559,16 +653,31 @@ static json oaicompat_completion_params_parse(
return llama_params; return llama_params;
} }
static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) { static json format_embeddings_response_oaicompat(const json & request, const json & embeddings, bool use_base64 = false) {
json data = json::array(); json data = json::array();
int32_t n_tokens = 0; int32_t n_tokens = 0;
int i = 0; int i = 0;
for (const auto & elem : embeddings) { for (const auto & elem : embeddings) {
data.push_back(json{ json embedding_obj;
if (use_base64) {
const auto& vec = json_value(elem, "embedding", json::array()).get<std::vector<float>>();
const char* data_ptr = reinterpret_cast<const char*>(vec.data());
size_t data_size = vec.size() * sizeof(float);
embedding_obj = {
{"embedding", base64::encode(data_ptr, data_size)},
{"index", i++},
{"object", "embedding"},
{"encoding_format", "base64"}
};
} else {
embedding_obj = {
{"embedding", json_value(elem, "embedding", json::array())}, {"embedding", json_value(elem, "embedding", json::array())},
{"index", i++}, {"index", i++},
{"object", "embedding"} {"object", "embedding"}
}); };
}
data.push_back(embedding_obj);
n_tokens += json_value(elem, "tokens_evaluated", 0); n_tokens += json_value(elem, "tokens_evaluated", 0);
} }
@ -671,3 +780,33 @@ static json format_logit_bias(const std::vector<llama_logit_bias> & logit_bias)
static std::string safe_json_to_str(json data) { static std::string safe_json_to_str(json data) {
return data.dump(-1, ' ', false, json::error_handler_t::replace); return data.dump(-1, ' ', false, json::error_handler_t::replace);
} }
static std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx) {
std::vector<llama_token_data> cur;
const auto * logits = llama_get_logits_ith(ctx, idx);
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
cur.resize(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
}
// sort tokens by logits
std::sort(cur.begin(), cur.end(), [](const llama_token_data & a, const llama_token_data & b) {
return a.logit > b.logit;
});
// apply softmax
float max_l = cur[0].logit;
float cum_sum = 0.0f;
for (size_t i = 0; i < cur.size(); ++i) {
float p = expf(cur[i].logit - max_l);
cur[i].p = p;
cum_sum += p;
}
for (size_t i = 0; i < cur.size(); ++i) {
cur[i].p /= cum_sum;
}
return cur;
}
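
(The probability computation above is an ordinary max-shifted softmax over the sorted logits; the same numerics in a few lines of standalone Python, with illustrative logit values.)

import math

def softmax(logits):
    m = max(logits)                              # subtract the max for numerical stability
    exps = [math.exp(x - m) for x in logits]
    s = sum(exps)
    return [e / s for e in exps]

probs = softmax([3.2, 1.1, 0.3, -2.0])
assert abs(sum(probs) - 1.0) < 1e-12
print([round(p, 4) for p in probs])
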


@ -13,7 +13,7 @@ import hljs from './highlight-config';
import daisyuiThemes from 'daisyui/src/theming/themes'; import daisyuiThemes from 'daisyui/src/theming/themes';
// ponyfill for missing ReadableStream asyncIterator on Safari // ponyfill for missing ReadableStream asyncIterator on Safari
import { asyncIterator } from "@sec-ant/readable-stream/ponyfill/asyncIterator"; import { asyncIterator } from '@sec-ant/readable-stream/ponyfill/asyncIterator';
const isDev = import.meta.env.MODE === 'development'; const isDev = import.meta.env.MODE === 'development';
@ -22,7 +22,22 @@ const isString = (x) => !!x.toLowerCase;
const isBoolean = (x) => x === true || x === false; const isBoolean = (x) => x === true || x === false;
const isNumeric = (n) => !isString(n) && !isNaN(n) && !isBoolean(n); const isNumeric = (n) => !isString(n) && !isNaN(n) && !isBoolean(n);
const escapeAttr = (str) => str.replace(/>/g, '&gt;').replace(/"/g, '&quot;'); const escapeAttr = (str) => str.replace(/>/g, '&gt;').replace(/"/g, '&quot;');
const copyStr = (str) => navigator.clipboard.writeText(str); const copyStr = (textToCopy) => {
// Navigator clipboard api needs a secure context (https)
if (navigator.clipboard && window.isSecureContext) {
navigator.clipboard.writeText(textToCopy);
} else {
// Use the 'out of viewport hidden text area' trick
const textArea = document.createElement('textarea');
textArea.value = textToCopy;
// Move textarea out of the viewport so it's not visible
textArea.style.position = 'absolute';
textArea.style.left = '-999999px';
document.body.prepend(textArea);
textArea.select();
document.execCommand('copy');
}
};
// constants // constants
const BASE_URL = isDev const BASE_URL = isDev
@ -130,9 +145,9 @@ const VueMarkdown = defineComponent(
}; };
window.copyStr = copyStr; window.copyStr = copyStr;
const content = computed(() => md.value.render(props.source)); const content = computed(() => md.value.render(props.source));
return () => h("div", { innerHTML: content.value }); return () => h('div', { innerHTML: content.value });
}, },
{ props: ["source"] } { props: ['source'] }
); );
// input field to be used by settings modal // input field to be used by settings modal


@ -234,6 +234,7 @@ function(ggml_add_backend_library backend)
# write the shared library to the output directory # write the shared library to the output directory
set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL) target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL)
add_dependencies(ggml ${backend})
else() else()
add_library(${backend} ${ARGN}) add_library(${backend} ${ARGN})
target_link_libraries(ggml PUBLIC ${backend}) target_link_libraries(ggml PUBLIC ${backend})


@ -66,6 +66,26 @@
#include "ggml-kompute.h" #include "ggml-kompute.h"
#endif #endif
// disable C++17 deprecation warning for std::codecvt_utf8
#if defined(__clang__)
# pragma clang diagnostic push
# pragma clang diagnostic ignored "-Wdeprecated-declarations"
#endif
static std::wstring utf8_to_utf16(const std::string & str) {
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
return converter.from_bytes(str);
}
static std::string utf16_to_utf8(const std::wstring & str) {
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
return converter.to_bytes(str);
}
#if defined(__clang__)
# pragma clang diagnostic pop
#endif
#ifdef _WIN32 #ifdef _WIN32
using dl_handle = std::remove_pointer_t<HMODULE>; using dl_handle = std::remove_pointer_t<HMODULE>;
@ -88,11 +108,6 @@ static dl_handle * dl_load_library(const std::wstring & path) {
return handle; return handle;
} }
static dl_handle * dl_load_library(const std::string & path) {
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
return dl_load_library(converter.from_bytes(path));
}
static void * dl_get_sym(dl_handle * handle, const char * name) { static void * dl_get_sym(dl_handle * handle, const char * name) {
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
@ -114,8 +129,8 @@ struct dl_handle_deleter {
} }
}; };
static void * dl_load_library(const std::string & path) { static void * dl_load_library(const std::wstring & path) {
dl_handle * handle = dlopen(path.c_str(), RTLD_NOW | RTLD_LOCAL); dl_handle * handle = dlopen(utf16_to_utf8(path).c_str(), RTLD_NOW | RTLD_LOCAL);
return handle; return handle;
} }
@ -202,11 +217,11 @@ struct ggml_backend_registry {
devices.push_back(device); devices.push_back(device);
} }
ggml_backend_reg_t load_backend(const char * path, bool silent) { ggml_backend_reg_t load_backend(const std::wstring & path, bool silent) {
dl_handle_ptr handle { dl_load_library(path) }; dl_handle_ptr handle { dl_load_library(path) };
if (!handle) { if (!handle) {
if (!silent) { if (!silent) {
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path); GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(path).c_str());
} }
return nullptr; return nullptr;
} }
@ -214,7 +229,7 @@ struct ggml_backend_registry {
auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score"); auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
if (score_fn && score_fn() == 0) { if (score_fn && score_fn() == 0) {
if (!silent) { if (!silent) {
GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path); GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, utf16_to_utf8(path).c_str());
} }
return nullptr; return nullptr;
} }
@ -222,7 +237,7 @@ struct ggml_backend_registry {
auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init"); auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init");
if (!backend_init_fn) { if (!backend_init_fn) {
if (!silent) { if (!silent) {
GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, path); GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, utf16_to_utf8(path).c_str());
} }
return nullptr; return nullptr;
} }
@ -231,16 +246,16 @@ struct ggml_backend_registry {
if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) { if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
if (!silent) { if (!silent) {
if (!reg) { if (!reg) {
GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path); GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, utf16_to_utf8(path).c_str());
} else { } else {
GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n", GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
__func__, path, reg->api_version, GGML_BACKEND_API_VERSION); __func__, utf16_to_utf8(path).c_str(), reg->api_version, GGML_BACKEND_API_VERSION);
} }
} }
return nullptr; return nullptr;
} }
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path); GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), utf16_to_utf8(path).c_str());
register_backend(reg, std::move(handle)); register_backend(reg, std::move(handle));
@ -376,14 +391,14 @@ ggml_backend_t ggml_backend_init_best(void) {
// Dynamic loading // Dynamic loading
ggml_backend_reg_t ggml_backend_load(const char * path) { ggml_backend_reg_t ggml_backend_load(const char * path) {
return get_reg().load_backend(path, false); return get_reg().load_backend(utf8_to_utf16(path), false);
} }
void ggml_backend_unload(ggml_backend_reg_t reg) { void ggml_backend_unload(ggml_backend_reg_t reg) {
get_reg().unload_backend(reg, true); get_reg().unload_backend(reg, true);
} }
static std::string get_executable_path() { static std::wstring get_executable_path() {
#if defined(__APPLE__) #if defined(__APPLE__)
// get executable path // get executable path
std::vector<char> path; std::vector<char> path;
@ -401,13 +416,17 @@ static std::string get_executable_path() {
if (last_slash != std::string::npos) { if (last_slash != std::string::npos) {
base_path = base_path.substr(0, last_slash); base_path = base_path.substr(0, last_slash);
} }
return base_path + "/"; return utf8_to_utf16(base_path + "/");
#elif defined(__linux__) #elif defined(__linux__) || defined(__FreeBSD__)
std::string base_path = "."; std::string base_path = ".";
std::vector<char> path(1024); std::vector<char> path(1024);
while (true) { while (true) {
// get executable path // get executable path
# if defined(__linux__)
ssize_t len = readlink("/proc/self/exe", path.data(), path.size()); ssize_t len = readlink("/proc/self/exe", path.data(), path.size());
# elif defined(__FreeBSD__)
ssize_t len = readlink("/proc/curproc/file", path.data(), path.size());
# endif
if (len == -1) { if (len == -1) {
break; break;
} }
@ -423,57 +442,63 @@ static std::string get_executable_path() {
path.resize(path.size() * 2); path.resize(path.size() * 2);
} }
return base_path + "/"; return utf8_to_utf16(base_path + "/");
#elif defined(_WIN32) #elif defined(_WIN32)
std::vector<char> path(MAX_PATH); std::vector<wchar_t> path(MAX_PATH);
DWORD len = GetModuleFileNameA(NULL, path.data(), path.size()); DWORD len = GetModuleFileNameW(NULL, path.data(), path.size());
if (len == 0) { if (len == 0) {
return ""; return {};
} }
std::string base_path(path.data(), len); std::wstring base_path(path.data(), len);
// remove executable name // remove executable name
auto last_slash = base_path.find_last_of('\\'); auto last_slash = base_path.find_last_of('\\');
if (last_slash != std::string::npos) { if (last_slash != std::string::npos) {
base_path = base_path.substr(0, last_slash); base_path = base_path.substr(0, last_slash);
} }
return base_path + "\\"; return base_path + L"\\";
#else
return {};
#endif #endif
} }
static std::string backend_filename_prefix() { static std::wstring backend_filename_prefix() {
#ifdef _WIN32 #ifdef _WIN32
return "ggml-"; return L"ggml-";
#else #else
return "libggml-"; return L"libggml-";
#endif #endif
} }
static std::string backend_filename_suffix() { static std::wstring backend_filename_suffix() {
#ifdef _WIN32 #ifdef _WIN32
return ".dll"; return L".dll";
#else #else
return ".so"; return L".so";
#endif
}
static std::wstring path_separator() {
#ifdef _WIN32
return L"\\";
#else
return L"/";
#endif #endif
} }
static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) { static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) {
// enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths // enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
// TODO: search system paths // TODO: search system paths
std::string file_prefix = backend_filename_prefix() + name + "-"; std::wstring file_prefix = backend_filename_prefix() + utf8_to_utf16(name) + L"-";
std::vector<std::string> search_paths; std::vector<std::wstring> search_paths;
if (user_search_path == nullptr) { if (user_search_path == nullptr) {
search_paths.push_back("./"); search_paths.push_back(L"." + path_separator());
search_paths.push_back(get_executable_path()); search_paths.push_back(get_executable_path());
} else { } else {
#if defined(_WIN32) search_paths.push_back(utf8_to_utf16(user_search_path) + path_separator());
search_paths.push_back(std::string(user_search_path) + "\\");
#else
search_paths.push_back(std::string(user_search_path) + "/");
#endif
} }
int best_score = 0; int best_score = 0;
std::string best_path; std::wstring best_path;
namespace fs = std::filesystem; namespace fs = std::filesystem;
for (const auto & search_path : search_paths) { for (const auto & search_path : search_paths) {
@ -483,27 +508,27 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied); fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
for (const auto & entry : dir_it) { for (const auto & entry : dir_it) {
if (entry.is_regular_file()) { if (entry.is_regular_file()) {
std::string filename = entry.path().filename().string(); std::wstring filename = entry.path().filename().wstring();
std::string ext = entry.path().extension().string(); std::wstring ext = entry.path().extension().wstring();
if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) { if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) {
dl_handle_ptr handle { dl_load_library(entry.path().c_str()) }; dl_handle_ptr handle { dl_load_library(entry.path().wstring()) };
if (!handle && !silent) { if (!handle && !silent) {
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, entry.path().string().c_str()); GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
} }
if (handle) { if (handle) {
auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score"); auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
if (score_fn) { if (score_fn) {
int s = score_fn(); int s = score_fn();
#ifndef NDEBUG #ifndef NDEBUG
GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, entry.path().string().c_str(), s); GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str(), s);
#endif #endif
if (s > best_score) { if (s > best_score) {
best_score = s; best_score = s;
best_path = entry.path().string(); best_path = entry.path().wstring();
} }
} else { } else {
if (!silent) { if (!silent) {
GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, entry.path().string().c_str()); GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
} }
} }
} }
@ -515,15 +540,15 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
if (best_score == 0) { if (best_score == 0) {
// try to load the base backend // try to load the base backend
for (const auto & search_path : search_paths) { for (const auto & search_path : search_paths) {
std::string path = search_path + backend_filename_prefix() + name + backend_filename_suffix(); std::wstring path = search_path + backend_filename_prefix() + utf8_to_utf16(name) + backend_filename_suffix();
if (fs::exists(path)) { if (fs::exists(path)) {
return get_reg().load_backend(path.c_str(), silent); return get_reg().load_backend(path, silent);
} }
} }
return nullptr; return nullptr;
} }
return get_reg().load_backend(best_path.c_str(), silent); return get_reg().load_backend(best_path, silent);
} }
void ggml_backend_load_all() { void ggml_backend_load_all() {


@ -82,39 +82,52 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
if (MSVC AND NOT CMAKE_C_COMPILER_ID STREQUAL "Clang") if (MSVC AND NOT CMAKE_C_COMPILER_ID STREQUAL "Clang")
message(FATAL_ERROR "MSVC is not supported for ARM, use clang") message(FATAL_ERROR "MSVC is not supported for ARM, use clang")
else() else()
check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E) check_cxx_compiler_flag(-mfp16-format=ieee GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E)
if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "") if (NOT "${GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
list(APPEND ARCH_FLAGS -mfp16-format=ieee) list(APPEND ARCH_FLAGS -mfp16-format=ieee)
endif() endif()
if (GGML_NATIVE) if (GGML_NATIVE)
list(APPEND ARCH_FLAGS -mcpu=native)
set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
# -mcpu=native does not always enable all the features in some compilers, # -mcpu=native does not always enable all the features in some compilers,
# so we check for them manually and enable them if available # so we check for them manually and enable them if available
execute_process(
COMMAND ${CMAKE_C_COMPILER} -mcpu=native -E -v -
INPUT_FILE "/dev/null"
OUTPUT_QUIET
ERROR_VARIABLE ARM_MCPU
RESULT_VARIABLE ARM_MCPU_RESULT
)
if (NOT ARM_MCPU_RESULT)
string(REGEX MATCH "-mcpu=[^ ']+" ARM_MCPU_FLAG "${ARM_MCPU}")
endif()
if ("${ARM_MCPU_FLAG}" STREQUAL "")
set(ARM_MCPU_FLAG -mcpu=native)
message(STATUS "ARM -mcpu not found, -mcpu=native will be used")
endif()
include(CheckCXXSourceRuns) include(CheckCXXSourceRuns)
set(CMAKE_REQUIRED_FLAGS "${ARCH_FLAGS}+dotprod") function(check_arm_feature tag code)
set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
set(CMAKE_REQUIRED_FLAGS "${ARM_MCPU_FLAG}+${tag}")
check_cxx_source_runs( check_cxx_source_runs(
"#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" "${code}"
GGML_COMPILER_SUPPORT_DOTPROD) GGML_MACHINE_SUPPORTS_${tag}
if (GGML_COMPILER_SUPPORT_DOTPROD) )
set(ARCH_FLAGS "${ARCH_FLAGS}+dotprod") if (GGML_MACHINE_SUPPORTS_${tag})
set(ARM_MCPU_FLAG_FIX "${ARM_MCPU_FLAG_FIX}+${tag}" PARENT_SCOPE)
else()
set(ARM_MCPU_FLAG_FIX "${ARM_MCPU_FLAG_FIX}+no${tag}" PARENT_SCOPE)
endif() endif()
set(CMAKE_REQUIRED_FLAGS "${ARCH_FLAGS}+i8mm")
check_cxx_source_runs(
"#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }"
GGML_COMPILER_SUPPORT_I8MM)
if (GGML_COMPILER_SUPPORT_I8MM)
set(ARCH_FLAGS "${ARCH_FLAGS}+i8mm")
endif()
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE}) set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
endfunction()
check_arm_feature(dotprod "#include <arm_neon.h>\nint main() { int8x16_t _a, _b; volatile int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }")
check_arm_feature(i8mm "#include <arm_neon.h>\nint main() { int8x16_t _a, _b; volatile int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }")
check_arm_feature(sve "#include <arm_sve.h>\nint main() { svfloat32_t _a, _b; volatile svfloat32_t _c = svadd_f32_z(svptrue_b8(), _a, _b); return 0; }")
list(APPEND ARCH_FLAGS "${ARM_MCPU_FLAG}${ARM_MCPU_FLAG_FIX}")
else() else()
if (GGML_CPU_ARM_ARCH) if (GGML_CPU_ARM_ARCH)
list(APPEND ARCH_FLAGS -march=${GGML_CPU_ARM_ARCH}) list(APPEND ARCH_FLAGS -march=${GGML_CPU_ARM_ARCH})
@ -122,14 +135,20 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
endif() endif()
# show enabled features # show enabled features
if (CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows")
set(FEAT_INPUT_FILE "NUL")
else()
set(FEAT_INPUT_FILE "/dev/null")
endif()
execute_process( execute_process(
COMMAND ${CMAKE_C_COMPILER} ${ARCH_FLAGS} -dM -E - COMMAND ${CMAKE_C_COMPILER} ${ARCH_FLAGS} -dM -E -
INPUT_FILE "/dev/null" INPUT_FILE ${FEAT_INPUT_FILE}
OUTPUT_VARIABLE ARM_FEATURE OUTPUT_VARIABLE ARM_FEATURE
RESULT_VARIABLE ARM_FEATURE_RESULT RESULT_VARIABLE ARM_FEATURE_RESULT
) )
if (ARM_FEATURE_RESULT) if (ARM_FEATURE_RESULT)
message(FATAL_ERROR "Failed to get ARM features") message(WARNING "Failed to get ARM features")
else() else()
foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC) foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC)
string(FIND "${ARM_FEATURE}" "__ARM_FEATURE_${feature} 1" feature_pos) string(FIND "${ARM_FEATURE}" "__ARM_FEATURE_${feature} 1" feature_pos)
@ -304,6 +323,11 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
target_compile_definitions(${GGML_CPU_NAME} PRIVATE ${ARCH_DEFINITIONS}) target_compile_definitions(${GGML_CPU_NAME} PRIVATE ${ARCH_DEFINITIONS})
if (GGML_BACKEND_DL) if (GGML_BACKEND_DL)
if (GGML_NATIVE)
# the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE
message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using GGML_CPU_ALL_VARIANTS")
endif()
# The feature detection code is compiled as a separate target so that # The feature detection code is compiled as a separate target so that
# it can be built without the architecture flags # it can be built without the architecture flags
# Since multiple variants of the CPU backend may be included in the same # Since multiple variants of the CPU backend may be included in the same

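The execute_process probe above asks the compiler which macros -mcpu/-march enable, and the foreach loop then greps the preprocessor output for specific __ARM_FEATURE_* defines. The same information is visible from a translation unit built with the final ARCH_FLAGS; a small sketch that reports exactly the macros the loop looks for (compile-time checks only, no architecture-specific code runs):

// Compile with the flags under test (e.g. -mcpu=native on AArch64) and run:
// prints the subset of the checked __ARM_FEATURE_* macros that are defined.
#include <cstdio>

int main() {
#ifdef __ARM_FEATURE_DOTPROD
    std::puts("__ARM_FEATURE_DOTPROD");
#endif
#ifdef __ARM_FEATURE_SVE
    std::puts("__ARM_FEATURE_SVE");
#endif
#ifdef __ARM_FEATURE_MATMUL_INT8
    std::puts("__ARM_FEATURE_MATMUL_INT8");
#endif
#ifdef __ARM_FEATURE_FMA
    std::puts("__ARM_FEATURE_FMA");
#endif
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    std::puts("__ARM_FEATURE_FP16_VECTOR_ARITHMETIC");
#endif
    return 0;
}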

@ -567,21 +567,21 @@ static void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD) #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
const block_q4_0x4 * b_ptr = (const block_q4_0x4 *)vx; const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx;
for (int c = 0; c < nc; c += ncols_interleaved) { for (int c = 0; c < nc; c += ncols_interleaved) {
const block_q8_0 * a_ptr = (const block_q8_0 *)vy; const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
float32x4_t acc = vdupq_n_f32(0); float32x4_t acc = vdupq_n_f32(0);
for (int b = 0; b < nb; b++) { for (int b = 0; b < nb; b++) {
int8x16_t b0 = vld1q_s8((const int8_t *)b_ptr->qs); int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs);
int8x16_t b1 = vld1q_s8((const int8_t *)b_ptr->qs + 16); int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16);
int8x16_t b2 = vld1q_s8((const int8_t *)b_ptr->qs + 32); int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32);
int8x16_t b3 = vld1q_s8((const int8_t *)b_ptr->qs + 48); int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48);
float16x4_t bd = vld1_f16((const __fp16 *)b_ptr->d); float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
int8x16_t a0 = vld1q_s8(a_ptr->qs); int8x16_t a0 = vld1q_s8(a_ptr->qs);
int8x16_t a1 = vld1q_s8(a_ptr->qs + qk/2); int8x16_t a1 = vld1q_s8(a_ptr->qs + qk/2);
float16x4_t ad = vld1_dup_f16((const __fp16 *)&a_ptr->d); float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
int32x4_t ret = vdupq_n_s32(0); int32x4_t ret = vdupq_n_s32(0);
@ -650,72 +650,52 @@ static void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c
UNUSED(ncols_interleaved); UNUSED(ncols_interleaved);
UNUSED(blocklen); UNUSED(blocklen);
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) { if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
const void * b_ptr = vx; const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx;
const void * a_ptr = vy;
float * res_ptr = s;
__asm__ __volatile__( for (int c = 0; c < nc; c += ncols_interleaved) {
"movi v2.16b, #0x4\n" const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
"movi v1.16b, #0xf0\n" float32x4_t acc = vdupq_n_f32(0);
"add %x[b_ptr], %x[b_ptr], #0x8\n" for (int b = 0; b < nb; b++) {
"1:" // Column loop int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs);
"add x23, %x[a_ptr], #0x2\n" int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16);
"movi v0.16b, #0x0\n" int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32);
"mov x22, %x[nb]\n" int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48);
"2:" // Block loop float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
"ldr q31, [%x[b_ptr], #0x0]\n"
"ldr q30, [%x[b_ptr], #0x10]\n" int8x16_t a0 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs);
"mov x21, x23\n" int8x16_t a1 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 1);
"movi v29.4s, #0x0\n" int8x16_t a2 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 2);
"ldr q28, [%x[b_ptr], #0x20]\n" int8x16_t a3 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 3);
"ldr q27, [%x[b_ptr], #0x30]\n" float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
"movi v26.4s, #0x0\n"
"sub x20, x23, #0x2\n" int32x4_t ret0 = vdupq_n_s32(0);
"ld1r { v25.8h }, [x20]\n" int32x4_t ret1 = vdupq_n_s32(0);
"ldr q24, [%x[b_ptr], #-0x8]\n"
"sub x22, x22, #0x1\n" ret0 = vdotq_s32(ret0, b0 << 4, a0);
"add x23, x23, #0x22\n" ret1 = vdotq_s32(ret1, b1 << 4, a0);
"ld1r { v23.2d }, [x21], #0x8\n" ret0 = vdotq_s32(ret0, b2 << 4, a1);
"sshl v22.16b, v31.16b, v2.16b\n" ret1 = vdotq_s32(ret1, b3 << 4, a1);
"sshl v16.16b, v30.16b, v2.16b\n"
"add %x[b_ptr], %x[b_ptr], #0x48\n" ret0 = vdotq_s32(ret0, b0 & 0xf0U, a2);
"ld1r { v21.2d }, [x21], #0x8\n" ret1 = vdotq_s32(ret1, b1 & 0xf0U, a2);
"sshl v20.16b, v28.16b, v2.16b\n" ret0 = vdotq_s32(ret0, b2 & 0xf0U, a3);
"sshl v19.16b, v27.16b, v2.16b\n" ret1 = vdotq_s32(ret1, b3 & 0xf0U, a3);
"ld1r { v18.2d }, [x21], #0x8\n"
"ld1r { v17.2d }, [x21], #0x8\n" int32x4_t ret = vpaddq_s32(ret0, ret1);
"and v31.16b, v31.16b, v1.16b\n"
"and v30.16b, v30.16b, v1.16b\n" acc = vfmaq_f32(acc, vcvtq_n_f32_s32(ret, 4),
".inst 0x4e9796dd // sdot v29.4s, v22.16b, v23.16b\n" vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd)));
".inst 0x4e97961a // sdot v26.4s, v16.16b, v23.16b\n" a_ptr++;
"and v28.16b, v28.16b, v1.16b\n" b_ptr++;
"and v27.16b, v27.16b, v1.16b\n" }
"fcvtl v25.4s, v25.4h\n" vst1q_f32(s, acc);
"fcvtl v16.4s, v24.4h\n" s += ncols_interleaved;
".inst 0x4e95969d // sdot v29.4s, v20.16b, v21.16b\n" }
".inst 0x4e95967a // sdot v26.4s, v19.16b, v21.16b\n"
"fmul v16.4s, v16.4s, v25.4s\n"
".inst 0x4e9297fd // sdot v29.4s, v31.16b, v18.16b\n"
".inst 0x4e9297da // sdot v26.4s, v30.16b, v18.16b\n"
".inst 0x4e91979d // sdot v29.4s, v28.16b, v17.16b\n"
".inst 0x4e91977a // sdot v26.4s, v27.16b, v17.16b\n"
"addp v29.4s, v29.4s, v26.4s\n"
"scvtf v29.4s, v29.4s, #0x4\n"
"fmla v0.4s, v29.4s, v16.4s\n"
"cbnz x22, 2b\n"
"sub %x[nc], %x[nc], #0x4\n"
"str q0, [%x[res_ptr], #0x0]\n"
"add %x[res_ptr], %x[res_ptr], #0x10\n"
"cbnz %x[nc], 1b\n"
: [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc)
: [a_ptr] "r" (a_ptr), [nb] "r" (nb)
: "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23"
);
return; return;
} }
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
float sumf[4]; float sumf[4];
int sumi; int sumi;

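The rewritten kernel above replaces hand-written sdot assembly with vdotq_s32, which adds one 4-element signed 8-bit dot product to each 32-bit lane of the accumulator. A scalar reference of that operation as assumed here (just the instruction's behaviour, not the full q4_0 GEMV):

// Scalar model of vdotq_s32(acc, a, b): for each of the four output lanes,
// accumulate the dot product of the corresponding four int8 pairs.
#include <array>
#include <cstdint>

static std::array<int32_t, 4> sdot_ref(std::array<int32_t, 4> acc,
                                       const std::array<int8_t, 16> & a,
                                       const std::array<int8_t, 16> & b) {
    for (int lane = 0; lane < 4; ++lane) {
        int32_t sum = 0;
        for (int k = 0; k < 4; ++k) {
            sum += int32_t(a[4*lane + k]) * int32_t(b[4*lane + k]);
        }
        acc[lane] += sum;
    }
    return acc;
}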

@ -986,7 +986,7 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
#define GGML_F16_STEP 32 #define GGML_F16_STEP 32
#define GGML_F16_EPR 4 #define GGML_F16_EPR 4
static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) { static inline __m128 __sse_f16x4_load(const ggml_fp16_t * x) {
float tmp[4]; float tmp[4];
tmp[0] = GGML_FP16_TO_FP32(x[0]); tmp[0] = GGML_FP16_TO_FP32(x[0]);
@ -997,7 +997,7 @@ static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) {
return _mm_loadu_ps(tmp); return _mm_loadu_ps(tmp);
} }
static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) { static inline void __sse_f16x4_store(ggml_fp16_t * x, __m128 y) {
float arr[4]; float arr[4];
_mm_storeu_ps(arr, y); _mm_storeu_ps(arr, y);
@ -7419,14 +7419,14 @@ static void ggml_compute_forward_mul_mat(
if (src1_cont) { if (src1_cont) {
for (int64_t i13 = 0; i13 < ne13; i13++) for (int64_t i13 = 0; i13 < ne13; i13++)
for (int64_t i12 = 0; i12 < ne12; i12++) for (int64_t i12 = 0; i12 < ne12; i12++)
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type), if (!llamafile_sgemm(params,
ne01, ne11, ne00/ggml_blck_size(src0->type),
(const char *)src0->data + i12/r2*nb02 + i13/r3*nb03, (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
nb01/ggml_type_size(src0->type), nb01/ggml_type_size(src0->type),
(const char *)src1->data + i12*nb12 + i13*nb13, (const char *)src1->data + i12*nb12 + i13*nb13,
nb11/ggml_type_size(src1->type), nb11/ggml_type_size(src1->type),
(char *)dst->data + i12*nb2 + i13*nb3, (char *)dst->data + i12*nb2 + i13*nb3,
nb1/ggml_type_size(dst->type), nb1/ggml_type_size(dst->type),
ith, nth,
src0->type, src0->type,
src1->type, src1->type,
dst->type)) dst->type))
@ -7471,14 +7471,14 @@ UseGgmlGemm1:;
for (int64_t i13 = 0; i13 < ne13; i13++) for (int64_t i13 = 0; i13 < ne13; i13++)
for (int64_t i12 = 0; i12 < ne12; i12++) for (int64_t i12 = 0; i12 < ne12; i12++)
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type), if (!llamafile_sgemm(params,
ne01, ne11, ne00/ggml_blck_size(src0->type),
(const char *)src0->data + i12/r2*nb02 + i13/r3*nb03, (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
nb01/ggml_type_size(src0->type), nb01/ggml_type_size(src0->type),
(const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size, (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
row_size/ggml_type_size(vec_dot_type), row_size/ggml_type_size(vec_dot_type),
(char *)dst->data + i12*nb2 + i13*nb3, (char *)dst->data + i12*nb2 + i13*nb3,
nb1/ggml_type_size(dst->type), nb1/ggml_type_size(dst->type),
ith, nth,
src0->type, src0->type,
vec_dot_type, vec_dot_type,
dst->type)) dst->type))

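Both call sites now hand llamafile_sgemm the whole ggml_compute_params object instead of separate ith/nth arguments; the fields used further down are ith, nth and threadpool. As a reminder of what those two indices usually drive, here is a minimal row-partitioning sketch (the struct is a stand-in for illustration, not the real ggml_compute_params):

// Illustrative (ith, nth) work split: each thread takes a contiguous block of
// rows. The struct below only mirrors the two fields used by the sketch.
#include <algorithm>
#include <cstdint>
#include <vector>

struct compute_params_sketch {
    int ith; // index of this thread
    int nth; // total number of threads
};

static void scale_rows(const compute_params_sketch & p, std::vector<float> & m,
                       int64_t nrows, int64_t ncols, float factor) {
    const int64_t per_thread = (nrows + p.nth - 1) / p.nth;
    const int64_t row0 = int64_t(p.ith) * per_thread;
    const int64_t row1 = std::min<int64_t>(nrows, row0 + per_thread);
    for (int64_t r = row0; r < row1; ++r) {
        for (int64_t c = 0; c < ncols; ++c) {
            m[r*ncols + c] *= factor;
        }
    }
}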

@ -53,6 +53,8 @@
#include "ggml-cpu-impl.h" #include "ggml-cpu-impl.h"
#include "ggml-quants.h" #include "ggml-quants.h"
#include <atomic>
#ifdef _MSC_VER #ifdef _MSC_VER
#define NOINLINE __declspec(noinline) #define NOINLINE __declspec(noinline)
#else #else
@ -134,6 +136,16 @@ inline __m512 madd(__m512 a, __m512 b, __m512 c) {
return _mm512_fmadd_ps(a, b, c); return _mm512_fmadd_ps(a, b, c);
} }
#endif #endif
#if defined(__AVX512BF16__)
template <>
inline __m512 madd(__m512bh a, __m512bh b, __m512 c) {
return _mm512_dpbf16_ps(c, a, b);
}
template <>
inline __m256 madd(__m256bh a, __m256bh b, __m256 c) {
return _mm256_dpbf16_ps(c, a, b);
}
#endif
#endif #endif
#if defined(__ARM_FEATURE_FMA) #if defined(__ARM_FEATURE_FMA)
@ -226,6 +238,13 @@ template <> inline __m256 load(const float *p) {
} }
#endif // __AVX__ #endif // __AVX__
#if defined(__AVX2__) || defined(__AVX512F__)
template <> inline __m256 load(const ggml_bf16_t *p) {
return _mm256_castsi256_ps(
_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)p)), 16));
}
#endif // __AVX2__
#if defined(__F16C__) #if defined(__F16C__)
template <> inline __m256 load(const ggml_fp16_t *p) { template <> inline __m256 load(const ggml_fp16_t *p) {
return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)p)); return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)p));
@ -239,8 +258,27 @@ template <> inline __m512 load(const float *p) {
template <> inline __m512 load(const ggml_fp16_t *p) { template <> inline __m512 load(const ggml_fp16_t *p) {
return _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)p)); return _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)p));
} }
template <> inline __m512 load(const ggml_bf16_t *p) {
return _mm512_castsi512_ps(
_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)p)), 16));
}
#endif // __AVX512F__ #endif // __AVX512F__
#if defined(__AVX512BF16__)
template <> inline __m512bh load(const ggml_bf16_t *p) {
return (__m512bh)_mm512_loadu_ps((const float *)p);
}
template <> inline __m256bh load(const ggml_bf16_t *p) {
return (__m256bh)_mm256_loadu_ps((const float *)p);
}
template <> inline __m512bh load(const float *p) {
return _mm512_cvtne2ps_pbh(_mm512_loadu_ps(p + 16), _mm512_loadu_ps(p));
}
template <> inline __m256bh load(const float *p) {
return _mm512_cvtneps_pbh(_mm512_loadu_ps(p));
}
#endif
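The AVX2/AVX-512 loads above widen bf16 to f32 by shifting the stored 16 bits into the top half of the float's bit pattern, and the __AVX512BF16__ path narrows f32 pairs with _mm512_cvtne2ps_pbh. A scalar sketch of the widening step and a truncating inverse (the real narrowing intrinsic rounds to nearest-even rather than truncating):

// bf16 is the top 16 bits of an IEEE-754 binary32 value.
#include <cstdint>
#include <cstring>

static float bf16_to_f32(uint16_t h) {
    const uint32_t bits = uint32_t(h) << 16; // park the payload in the high half
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

static uint16_t f32_to_bf16_trunc(float f) {
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    return uint16_t(bits >> 16); // truncation; the hardware converts with round-to-nearest-even
}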
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
// CONSTANTS // CONSTANTS
@ -252,199 +290,170 @@ static const __m128i iq4nlt = _mm_loadu_si128((const __m128i *) kvalues_iq4nl);
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
// FLOATING POINT MATRIX MULTIPLICATION // FLOATING POINT MATRIX MULTIPLICATION
template <int M>
static inline int64_t BLOCK_SIZE(size_t m) {
const int64_t NB_BLOC_M = (m + M - 1) / M;
return (m % NB_BLOC_M == 0) ? m / NB_BLOC_M : (m / NB_BLOC_M) + 1;
}
static constexpr inline int64_t BLOC_POS(int64_t ib, int64_t ibN, int64_t bloc_size) {
return ib < ibN ? ib * bloc_size : ibN * bloc_size + (ib - ibN) * (bloc_size - 1);
}
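BLOCK_SIZE<M>(m) splits m into ceil(m/M) nearly equal blocks and returns the rounded-up block size, while BLOC_POS maps a block index to its starting offset once the first ibN blocks carry the larger size. A quick self-contained check of the two helpers exactly as defined above:

// BLOCK_SIZE<6>(100): ceil(100/6) = 17 blocks, block size ceil(100/17) = 6.
// BLOCK_SIZE<6>(7):    ceil(7/6)  = 2 blocks,  block size ceil(7/2)   = 4.
#include <cstdint>
#include <cstdio>

template <int M>
static int64_t BLOCK_SIZE_ref(int64_t m) {
    const int64_t NB_BLOC_M = (m + M - 1) / M;
    return (m % NB_BLOC_M == 0) ? m / NB_BLOC_M : (m / NB_BLOC_M) + 1;
}

static int64_t BLOC_POS_ref(int64_t ib, int64_t ibN, int64_t bloc_size) {
    return ib < ibN ? ib * bloc_size : ibN * bloc_size + (ib - ibN) * (bloc_size - 1);
}

int main() {
    std::printf("BLOCK_SIZE<6>(100) = %lld\n", (long long) BLOCK_SIZE_ref<6>(100)); // 6
    std::printf("BLOCK_SIZE<6>(7)   = %lld\n", (long long) BLOCK_SIZE_ref<6>(7));   // 4
    // first 3 blocks of size 4, then size 3: starts at 0, 4, 8, 12, 15
    for (int64_t ib = 0; ib < 5; ++ib) {
        std::printf("BLOC_POS(%lld, 3, 4) = %lld\n", (long long) ib, (long long) BLOC_POS_ref(ib, 3, 4));
    }
    return 0;
}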
template <int KN, typename D, typename V, typename TA, typename TB, typename TC> template <int KN, typename D, typename V, typename TA, typename TB, typename TC>
class tinyBLAS { class tinyBLAS {
public: public:
tinyBLAS(int64_t k, tinyBLAS(const ggml_compute_params * params, int64_t k,
const TA *A, int64_t lda, const TA *A, int64_t lda,
const TB *B, int64_t ldb, const TB *B, int64_t ldb,
TC *C, int64_t ldc, TC *C, int64_t ldc)
int ith, int nth) : params(params), A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc) {
: A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
} }
void matmul(int64_t m, int64_t n) { bool matmul(int64_t m, int64_t n) {
mnpack(0, m, 0, n); if (k % KN != 0)
return false;
// compute RM for only need tile with size RM&RM-1
#if VECTOR_REGISTERS == 32
if (m % 16 == 0 && (m/16 >= params->nth)) {
const int64_t SIZE_N = BLOCK_SIZE<6>(n);
mnpack<4, 6, 4>(m, n, SIZE_N, 12);
return true;
}
if (m % 8 == 0 ) {
const int64_t SIZE_N = BLOCK_SIZE<6>(n);
mnpack<4, 6, 2>(m, n, SIZE_N, 12);
return true;
}
if (m % 4 == 0) {
const int64_t SIZE_N = BLOCK_SIZE<6>(n);
mnpack<4, 6, 1>(m, n, SIZE_N, 12);
return true;
}
#else // VECTOR_REGISTERS == 16
if (m % 16 == 0 && (m/16 >= params->nth)) {
const int64_t SIZE_N = BLOCK_SIZE<3>(n);
mnpack<4, 3, 4>(m, n, SIZE_N, 24);
return true;
}
if (m % 8 == 0 ) {
const int64_t SIZE_N = BLOCK_SIZE<3>(n);
mnpack<4, 3, 2>(m, n, SIZE_N, 24);
return true;
}
if (m % 4 == 0) {
const int64_t SIZE_N = BLOCK_SIZE<3>(n);
mnpack<4, 3, 1>(m, n, SIZE_N, 24);
return true;
}
#endif
return false;
} }
private: private:
NOINLINE void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) { template <int RM, int RN, int BM>
int64_t mc, nc, mp, np; inline void mnpack(int64_t m, int64_t n, int64_t SIZE_N, int64_t BN) {
switch ((MIN(m - m0, 5) << 4) | MIN(n - n0, 5)) { if (SIZE_N == RN) {
#if VECTOR_REGISTERS == 32 return gemm<RM, RN, BM>(m, n, BN);
case 0x55: }
mc = 5; if constexpr (RN > 1) {
nc = 5; return mnpack<RM, RN-1, BM>(m, n, SIZE_N, BN);
gemm<5, 5>(m0, m, n0, n); } else {
break; GGML_LOG_ERROR("mnpack<%d, %d> bloc size not supported\n", RM, (int)SIZE_N);
case 0x45: GGML_ASSERT(false); // we have missed something.
mc = 4;
nc = 5;
gemm<4, 5>(m0, m, n0, n);
break;
case 0x54:
mc = 5;
nc = 4;
gemm<5, 4>(m0, m, n0, n);
break;
case 0x44:
mc = 4;
nc = 4;
gemm<4, 4>(m0, m, n0, n);
break;
case 0x53:
mc = 5;
nc = 3;
gemm<5, 3>(m0, m, n0, n);
break;
case 0x35:
mc = 3;
nc = 5;
gemm<3, 5>(m0, m, n0, n);
break;
case 0x43:
mc = 4;
nc = 3;
gemm<4, 3>(m0, m, n0, n);
break;
#else
case 0x55:
case 0x54:
case 0x53:
case 0x45:
case 0x44:
case 0x43:
mc = 4;
nc = 3;
gemm<4, 3>(m0, m, n0, n);
break;
case 0x35:
#endif
case 0x34:
mc = 3;
nc = 4;
gemm<3, 4>(m0, m, n0, n);
break;
case 0x52:
mc = 5;
nc = 2;
gemm<5, 2>(m0, m, n0, n);
break;
case 0x33:
mc = 3;
nc = 3;
gemm<3, 3>(m0, m, n0, n);
break;
case 0x25:
mc = 2;
nc = 5;
gemm<2, 5>(m0, m, n0, n);
break;
case 0x42:
mc = 4;
nc = 2;
gemm<4, 2>(m0, m, n0, n);
break;
case 0x24:
mc = 2;
nc = 4;
gemm<2, 4>(m0, m, n0, n);
break;
case 0x32:
mc = 3;
nc = 2;
gemm<3, 2>(m0, m, n0, n);
break;
case 0x23:
mc = 2;
nc = 3;
gemm<2, 3>(m0, m, n0, n);
break;
case 0x51:
mc = 5;
nc = 1;
gemm<5, 1>(m0, m, n0, n);
break;
case 0x41:
mc = 4;
nc = 1;
gemm<4, 1>(m0, m, n0, n);
break;
case 0x22:
mc = 2;
nc = 2;
gemm<2, 2>(m0, m, n0, n);
break;
case 0x15:
mc = 1;
nc = 5;
gemm<1, 5>(m0, m, n0, n);
break;
case 0x14:
mc = 1;
nc = 4;
gemm<1, 4>(m0, m, n0, n);
break;
case 0x31:
mc = 3;
nc = 1;
gemm<3, 1>(m0, m, n0, n);
break;
case 0x13:
mc = 1;
nc = 3;
gemm<1, 3>(m0, m, n0, n);
break;
case 0x21:
mc = 2;
nc = 1;
gemm<2, 1>(m0, m, n0, n);
break;
case 0x12:
mc = 1;
nc = 2;
gemm<1, 2>(m0, m, n0, n);
break;
case 0x11:
mc = 1;
nc = 1;
gemm<1, 1>(m0, m, n0, n);
break;
default:
return;
} }
mp = m0 + (m - m0) / mc * mc;
np = n0 + (n - n0) / nc * nc;
mnpack(mp, m, n0, np);
mnpack(m0, m, np, n);
} }
template <int RM, int RN> template <int RM, int RN>
NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) { inline void gemm_bloc(int64_t ii, int64_t jj) {
int64_t ytiles = (m - m0) / RM;
int64_t xtiles = (n - n0) / RN;
int64_t tiles = xtiles * ytiles;
int64_t duty = (tiles + nth - 1) / nth;
int64_t start = duty * ith;
int64_t end = start + duty;
if (end > tiles)
end = tiles;
for (int64_t job = start; job < end; ++job) {
int64_t ii = m0 + job / xtiles * RM;
int64_t jj = n0 + job % xtiles * RN;
D Cv[RN][RM] = {}; D Cv[RN][RM] = {};
for (int64_t l = 0; l < k; l += KN) for (int64_t l = 0; l < k; l += KN) {
for (int64_t j = 0; j < RN; ++j) // help the compiler with operation ordering.
for (int64_t i = 0; i < RM; ++i) if constexpr (RM <= RN) {
Cv[j][i] = madd(load<V>(A + lda * (ii + i) + l), V Av[RM];
load<V>(B + ldb * (jj + j) + l), for (int64_t i = 0; i < RM; ++i) {
Cv[j][i]); Av[i] = load<V>(A + lda * (ii + i) + l);
}
for (int64_t j = 0; j < RN; ++j) {
V Bv = load<V>(B + ldb * (jj + j) + l);
for (int64_t i = 0; i < RM; ++i) {
Cv[j][i] = madd(Av[i], Bv, Cv[j][i]);
}
}
} else {
V Bv[RN];
for (int64_t j = 0; j < RN; ++j) {
Bv[j] = load<V>(B + ldb * (jj + j) + l);
}
for (int64_t i = 0; i < RM; ++i) {
V Av = load<V>(A + lda * (ii + i) + l);
for (int64_t j = 0; j < RN; ++j) {
Cv[j][i] = madd(Av, Bv[j], Cv[j][i]);
}
}
}
}
for (int64_t j = 0; j < RN; ++j) for (int64_t j = 0; j < RN; ++j)
for (int64_t i = 0; i < RM; ++i) for (int64_t i = 0; i < RM; ++i)
C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]); C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
} }
template <int RM, int RN, int BM>
NOINLINE void gemm(int64_t m, int64_t n, int64_t BN) {
static std::atomic<int64_t> current_chunk;
GGML_ASSERT(m % (RM * BM) == 0);
const int64_t ytiles = m / (RM * BM);
const int64_t xtiles = (n + RN -1) / RN;
const int64_t jj_RN = (xtiles - (xtiles * RN - n));
// "round" bloc_size to "nearest" BN
const int64_t NB_BN = xtiles < BN ? 1 : (xtiles + BN / 2) / BN;
const int64_t SIZE_BN = xtiles % NB_BN == 0 ? xtiles / NB_BN : xtiles / NB_BN + 1;
const int64_t jj_BN = (NB_BN - (NB_BN * SIZE_BN - xtiles));
const int64_t nb_job = ytiles * NB_BN;
if (params->ith == 0) {
GGML_ASSERT( jj_BN * SIZE_BN + (NB_BN - jj_BN) * (SIZE_BN - 1) == xtiles);
// Every thread starts at ith, so the first unprocessed chunk is nth. This saves a bit of coordination right at the start.
std::atomic_store_explicit(&current_chunk, (int64_t)params->nth, std::memory_order_relaxed);
} }
ggml_barrier(params->threadpool);
int64_t job = params->ith;
while (job < nb_job) {
const int64_t ii = (job % ytiles) * RM * BM;
const int64_t jb = job / ytiles;
const int64_t jr0 = BLOC_POS(jb , jj_BN, SIZE_BN);
const int64_t jrN = BLOC_POS(jb+1, jj_BN, SIZE_BN);
const int64_t jj0 = BLOC_POS(jr0, jj_RN, RN);
const int64_t jj2 = BLOC_POS(jrN, jj_RN, RN);
const int64_t jj1 = jj2 < jj_RN * RN ? jj2 : jj_RN * RN;
for (int64_t bi = 0; bi < BM * RM; bi += RM) {
int64_t jj = jj0;
for (; jj < jj1; jj += RN) {
gemm_bloc<RM, RN>(ii + bi, jj);
}
if constexpr (RN > 1) {
for (; jj < jj2; jj += RN - 1) {
gemm_bloc<RM, RN-1>(ii + bi, jj);
}
}
GGML_ASSERT(jj == jj2);
}
// next step.
job = std::atomic_fetch_add_explicit(&current_chunk, (int64_t)1, std::memory_order_relaxed);
}
ggml_barrier(params->threadpool);
return;
}
const ggml_compute_params * params;
const TA *const A; const TA *const A;
const TB *const B; const TB *const B;
TC *const C; TC *const C;
@ -452,8 +461,6 @@ class tinyBLAS {
const int64_t lda; const int64_t lda;
const int64_t ldb; const int64_t ldb;
const int64_t ldc; const int64_t ldc;
const int ith;
const int nth;
}; };
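The new gemm() schedules work through a shared atomic counter: thread ith seeds itself with job ith, the counter starts at nth, and every thread claims the next unprocessed chunk with a relaxed fetch_add when it finishes one. A standalone sketch of just that distribution pattern, with std::thread standing in for the ggml threadpool and its barrier:

// Chunked self-scheduling as in gemm() above, reduced to its skeleton.
// do_job must be safe to call concurrently from several threads.
#include <atomic>
#include <cstdint>
#include <functional>
#include <thread>
#include <vector>

static void run_jobs(int nth, int64_t nb_job, const std::function<void(int64_t)> & do_job) {
    std::atomic<int64_t> current_chunk{nth}; // first unclaimed job after the per-thread seeds
    std::vector<std::thread> workers;
    for (int ith = 0; ith < nth; ++ith) {
        workers.emplace_back([&, ith]() {
            int64_t job = ith; // every thread starts at its own index
            while (job < nb_job) {
                do_job(job);
                // fetch_add returns the previous value, i.e. the next unclaimed job
                job = current_chunk.fetch_add(1, std::memory_order_relaxed);
            }
        });
    }
    for (auto & w : workers) {
        w.join();
    }
}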
////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////
@ -1659,8 +1666,9 @@ class tinyBLAS_PPC {
* @param Ctype is GGML data type of `C` * @param Ctype is GGML data type of `C`
* @return true if this function was able to service the matmul request * @return true if this function was able to service the matmul request
*/ */
bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda, const void *B, int64_t ldb, void *C, bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64_t n, int64_t k,
int64_t ldc, int ith, int nth, int Atype, int Btype, int Ctype) { const void *A, int64_t lda, const void *B, int64_t ldb, void *C,
int64_t ldc, int Atype, int Btype, int Ctype) {
assert(m >= 0); assert(m >= 0);
assert(n >= 0); assert(n >= 0);
@ -1668,8 +1676,8 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
assert(lda >= k); assert(lda >= k);
assert(ldb >= k); assert(ldb >= k);
assert(ldc >= m); assert(ldc >= m);
assert(nth > 0); assert(params->nth > 0);
assert(ith < nth); assert(params->ith < params->nth);
// only enable sgemm for prompt processing // only enable sgemm for prompt processing
if (n < 2) if (n < 2)
@ -1684,37 +1692,25 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
if (Btype != GGML_TYPE_F32) if (Btype != GGML_TYPE_F32)
return false; return false;
#if defined(__AVX512F__) #if defined(__AVX512F__)
if (k % 16) tinyBLAS<16, __m512, __m512, float, float, float> tb{ params,
return false;
tinyBLAS<16, __m512, __m512, float, float, float> tb{
k, (const float *)A, lda, k, (const float *)A, lda,
(const float *)B, ldb, (const float *)B, ldb,
(float *)C, ldc, (float *)C, ldc};
ith, nth}; return tb.matmul(m, n);
tb.matmul(m, n);
return true;
#elif defined(__AVX__) || defined(__AVX2__) #elif defined(__AVX__) || defined(__AVX2__)
if (k % 8) tinyBLAS<8, __m256, __m256, float, float, float> tb{ params,
return false;
tinyBLAS<8, __m256, __m256, float, float, float> tb{
k, (const float *)A, lda, k, (const float *)A, lda,
(const float *)B, ldb, (const float *)B, ldb,
(float *)C, ldc, (float *)C, ldc};
ith, nth}; return tb.matmul(m, n);
tb.matmul(m, n);
return true;
#elif defined(__ARM_NEON) #elif defined(__ARM_NEON)
if (n < 4) if (n < 4)
return false; return false;
if (k % 4) tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{ params,
return false;
tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{
k, (const float *)A, lda, k, (const float *)A, lda,
(const float *)B, ldb, (const float *)B, ldb,
(float *)C, ldc, (float *)C, ldc};
ith, nth}; return tb.matmul(m, n);
tb.matmul(m, n);
return true;
#elif defined(__MMA__) #elif defined(__MMA__)
if (k % 8) if (k % 8)
return false; return false;
@ -1722,7 +1718,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
k, (const float *)A, lda, k, (const float *)A, lda,
(const float *)B, ldb, (const float *)B, ldb,
(float *)C, ldc, (float *)C, ldc,
ith, nth}; params->ith, params->nth};
tb.matmul(m, n); tb.matmul(m, n);
return true; return true;
#else #else
@ -1730,60 +1726,71 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
#endif #endif
} }
case GGML_TYPE_BF16: {
#if defined(__AVX512BF16__)
if (Btype == GGML_TYPE_BF16) {
tinyBLAS<32, __m512, __m512bh, ggml_bf16_t, ggml_bf16_t, float> tb{ params, k,
(const ggml_bf16_t *)A, lda,
(const ggml_bf16_t *)B, ldb,
(float *)C, ldc};
return tb.matmul(m, n);
}
#elif defined(__AVX512F__)
if (Btype == GGML_TYPE_BF16) {
tinyBLAS<16, __m512, __m512, ggml_bf16_t, ggml_bf16_t, float> tb{ params, k,
(const ggml_bf16_t *)A, lda,
(const ggml_bf16_t *)B, ldb,
(float *)C, ldc};
return tb.matmul(m, n);
}
#elif defined(__AVX2__)
if (Btype == GGML_TYPE_BF16) {
tinyBLAS<8, __m256, __m256, ggml_bf16_t, ggml_bf16_t, float> tb{ params, k,
(const ggml_bf16_t *)A, lda,
(const ggml_bf16_t *)B, ldb,
(float *)C, ldc};
return tb.matmul(m, n);
}
#endif
return false;
}
case GGML_TYPE_F16: { case GGML_TYPE_F16: {
#if defined(__AVX512F__) #if defined(__AVX512F__)
if (k % 16) if (Btype == GGML_TYPE_F16) {
return false; tinyBLAS<16, __m512, __m512, ggml_fp16_t, ggml_fp16_t, float> tb{ params, k,
if (Btype != GGML_TYPE_F32) (const ggml_fp16_t *)A, lda,
return false; (const ggml_fp16_t *)B, ldb,
tinyBLAS<16, __m512, __m512, ggml_fp16_t, float, float> tb{ (float *)C, ldc};
k, (const ggml_fp16_t *)A, lda, return tb.matmul(m, n);
(const float *)B, ldb, }
(float *)C, ldc,
ith, nth};
tb.matmul(m, n);
return true;
#elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__) #elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__)
if (k % 8) if (Btype == GGML_TYPE_F16) {
return false; tinyBLAS<8, __m256, __m256, ggml_fp16_t, ggml_fp16_t, float> tb{ params, k,
if (Btype != GGML_TYPE_F32) (const ggml_fp16_t *)A, lda,
return false; (const ggml_fp16_t *)B, ldb,
tinyBLAS<8, __m256, __m256, ggml_fp16_t, float, float> tb{ (float *)C, ldc};
k, (const ggml_fp16_t *)A, lda, return tb.matmul(m, n);
(const float *)B, ldb, }
(float *)C, ldc,
ith, nth};
tb.matmul(m, n);
return true;
#elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER) #elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
if (n < 8) if (n < 8)
return false; return false;
if (k % 8) if (Btype == GGML_TYPE_F16) {
return false; tinyBLAS<8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, float> tb{ params,
if (Btype != GGML_TYPE_F16)
return false;
tinyBLAS<8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, float> tb{
k, (const ggml_fp16_t *)A, lda, k, (const ggml_fp16_t *)A, lda,
(const ggml_fp16_t *)B, ldb, (const ggml_fp16_t *)B, ldb,
(float *)C, ldc, (float *)C, ldc};
ith, nth}; return tb.matmul(m, n);
tb.matmul(m, n); }
return true;
#elif defined(__ARM_NEON) && !defined(_MSC_VER) #elif defined(__ARM_NEON) && !defined(_MSC_VER)
if (k % 4) if (Btype == GGML_TYPE_F32) {
return false; tinyBLAS<4, float32x4_t, float32x4_t, ggml_fp16_t, float, float> tb{ params,
if (Btype != GGML_TYPE_F32)
return false;
tinyBLAS<4, float32x4_t, float32x4_t, ggml_fp16_t, float, float> tb{
k, (const ggml_fp16_t *)A, lda, k, (const ggml_fp16_t *)A, lda,
(const float *)B, ldb, (const float *)B, ldb,
(float *)C, ldc, (float *)C, ldc};
ith, nth}; return tb.matmul(m, n);
tb.matmul(m, n); }
return true;
#else
return false;
#endif #endif
return false;
} }
case GGML_TYPE_Q8_0: { case GGML_TYPE_Q8_0: {
@ -1794,7 +1801,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
k, (const block_q8_0 *)A, lda, k, (const block_q8_0 *)A, lda,
(const block_q8_0 *)B, ldb, (const block_q8_0 *)B, ldb,
(float *)C, ldc, (float *)C, ldc,
ith, nth}; params->ith, params->nth};
tb.matmul(m, n); tb.matmul(m, n);
return true; return true;
#elif defined(__ARM_FEATURE_DOTPROD) #elif defined(__ARM_FEATURE_DOTPROD)
@ -1802,7 +1809,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
k, (const block_q8_0 *)A, lda, k, (const block_q8_0 *)A, lda,
(const block_q8_0 *)B, ldb, (const block_q8_0 *)B, ldb,
(float *)C, ldc, (float *)C, ldc,
ith, nth}; params->ith, params->nth};
tb.matmul(m, n); tb.matmul(m, n);
return true; return true;
#else #else
@ -1818,7 +1825,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
k, (const block_q4_0 *)A, lda, k, (const block_q4_0 *)A, lda,
(const block_q8_0 *)B, ldb, (const block_q8_0 *)B, ldb,
(float *)C, ldc, (float *)C, ldc,
ith, nth}; params->ith, params->nth};
tb.matmul(m, n); tb.matmul(m, n);
return true; return true;
#elif defined(__ARM_FEATURE_DOTPROD) #elif defined(__ARM_FEATURE_DOTPROD)
@ -1826,7 +1833,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
k, (const block_q4_0 *)A, lda, k, (const block_q4_0 *)A, lda,
(const block_q8_0 *)B, ldb, (const block_q8_0 *)B, ldb,
(float *)C, ldc, (float *)C, ldc,
ith, nth}; params->ith, params->nth};
tb.matmul(m, n); tb.matmul(m, n);
return true; return true;
#else #else
@ -1842,7 +1849,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
k, (const block_q5_0 *)A, lda, k, (const block_q5_0 *)A, lda,
(const block_q8_0 *)B, ldb, (const block_q8_0 *)B, ldb,
(float *)C, ldc, (float *)C, ldc,
ith, nth}; params->ith, params->nth};
tb.matmul(m, n); tb.matmul(m, n);
return true; return true;
#else #else
@ -1858,7 +1865,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
k, (const block_iq4_nl *)A, lda, k, (const block_iq4_nl *)A, lda,
(const block_q8_0 *)B, ldb, (const block_q8_0 *)B, ldb,
(float *)C, ldc, (float *)C, ldc,
ith, nth}; params->ith, params->nth};
tb.matmul(m, n); tb.matmul(m, n);
return true; return true;
#else #else
@ -1870,6 +1877,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
return false; return false;
} }
(void)params;
(void)m; (void)m;
(void)n; (void)n;
(void)k; (void)k;
@ -1879,8 +1887,6 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
(void)ldb; (void)ldb;
(void)C; (void)C;
(void)ldc; (void)ldc;
(void)ith;
(void)nth;
(void)Atype; (void)Atype;
(void)Btype; (void)Btype;
(void)Ctype; (void)Ctype;


@ -5,8 +5,8 @@
extern "C" { extern "C" {
#endif #endif
bool llamafile_sgemm(int64_t, int64_t, int64_t, const void *, int64_t, bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t, int64_t, int64_t,
const void *, int64_t, void *, int64_t, int, int, const void *, int64_t, const void *, int64_t, void *, int64_t,
int, int, int); int, int, int);
#ifdef __cplusplus #ifdef __cplusplus


@ -2744,13 +2744,13 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
cl_image_format img_fmt_1d; cl_image_format img_fmt_1d;
cl_image_desc img_desc_1d; cl_image_desc img_desc_1d;
cl_buffer_region region; cl_buffer_region region;
cl_mem A_image1d; cl_mem A_image1d = nullptr;
cl_mem B_image1d; cl_mem B_image1d = nullptr;
cl_mem B_sub_buffer; cl_mem B_sub_buffer = nullptr;
cl_mem C_d; cl_mem C_d = nullptr;
// for B transpose // for B transpose
cl_mem B_d; cl_mem B_d = nullptr;
cl_mem B_d_input_image; cl_mem B_d_input_image = nullptr;
// <--------------------------------------------> // // <--------------------------------------------> //
// define matrix dimensions // define matrix dimensions

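Initialising all of the cl_mem handles to nullptr is presumably about cleanup on early-exit paths: only buffers that were actually created should be released. A small illustrative guard in that spirit (clReleaseMemObject is the standard OpenCL call; the helper itself is not from this file):

// Release a buffer only if it was created, and reset the handle so a second
// cleanup pass stays harmless.
#include <CL/cl.h>

static void release_if_set(cl_mem & m) {
    if (m != nullptr) {
        clReleaseMemObject(m);
        m = nullptr;
    }
}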

@ -11,6 +11,8 @@
// //
#include "common.hpp" #include "common.hpp"
#include "ggml-backend-impl.h"
#include "ggml-impl.h" #include "ggml-impl.h"
int get_current_device_id() { int get_current_device_id() {
@ -65,9 +67,9 @@ void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
const ggml_sycl_op_flatten_t op) try { const ggml_sycl_op_flatten_t op) try {
const bool use_src1 = src1 != nullptr; const bool use_src1 = src1 != nullptr;
if(use_src1)
GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT(strcmp(src1->buffer->buft->iface.get_name(src1->buffer->buft), GGML_SYCL_NAME "_Split") != 0);
GGML_ASSERT( dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0);
// dd = data device // dd = data device
float * src0_ddf = (float *) src0->data; float * src0_ddf = (float *) src0->data;

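The split-buffer test now works purely by name: a buffer type the SYCL backend registered as GGML_SYCL_NAME "_Split" is recognised with a strcmp on its get_name result. A tiny sketch of that check in isolation (the name-composition rule comes from the assert above; everything else is illustrative):

// True when a buffer-type name matches "<backend>_Split".
#include <cstdio>
#include <cstring>

static bool is_split_buffer_name(const char * buft_name, const char * backend_name) {
    char expected[64];
    std::snprintf(expected, sizeof(expected), "%s_Split", backend_name);
    return std::strcmp(buft_name, expected) == 0;
}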

@ -26,7 +26,11 @@
#define GGML_COMMON_DECL_SYCL #define GGML_COMMON_DECL_SYCL
#define GGML_COMMON_IMPL_SYCL #define GGML_COMMON_IMPL_SYCL
/* suppress warning spam */
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wnested-anon-types"
#include "ggml-common.h" #include "ggml-common.h"
#pragma clang diagnostic pop
void* ggml_sycl_host_malloc(size_t size); void* ggml_sycl_host_malloc(size_t size);
void ggml_sycl_host_free(void* ptr); void ggml_sycl_host_free(void* ptr);


@ -288,10 +288,8 @@ ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
ggml_tensor *tensor) try { ggml_tensor *tensor) try {
ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context; ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
if (tensor->view_src != NULL && tensor->view_offs == 0) { if (tensor->view_src != NULL) {
assert(tensor->view_src->buffer->buft == buffer->buft); assert(tensor->view_src->buffer->buft == buffer->buft);
tensor->backend = tensor->view_src->backend;
tensor->extra = tensor->view_src->extra;
return; return;
} }
@ -539,7 +537,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
auto dev_count = ggml_backend_sycl_get_device_count(); auto dev_count = ggml_backend_sycl_get_device_count();
if (device>=dev_count or device<0) { if (device>=dev_count or device<0) {
printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n", GGML_LOG_ERROR("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
device, dev_count-1); device, dev_count-1);
GGML_ASSERT(device<dev_count); GGML_ASSERT(device<dev_count);
} }
@ -567,7 +565,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_conte
int device = ctx->device; int device = ctx->device;
if (device>=ggml_sycl_info().device_count or device<0) { if (device>=ggml_sycl_info().device_count or device<0) {
printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n", GGML_LOG_ERROR("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
device, ggml_sycl_info().device_count-1); device, ggml_sycl_info().device_count-1);
GGML_ASSERT(device<ggml_sycl_info().device_count); GGML_ASSERT(device<ggml_sycl_info().device_count);
} }
@ -746,7 +744,7 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
} }
// FIXME: do not crash if cudaMalloc fails // FIXME: do not crash if SYCL Buffer alloc fails
// currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
ggml_sycl_set_device(i); ggml_sycl_set_device(i);
const queue_ptr stream = ctx->streams[i]; const queue_ptr stream = ctx->streams[i];
@ -788,7 +786,6 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
CHECK_TRY_ERROR(extra->events[i][is] = new sycl::event())); CHECK_TRY_ERROR(extra->events[i][is] = new sycl::event()));
} }
} }
tensor->backend = GGML_BACKEND_TYPE_GPU_SPLIT;
tensor->extra = extra; tensor->extra = extra;
} }
catch (sycl::exception const &exc) { catch (sycl::exception const &exc) {
@ -2349,12 +2346,22 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst,
dpct::memcpy_direction kind; dpct::memcpy_direction kind;
char * src_ptr; char * src_ptr;
if (src->backend == GGML_BACKEND_TYPE_CPU) { if (ggml_backend_buffer_is_host(src->buffer)) {
kind = dpct::host_to_device; kind = dpct::host_to_device;
//GGML_SYCL_DEBUG("%s: Host buffer type src tensor\n", __func__);
src_ptr = (char *) src->data; src_ptr = (char *) src->data;
// GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d GGML_BACKEND_TYPE_CPU src_ptr %p\n", src_ptr); // GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d GGML_BACKEND_TYPE_CPU src_ptr %p\n", src_ptr);
} else if (src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT) { } else if (ggml_backend_buffer_is_sycl(src->buffer)) {
GGML_ASSERT(src->backend != GGML_BACKEND_TYPE_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1])); // If buffer is a SYCL buffer
//GGML_SYCL_DEBUG("%s: SYCL buffer type src tensor\n", __func__);
kind = dpct::device_to_device;
src_ptr = (char *) src->data;
} else if (ggml_backend_buffer_is_sycl_split(src->buffer)) {
/*
If buffer is a SYCL split buffer
*/
//GGML_SYCL_DEBUG("%s: Split buffer type src tensor\n", __func__);
GGML_ASSERT(i1_low == 0 && i1_high == src->ne[1]);
kind = dpct::device_to_device; kind = dpct::device_to_device;
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra; ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
int id; int id;
@ -2857,8 +2864,8 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
const int nb2 = dst->nb[2]; const int nb2 = dst->nb[2];
const int nb3 = dst->nb[3]; const int nb3 = dst->nb[3];
GGML_ASSERT(dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer));
GGML_ASSERT(src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src1->buffer));
GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1)); GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));
GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0); GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
@ -2878,7 +2885,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING); int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT; const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
GGML_ASSERT(!(split && ne02 > 1)); GGML_ASSERT(!(split && ne02 > 1));
GGML_ASSERT(!(split && ne03 > 1)); GGML_ASSERT(!(split && ne03 > 1));
GGML_ASSERT(!(split && ne02 < ne12)); GGML_ASSERT(!(split && ne02 < ne12));
@ -3198,7 +3205,7 @@ static void ggml_sycl_mul_mat_vec_p021(ggml_backend_sycl_context & ctx, const gg
const ggml_tensor *src1, const ggml_tensor *src1,
ggml_tensor *dst) try { ggml_tensor *dst) try {
GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1)); GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer));
GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src0->type == GGML_TYPE_F16);
@ -3231,7 +3238,7 @@ static void ggml_sycl_mul_mat_vec_nc(ggml_backend_sycl_context & ctx, const ggml
GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src0));
GGML_ASSERT(!ggml_is_transposed(src1)); GGML_ASSERT(!ggml_is_transposed(src1));
GGML_ASSERT(!ggml_is_permuted(src0)); GGML_ASSERT(!ggml_is_permuted(src0));
GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer));
GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT(src1->type == GGML_TYPE_F32);
@ -3293,7 +3300,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx,
ggml_tensor *dst) try { ggml_tensor *dst) try {
GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src0));
GGML_ASSERT(!ggml_is_transposed(src1)); GGML_ASSERT(!ggml_is_transposed(src1));
GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer));
GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_TENSOR_BINARY_OP_LOCALS GGML_TENSOR_BINARY_OP_LOCALS
@ -4638,10 +4645,9 @@ static ggml_backend_dev_t ggml_backend_sycl_reg_get_device(ggml_backend_reg_t re
static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, const char *name) { static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, const char *name) {
GGML_UNUSED(reg); GGML_UNUSED(reg);
// TODO: update to the current function signature if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
//if (strcmp(name, "ggml_backend_split_buffer_type") == 0) { return (void *)ggml_backend_sycl_split_buffer_type;
// return (void *)ggml_backend_sycl_split_buffer_type; }
//}
// SYCL doesn't support registering host memory, left here for reference // SYCL doesn't support registering host memory, left here for reference
// "ggml_backend_register_host_buffer" // "ggml_backend_register_host_buffer"

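With the commented-out TODO resolved, the SYCL split-buffer-type constructor is reachable again through the registry's proc-address mechanism. A hedged sketch of the lookup from the caller's side (ggml_backend_reg_get_proc_address is the public registry entry point; the cast and use of the returned pointer are left to the caller):

// Fetch the proc address registered above; the caller casts the returned
// pointer to the function type it expects.
#include "ggml-backend.h"

static void * lookup_split_buffer_type(ggml_backend_reg_t reg) {
    return ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");
}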

@ -145,6 +145,8 @@ class vk_perf_logger;
#endif #endif
static void ggml_vk_destroy_buffer(vk_buffer& buf); static void ggml_vk_destroy_buffer(vk_buffer& buf);
static constexpr uint32_t mul_mat_vec_max_cols = 8;
struct vk_device_struct { struct vk_device_struct {
std::mutex mutex; std::mutex mutex;
@ -202,8 +204,8 @@ struct vk_device_struct {
vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat_id[GGML_TYPE_COUNT]; vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat_id[GGML_TYPE_COUNT];
vk_pipeline pipeline_dequant[GGML_TYPE_COUNT]; vk_pipeline pipeline_dequant[GGML_TYPE_COUNT];
vk_pipeline pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_COUNT]; vk_pipeline pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_COUNT][mul_mat_vec_max_cols];
vk_pipeline pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_COUNT]; vk_pipeline pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_COUNT][mul_mat_vec_max_cols];
vk_pipeline pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_COUNT]; vk_pipeline pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_COUNT];
vk_pipeline pipeline_mul_mat_vec_p021_f16_f32; vk_pipeline pipeline_mul_mat_vec_p021_f16_f32;
@ -411,7 +413,7 @@ struct vk_op_unary_push_constants {
uint32_t ne; uint32_t ne;
uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03; uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13; uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
uint32_t d_offset; uint32_t misalign_offsets;
float param1; float param2; float param1; float param2;
uint32_t ne0_012mp; uint32_t ne0_012L; uint32_t ne0_012mp; uint32_t ne0_012L;
uint32_t ne0_01mp; uint32_t ne0_01L; uint32_t ne0_01mp; uint32_t ne0_01L;
@ -459,7 +461,7 @@ struct vk_op_binary_push_constants {
uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03; uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13; uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
uint32_t ne20; uint32_t ne21; uint32_t ne22; uint32_t ne23; uint32_t nb20; uint32_t nb21; uint32_t nb22; uint32_t nb23; uint32_t ne20; uint32_t ne21; uint32_t ne22; uint32_t ne23; uint32_t nb20; uint32_t nb21; uint32_t nb22; uint32_t nb23;
uint32_t d_offset; uint32_t misalign_offsets;
float param1; float param2; int32_t param3; float param1; float param2; int32_t param3;
}; };
@ -546,7 +548,7 @@ struct vk_staging_memcpy {
}; };
struct vk_op_upscale_push_constants { struct vk_op_upscale_push_constants {
uint32_t ne; uint32_t d_offset; uint32_t ne; uint32_t a_offset; uint32_t d_offset;
uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13;
float sf0; float sf1; float sf2; float sf3; float sf0; float sf1; float sf2; float sf3;
@ -1404,10 +1406,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
// spec constants and tile sizes for non-quant matmul/matmul_id // spec constants and tile sizes for non-quant matmul/matmul_id
l_warptile = { 256, 128, 256, 64 }; l_warptile = { 256, 128, 256, 64 };
m_warptile = { 256, 128, 128, 64 }; m_warptile = { 256, 128, 128, 64 };
s_warptile = { 128, 32, 16, 64 }; s_warptile = { 128, 64, 64, 64 };
l_wg_denoms = {128, 256, 1 }; l_wg_denoms = {128, 256, 1 };
m_wg_denoms = {128, 128, 1 }; m_wg_denoms = {128, 128, 1 };
s_wg_denoms = { 32, 16, 1 }; s_wg_denoms = { 64, 64, 1 };
// spec constants and tile sizes for quant matmul (non-Qi_K) // spec constants and tile sizes for quant matmul (non-Qi_K)
l_warptile_mmq = { 256, 128, 256, 64 }; l_warptile_mmq = { 256, 128, 256, 64 };
@ -1855,53 +1857,60 @@ static void ggml_vk_load_shaders(vk_device& device) {
// mul mat vec // mul mat vec
// AMD GCN and Intel graphics cards perform best when the number of rows per shader is doubled // the number of rows computed per shader depends on GPU model and quant
uint32_t rm = 1; uint32_t rm_stdq = 1;
if ((device->vendor_id == VK_VENDOR_ID_AMD && device->subgroup_min_size == 64 && device->subgroup_max_size == 64) || device->vendor_id == VK_VENDOR_ID_INTEL) uint32_t rm_kq = 2;
rm = 2; if (device->vendor_id == VK_VENDOR_ID_AMD) {
if (device->subgroup_min_size == 64 && device->subgroup_max_size == 64) { // GCN
rm_stdq = 2;
rm_kq = 4;
}
} else if (device->vendor_id == VK_VENDOR_ID_INTEL)
rm_stdq = 2;
// computing additional rows per workgroup is a benefit for Q4_0 -> Q5_1, but not for Q8_0. for (uint32_t i = 0; i < mul_mat_vec_max_cols; ++i) {
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f32_f32", mul_mat_vec_f32_f32_f32_len, mul_mat_vec_f32_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f32_f32_"+std::to_string(i+1), mul_mat_vec_f32_f32_f32_len, mul_mat_vec_f32_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2, i+1}, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32_f32", mul_mat_vec_f16_f32_f32_len, mul_mat_vec_f16_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F16 ][i], "mul_mat_vec_f16_f32_f32_"+std::to_string(i+1), mul_mat_vec_f16_f32_f32_len, mul_mat_vec_f16_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2, i+1}, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32_f32", mul_mat_vec_q4_0_f32_f32_len, mul_mat_vec_q4_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_0][i], "mul_mat_vec_q4_0_f32_f32_"+std::to_string(i+1), mul_mat_vec_q4_0_f32_f32_len, mul_mat_vec_q4_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32_f32", mul_mat_vec_q4_1_f32_f32_len, mul_mat_vec_q4_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_1][i], "mul_mat_vec_q4_1_f32_f32_"+std::to_string(i+1), mul_mat_vec_q4_1_f32_f32_len, mul_mat_vec_q4_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0][i], "mul_mat_vec_q5_0_f32_f32_"+std::to_string(i+1), mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1][i], "mul_mat_vec_q5_1_f32_f32_"+std::to_string(i+1), mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm, 1, 1}, {device->subgroup_size, 1*rm}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0][i], "mul_mat_vec_q8_0_f32_f32_"+std::to_string(i+1), mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm_stdq, 1, 1}, {device->subgroup_size, 1*rm_stdq, i+1}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f32_f32", mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K][i], "mul_mat_vec_q2_k_f32_f32_"+std::to_string(i+1), mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K][i], "mul_mat_vec_q3_k_f32_f32_"+std::to_string(i+1), mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K][i], "mul_mat_vec_q4_k_f32_f32_"+std::to_string(i+1), mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K][i], "mul_mat_vec_q5_k_f32_f32_"+std::to_string(i+1), mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K][i], "mul_mat_vec_q6_k_f32_f32_"+std::to_string(i+1), mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {subgroup_size_16, 2*rm}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f32_f32_"+std::to_string(i+1), mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq, i+1}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32", mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f16_f32_"+std::to_string(i+1), mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2, i+1}, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32", mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ][i], "mul_mat_vec_f16_f16_f32_"+std::to_string(i+1), mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2, i+1}, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f16_f32", mul_mat_vec_q4_0_f16_f32_len, mul_mat_vec_q4_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_0][i], "mul_mat_vec_q4_0_f16_f32_"+std::to_string(i+1), mul_mat_vec_q4_0_f16_f32_len, mul_mat_vec_q4_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f16_f32", mul_mat_vec_q4_1_f16_f32_len, mul_mat_vec_q4_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_1][i], "mul_mat_vec_q4_1_f16_f32_"+std::to_string(i+1), mul_mat_vec_q4_1_f16_f32_len, mul_mat_vec_q4_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0][i], "mul_mat_vec_q5_0_f16_f32_"+std::to_string(i+1), mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1][i], "mul_mat_vec_q5_1_f16_f32_"+std::to_string(i+1), mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq, i+1}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm, 1, 1}, {device->subgroup_size, 1*rm}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0][i], "mul_mat_vec_q8_0_f16_f32_"+std::to_string(i+1), mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm_stdq, 1, 1}, {device->subgroup_size, 1*rm_stdq, i+1}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f16_f32", mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K][i], "mul_mat_vec_q2_k_f16_f32_"+std::to_string(i+1), mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K][i], "mul_mat_vec_q3_k_f16_f32_"+std::to_string(i+1), mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K][i], "mul_mat_vec_q4_k_f16_f32_"+std::to_string(i+1), mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K][i], "mul_mat_vec_q5_k_f16_f32_"+std::to_string(i+1), mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K][i], "mul_mat_vec_q6_k_f16_f32_"+std::to_string(i+1), mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq, i+1}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {subgroup_size_16, 2*rm}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f16_f32_"+std::to_string(i+1), mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq, i+1}, 1, true);
}
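// The loop above registers one pipeline variant per supported column count: the count (i+1) is
// appended to the shader name and passed as the last specialization constant. For example,
// assuming mul_mat_vec_max_cols == 4, the Q4_0/f32 case yields mul_mat_vec_q4_0_f32_f32_1 ..
// mul_mat_vec_q4_0_f32_f32_4, and ggml_vk_get_dequantize_mul_mat_vec() later selects the variant
// by indexing with num_cols-1.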
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", mul_mat_vec_id_q4_0_f32_len, mul_mat_vec_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", mul_mat_vec_id_q4_0_f32_len, mul_mat_vec_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_f32", mul_mat_vec_id_q4_1_f32_len, mul_mat_vec_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_f32", mul_mat_vec_id_q4_1_f32_len, mul_mat_vec_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1*rm, 1, 1}, {device->subgroup_size, 1*rm}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq, 1, 1}, {device->subgroup_size, 1*rm_stdq}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm, 1, 1}, {subgroup_size_16, 2*rm}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq}, 1, true);
// dequant shaders // dequant shaders
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
@ -2012,11 +2021,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_sum_rows_f32, "sum_rows_f32", sum_rows_f32_len, sum_rows_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(device, device->pipeline_sum_rows_f32, "sum_rows_f32", sum_rows_f32_len, sum_rows_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32, "im2col_f32", im2col_f32_len, im2col_f32_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_im2col_f32, "im2col_f32", im2col_f32_len, im2col_f32_data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true);
if (device->float_controls_rte_fp16) { if (device->float_controls_rte_fp16) {
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_rte_len, im2col_f32_f16_rte_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_rte_len, im2col_f32_f16_rte_data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true);
} else { } else {
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_len, im2col_f32_f16_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_len, im2col_f32_f16_data, "main", 2, sizeof(vk_op_im2col_push_constants), {512, 1, 1}, { device->subgroup_size }, 1, true);
} }
ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1);
@ -2887,9 +2896,10 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
return ctx->device->fp16 ? ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f16acc : ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f32acc; return ctx->device->fp16 ? ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f16acc : ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f32acc;
} }
static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type) { static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type, uint32_t num_cols) {
VK_LOG_DEBUG("ggml_vk_get_dequantize_mul_mat_vec()"); VK_LOG_DEBUG("ggml_vk_get_dequantize_mul_mat_vec()");
GGML_ASSERT(b_type == GGML_TYPE_F32 || b_type == GGML_TYPE_F16); GGML_ASSERT(b_type == GGML_TYPE_F32 || b_type == GGML_TYPE_F16);
GGML_ASSERT(num_cols >= 1 && num_cols <= mul_mat_vec_max_cols);
switch (a_type) { switch (a_type) {
case GGML_TYPE_F32: case GGML_TYPE_F32:
@ -2910,7 +2920,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *
return nullptr; return nullptr;
} }
return b_type == GGML_TYPE_F32 ? ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[a_type] : ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[a_type]; return b_type == GGML_TYPE_F32 ? ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[a_type][num_cols-1] : ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[a_type][num_cols-1];
} }
static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type, ggml_prec prec) { static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type, ggml_prec prec) {
@ -3205,8 +3215,8 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
GGML_ABORT("fatal error"); GGML_ABORT("fatal error");
} }
// Check if src is pinned memory // Check if src is pinned memory
vk_buffer buf; vk_buffer buf = nullptr;
size_t buf_offset; size_t buf_offset = 0;
ggml_vk_host_get(ctx->device, tensor->data, buf, buf_offset); ggml_vk_host_get(ctx->device, tensor->data, buf, buf_offset);
const uint64_t ne0 = tensor->ne[0]; const uint64_t ne0 = tensor->ne[0];
@ -3269,7 +3279,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
VkBufferCopy buf_copy{ 0, offset, copy_size }; VkBufferCopy buf_copy{ 0, offset, copy_size };
ggml_vk_sync_buffers(subctx); ggml_vk_sync_buffers(subctx);
vkCmdCopyBuffer(subctx->s->buffer, staging->buffer, dst->buffer, 1, &buf_copy); vkCmdCopyBuffer(subctx->s->buffer, (VkBuffer)staging->buffer, (VkBuffer)dst->buffer, 1, &buf_copy);
for (uint64_t i3 = 0; i3 < ne3; i3++) { for (uint64_t i3 = 0; i3 < ne3; i3++) {
for (uint64_t i2 = 0; i2 < ne2; i2++) { for (uint64_t i2 = 0; i2 < ne2; i2++) {
@ -3302,7 +3312,7 @@ static void ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, siz
} }
// Check if src is pinned memory // Check if src is pinned memory
vk_buffer buf = nullptr; vk_buffer buf = nullptr;
size_t buf_offset; size_t buf_offset = 0;
ggml_vk_host_get(dst->device, src, buf, buf_offset); ggml_vk_host_get(dst->device, src, buf, buf_offset);
if (buf != nullptr) { if (buf != nullptr) {
@ -3344,7 +3354,7 @@ static void ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, siz
copy_size}; copy_size};
ggml_vk_sync_buffers(subctx); ggml_vk_sync_buffers(subctx);
vkCmdCopyBuffer(subctx->s->buffer, staging_buffer->buffer, dst->buffer, 1, &buf_copy); vkCmdCopyBuffer(subctx->s->buffer, (VkBuffer)staging_buffer->buffer, (VkBuffer)dst->buffer, 1, &buf_copy);
if (width == spitch) { if (width == spitch) {
deferred_memcpy((uint8_t *)staging_buffer->ptr, src, width * height, &subctx->in_memcpys); deferred_memcpy((uint8_t *)staging_buffer->ptr, src, width * height, &subctx->in_memcpys);
@ -3400,7 +3410,7 @@ static void ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size
// Check if dst is pinned memory // Check if dst is pinned memory
vk_buffer buf = nullptr; vk_buffer buf = nullptr;
size_t buf_offset; size_t buf_offset = 0;
ggml_vk_host_get(src->device, dst, buf, buf_offset); ggml_vk_host_get(src->device, dst, buf, buf_offset);
std::vector<vk::BufferCopy> slices(1); std::vector<vk::BufferCopy> slices(1);
@ -3480,7 +3490,7 @@ static void ggml_vk_buffer_copy_async(vk_context& ctx, vk_buffer& dst, size_t ds
VkBufferCopy bc{ src_offset, dst_offset, size }; VkBufferCopy bc{ src_offset, dst_offset, size };
vkCmdCopyBuffer(ctx->s->buffer, src->buffer, dst->buffer, 1, &bc); vkCmdCopyBuffer(ctx->s->buffer, (VkBuffer)src->buffer, (VkBuffer)dst->buffer, 1, &bc);
} }
static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) { static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
@ -3732,9 +3742,9 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context; ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context; ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
vk_buffer d_Qx; vk_buffer d_Qx = nullptr;
size_t qx_buf_offset = 0; size_t qx_buf_offset = 0;
vk_buffer d_Qy; vk_buffer d_Qy = nullptr;
size_t qy_buf_offset = 0; size_t qy_buf_offset = 0;
bool src0_uma = false; bool src0_uma = false;
@ -3920,8 +3930,6 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
const uint64_t ne12 = src1->ne[2]; const uint64_t ne12 = src1->ne[2];
const uint64_t ne13 = src1->ne[3]; const uint64_t ne13 = src1->ne[3];
GGML_ASSERT(ne11 == 1);
const uint64_t ne20 = dst->ne[0]; const uint64_t ne20 = dst->ne[0];
const uint64_t ne21 = dst->ne[1]; const uint64_t ne21 = dst->ne[1];
const uint64_t ne22 = dst->ne[2]; const uint64_t ne22 = dst->ne[2];
@ -3930,13 +3938,18 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
const uint64_t r2 = ne12 / ne02; const uint64_t r2 = ne12 / ne02;
const uint64_t r3 = ne13 / ne03; const uint64_t r3 = ne13 / ne03;
// batch_n indicates that we need to compute a few vector results, and this assumes
// ne12 and ne13 are 1. It overloads the batch_strides to hold the row strides.
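// For example, ne11 == 3 with ne12 == ne13 == 1 takes the batch_n path and produces three result
// columns, while ne11 == 1 with ne12*ne13 > 1 uses the regular batched path; other shapes are
// rejected by the assert below.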
GGML_ASSERT(ne11 == 1 || ne12 * ne13 == 1);
bool batch_n = ne11 > 1;
ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context; ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context; ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
vk_buffer d_Qx; vk_buffer d_Qx = nullptr;
size_t qx_buf_offset = 0; size_t qx_buf_offset = 0;
vk_buffer d_Qy; vk_buffer d_Qy = nullptr;
size_t qy_buf_offset = 0; size_t qy_buf_offset = 0;
bool src0_uma = false; bool src0_uma = false;
@ -3980,7 +3993,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
} else { } else {
to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type); to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
} }
vk_pipeline dmmv = ggml_vk_get_dequantize_mul_mat_vec(ctx, src0->type, src1->type); vk_pipeline dmmv = ggml_vk_get_dequantize_mul_mat_vec(ctx, src0->type, src1->type, ne11);
GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT
GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT
GGML_ASSERT(dmmv != nullptr); GGML_ASSERT(dmmv != nullptr);
@ -4052,8 +4065,10 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }); ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
} }
uint32_t stride_batch_x = ne00*ne01; // For batch_n, the A matrix is the same for each batch, and B/D use the row stride as the batch stride
uint32_t stride_batch_y = ne10*ne11; uint32_t stride_batch_x = batch_n ? 0 : ne00*ne01;
uint32_t stride_batch_y = batch_n ? ne10 : (ne10*ne11);
uint32_t stride_batch_d = batch_n ? ne20 : (ne20*ne21);
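// Illustration: with batch_n (say ne11 == 3, ne12 == ne13 == 1) the same A matrix is reused for
// every "batch", so stride_batch_x is 0, while B and D advance by one row (ne10 and ne20 elements)
// per batch; without batch_n the strides are the usual whole-matrix sizes ne10*ne11 and ne20*ne21.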
if (!ggml_vk_dim01_contiguous(src0) && !qx_needs_dequant) { if (!ggml_vk_dim01_contiguous(src0) && !qx_needs_dequant) {
stride_batch_x = src0->nb[0] / ggml_type_size(src0->type); stride_batch_x = src0->nb[0] / ggml_type_size(src0->type);
@ -4076,7 +4091,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
// compute // compute
const vk_mat_vec_push_constants pc = { const vk_mat_vec_push_constants pc = {
(uint32_t)ne00, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne01, (uint32_t)ne00, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne01,
stride_batch_x, stride_batch_y, (uint32_t)(ne20*ne21), stride_batch_x, stride_batch_y, stride_batch_d,
(uint32_t)ne02, (uint32_t)ne12, (uint32_t)r2, (uint32_t)r3, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)r2, (uint32_t)r3,
}; };
ggml_vk_sync_buffers(subctx); ggml_vk_sync_buffers(subctx);
@ -4112,7 +4127,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context; ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context; ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
vk_buffer d_Qy; vk_buffer d_Qy = nullptr;
size_t qy_buf_offset = 0; size_t qy_buf_offset = 0;
bool src1_uma = false; bool src1_uma = false;
@ -4256,7 +4271,10 @@ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, c
} else if (src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && dst->ne[1] == 1 && } else if (src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && dst->ne[1] == 1 &&
!ggml_is_permuted(src0) && !ggml_is_permuted(src1)) { !ggml_is_permuted(src0) && !ggml_is_permuted(src1)) {
ggml_vk_mul_mat_vec_nc_f16_f32(ctx, subctx, src0, src1, dst, dryrun); ggml_vk_mul_mat_vec_nc_f16_f32(ctx, subctx, src0, src1, dst, dryrun);
} else if (dst->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) { // mul_mat_vec supports batching ne12*ne13 when ne11==1, or treating ne11 as the batch size (up to four)
// when ne12 and ne13 are one.
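// For example, dst->ne[1] == 3 with src1->ne[2] == src1->ne[3] == 1 can use the multi-column
// mul_mat_vec path, while dst->ne[1] == 3 with src1->ne[2] == 2 falls through to the full
// matrix-matrix multiply below.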
} else if ((dst->ne[1] == 1 || (dst->ne[1] <= mul_mat_vec_max_cols && src1->ne[2] * src1->ne[3] == 1)) &&
(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) {
ggml_vk_mul_mat_vec_q_f16(ctx, subctx, src0, src1, dst, dryrun); ggml_vk_mul_mat_vec_q_f16(ctx, subctx, src0, src1, dst, dryrun);
} else { } else {
ggml_vk_mul_mat_q_f16(ctx, subctx, src0, src1, dst, dryrun); ggml_vk_mul_mat_q_f16(ctx, subctx, src0, src1, dst, dryrun);
@ -4300,11 +4318,11 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context; ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
ggml_backend_vk_buffer_context * ids_buf_ctx = (ggml_backend_vk_buffer_context *)ids->buffer->context; ggml_backend_vk_buffer_context * ids_buf_ctx = (ggml_backend_vk_buffer_context *)ids->buffer->context;
vk_buffer d_Qx; vk_buffer d_Qx = nullptr;
size_t qx_buf_offset = 0; size_t qx_buf_offset = 0;
vk_buffer d_Qy; vk_buffer d_Qy = nullptr;
size_t qy_buf_offset = 0; size_t qy_buf_offset = 0;
vk_buffer d_ids; vk_buffer d_ids = nullptr;
size_t ids_buf_offset = 0; size_t ids_buf_offset = 0;
bool src0_uma = false; bool src0_uma = false;
@ -4505,11 +4523,11 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context; ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
ggml_backend_vk_buffer_context * ids_buf_ctx = (ggml_backend_vk_buffer_context *)ids->buffer->context; ggml_backend_vk_buffer_context * ids_buf_ctx = (ggml_backend_vk_buffer_context *)ids->buffer->context;
vk_buffer d_Qx; vk_buffer d_Qx = nullptr;
size_t qx_buf_offset = 0; size_t qx_buf_offset = 0;
vk_buffer d_Qy; vk_buffer d_Qy = nullptr;
size_t qy_buf_offset = 0; size_t qy_buf_offset = 0;
vk_buffer d_ids; vk_buffer d_ids = nullptr;
size_t ids_buf_offset = 0; size_t ids_buf_offset = 0;
bool src0_uma = false; bool src0_uma = false;
@ -4768,8 +4786,8 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
ggml_vk_sync_buffers(subctx); ggml_vk_sync_buffers(subctx);
vk_buffer d_Q, d_K, d_V, d_D, d_M; vk_buffer d_Q = nullptr, d_K = nullptr, d_V = nullptr, d_D = nullptr, d_M = nullptr;
uint64_t q_buf_offset, k_buf_offset, v_buf_offset, d_buf_offset, m_buf_offset; size_t q_buf_offset = 0, k_buf_offset = 0, v_buf_offset = 0, d_buf_offset = 0, m_buf_offset = 0;
bool Q_uma = false, K_uma = false, V_uma = false, D_uma = false, M_uma = false; bool Q_uma = false, K_uma = false, V_uma = false, D_uma = false, M_uma = false;
@ -5071,6 +5089,57 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
} }
} }
static uint32_t get_misalign_bytes(ggml_backend_vk_context * ctx, const ggml_tensor * t)
{
return ((vk_tensor_offset(t) + t->view_offs) & (ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1));
}
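// minStorageBufferOffsetAlignment is a power of two, so the bitwise AND above is a cheap modulo.
// For example, assuming an alignment of 64 and an effective tensor offset of 100 bytes, this
// returns 100 & 63 == 36: the part of the offset that the aligned descriptor cannot express.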
template <typename T> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, T &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
GGML_UNUSED(p);
GGML_UNUSED(src0);
GGML_UNUSED(src1);
GGML_UNUSED(src2);
GGML_UNUSED(dst);
static_assert(!std::is_const<T>::value, "unexpected type");
GGML_ASSERT(!src0 || get_misalign_bytes(ctx, src0) == 0);
GGML_ASSERT(!src1 || get_misalign_bytes(ctx, src1) == 0);
GGML_ASSERT(!src2 || get_misalign_bytes(ctx, src2) == 0);
GGML_ASSERT(!dst || get_misalign_bytes(ctx, dst) == 0);
}
template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_unary_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
p.misalign_offsets = (a_offset << 16) | d_offset;
GGML_UNUSED(src1);
GGML_UNUSED(src2);
}
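// In the unary case above, both offsets are element counts (misalign bytes / type size) packed
// into a single 32-bit push constant: src0 in the high 16 bits, dst in the low 16 bits. E.g.
// a_offset == 9 and d_offset == 3 gives 0x00090003, which the shader unpacks via
// get_aoffset()/get_doffset().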
template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_binary_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
const uint32_t b_offset = get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type);
const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
GGML_ASSERT(dst->op != GGML_OP_GET_ROWS || (a_offset == 0 && b_offset == 0 && d_offset == 0));
p.misalign_offsets = (a_offset << 16) | (b_offset << 8) | d_offset;
GGML_UNUSED(src2);
}
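// The binary case packs three offsets into one word: src0 in bits 16..31, src1 in bits 8..15 and
// dst in bits 0..7, so the src1/dst misalignments must fit in 8 bits here. E.g. a_offset == 5,
// b_offset == 2 and d_offset == 1 gives 0x00050201; GET_ROWS additionally requires all three to
// be zero.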
template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_upscale_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
p.a_offset = a_offset;
p.d_offset = d_offset;
GGML_UNUSED(src1);
GGML_UNUSED(src2);
}
template<typename PC> template<typename PC>
static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, PC&& pc, bool dryrun = false) { static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, PC&& pc, bool dryrun = false) {
VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
@ -5174,8 +5243,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
} }
GGML_ASSERT(d_D != nullptr); GGML_ASSERT(d_D != nullptr);
uint64_t d_buf_offset = ((vk_tensor_offset(dst) + dst->view_offs) / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment; uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
GGML_ASSERT(d_buf_offset == vk_tensor_offset(dst) || op == GGML_OP_CPY); // NOLINT
if(!src0_uma) { if(!src0_uma) {
d_X = src0_buf_ctx->dev_buffer; d_X = src0_buf_ctx->dev_buffer;
x_buf_offset = vk_tensor_offset(src0) + src0->view_offs; x_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
@ -5191,6 +5259,12 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
z_buf_offset = vk_tensor_offset(src2) + src2->view_offs; z_buf_offset = vk_tensor_offset(src2) + src2->view_offs;
GGML_ASSERT(d_Z != nullptr); GGML_ASSERT(d_Z != nullptr);
} }
// Compute misalignment offset for descriptors and store it in push constants, then align the descriptor offsets.
init_pushconst_tensor_offsets(ctx, pc, src0, src1, src2, dst);
x_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
y_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
z_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
d_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
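// Aligning down with &= ~(align-1) keeps the descriptor offsets valid: assuming a 64-byte
// alignment, an offset of 100 becomes 64, and the 36 bytes that were dropped are exactly the
// misalignment recorded in the push constants by init_pushconst_tensor_offsets() above.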
if (op_supports_incontiguous) { if (op_supports_incontiguous) {
x_sz = ggml_nbytes(src0); x_sz = ggml_nbytes(src0);
@ -5378,7 +5452,6 @@ static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const
const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t src1_type_size = ggml_type_size(src1->type);
const uint32_t dst_type_size = ggml_type_size(dst->type); const uint32_t dst_type_size = ggml_type_size(dst->type);
const uint32_t d_offset = ((vk_tensor_offset(dst) + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
int nb1 = dst->op_params[0] / 4; // 4 bytes of float32 int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
int nb2 = dst->op_params[1] / 4; // 4 bytes of float32 int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
@ -5390,7 +5463,7 @@ static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t)src0->nb[3] / src0_type_size, (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t)src0->nb[3] / src0_type_size,
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t) dst->nb[3] / dst_type_size, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t) dst->nb[3] / dst_type_size,
d_offset, 0,
0.0f, 0.0f, offset, 0.0f, 0.0f, offset,
}, dryrun); }, dryrun);
} }
@ -5474,8 +5547,8 @@ static void ggml_vk_op_f32_rwkv6(ggml_backend_vk_context * ctx, vk_context& subc
ggml_vk_sync_buffers(subctx); ggml_vk_sync_buffers(subctx);
vk_buffer d_D, d_K, d_V, d_R, d_TF, d_TD, d_State; vk_buffer d_D = nullptr, d_K = nullptr, d_V = nullptr, d_R = nullptr, d_TF = nullptr, d_TD = nullptr, d_State = nullptr;
uint64_t k_offset, v_offset, r_offset, tf_offset, td_offset, state_offset, dst_offset; size_t k_offset = 0, v_offset = 0, r_offset = 0, tf_offset = 0, td_offset = 0, state_offset = 0, dst_offset = 0;
bool K_uma = false, V_uma = false, R_uma = false, TF_uma = false, TD_uma = false, STATE_uma = false, DST_uma = false; bool K_uma = false, V_uma = false, R_uma = false, TF_uma = false, TD_uma = false, STATE_uma = false, DST_uma = false;
if (ctx->device->uma) { if (ctx->device->uma) {
@ -5594,7 +5667,7 @@ static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, c
const float sf3 = (float)dst->ne[3] / src0->ne[3]; const float sf3 = (float)dst->ne[3] / src0->ne[3];
ggml_vk_op_f32<vk_op_upscale_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UPSCALE, { ggml_vk_op_f32<vk_op_upscale_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UPSCALE, {
(uint32_t)ggml_nelements(dst), 0, (uint32_t)ggml_nelements(dst), 0, 0,
(uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
(uint32_t)dst->ne[0], (uint32_t)dst->ne[1], (uint32_t)dst->ne[2],(uint32_t)dst->ne[3], (uint32_t)dst->ne[0], (uint32_t)dst->ne[1], (uint32_t)dst->ne[2],(uint32_t)dst->ne[3],
sf0, sf1, sf2, sf3, sf0, sf1, sf2, sf3,
@ -5704,13 +5777,12 @@ static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, co
static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t dst_type_size = ggml_type_size(dst->type); const uint32_t dst_type_size = ggml_type_size(dst->type);
const uint32_t d_offset = ((vk_tensor_offset(dst) + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, { ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
(uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src0),
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
d_offset, 0,
0.0f, 0.0f, 0.0f, 0.0f,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
}, dryrun); }, dryrun);


@ -21,9 +21,9 @@ void main() {
get_indices(idx, i00, i01, i02, i03); get_indices(idx, i00, i01, i02, i03);
if (ox < p.ne10 && oy < p.ne11 && oz < p.ne12) { if (ox < p.ne10 && oy < p.ne11 && oz < p.ne12) {
data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[ox + oy * p.ne10 + oz * p.ne10 * p.ne11])); data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[get_boffset() + ox + oy * p.ne10 + oz * p.ne10 * p.ne11]));
} else { } else {
data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)])); data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]));
} }
} }


@ -22,7 +22,7 @@ void main() {
uint i00, i01, i02, i03; uint i00, i01, i02, i03;
get_indices(idx, i00, i01, i02, i03); get_indices(idx, i00, i01, i02, i03);
data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[src1_idx(i00, i01, i02, i03)])); data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[get_boffset() + src1_idx(i00, i01, i02, i03)]));
idx += num_threads; idx += num_threads;
} }


@ -12,6 +12,6 @@ void main() {
return; return;
} }
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]); const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val)); data_d[get_doffset() + dst_idx(idx)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val));
} }


@ -30,12 +30,12 @@ void main() {
const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03; const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;
#ifndef OPTIMIZATION_ERROR_WORKAROUND #ifndef OPTIMIZATION_ERROR_WORKAROUND
data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : data_b[src1_idx]); data_d[get_doffset() + dst_idx] = D_TYPE(is_src0 ? data_a[get_aoffset() + src0_idx] : data_b[get_boffset() + src1_idx]);
#else #else
if (is_src0) { if (is_src0) {
data_d[p.d_offset + dst_idx] = data_a[src0_idx]; data_d[get_doffset() + dst_idx] = data_a[get_aoffset() + src0_idx];
} else { } else {
data_d[p.d_offset + dst_idx] = data_b[src1_idx]; data_d[get_doffset() + dst_idx] = data_b[get_boffset() + src1_idx];
} }
#endif #endif
} }


@ -19,9 +19,9 @@ void main() {
if (idx + (num_iter-1)*num_threads < p.ne) { if (idx + (num_iter-1)*num_threads < p.ne) {
[[unroll]] for (uint i = 0; i < num_iter; ++i) { [[unroll]] for (uint i = 0; i < num_iter; ++i) {
#ifndef OPTIMIZATION_ERROR_WORKAROUND #ifndef OPTIMIZATION_ERROR_WORKAROUND
data_d[p.d_offset + idx] = D_TYPE(data_a[idx]); data_d[get_doffset() + idx] = D_TYPE(data_a[get_aoffset() + idx]);
#else #else
data_d[p.d_offset + idx] = data_a[idx]; data_d[get_doffset() + idx] = data_a[get_aoffset() + idx];
#endif #endif
idx += num_threads; idx += num_threads;
} }
@ -32,9 +32,9 @@ void main() {
} }
#ifndef OPTIMIZATION_ERROR_WORKAROUND #ifndef OPTIMIZATION_ERROR_WORKAROUND
data_d[p.d_offset + idx] = D_TYPE(data_a[idx]); data_d[get_doffset() + idx] = D_TYPE(data_a[get_aoffset() + idx]);
#else #else
data_d[p.d_offset + idx] = data_a[idx]; data_d[get_doffset() + idx] = data_a[get_aoffset() + idx];
#endif #endif
idx += num_threads; idx += num_threads;
} }


@ -13,8 +13,8 @@ void main() {
} }
#ifndef OPTIMIZATION_ERROR_WORKAROUND #ifndef OPTIMIZATION_ERROR_WORKAROUND
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(data_a[src0_idx(idx)]); data_d[get_doffset() + dst_idx(idx)] = D_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
#else #else
data_d[p.d_offset + dst_idx(idx)] = data_a[src0_idx(idx)]; data_d[get_doffset() + dst_idx(idx)] = data_a[get_aoffset() + src0_idx(idx)];
#endif #endif
} }


@ -12,6 +12,6 @@ void main() {
return; return;
} }
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]); const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(cos(val)); data_d[get_doffset() + dst_idx(idx)] = D_TYPE(cos(val));
} }


@ -10,9 +10,10 @@ float16_t dequantFuncQ4_0(const in decodeBufQ4_0 bl, const in uint blockCoords[2
const float16_t d = bl.block.d; const float16_t d = bl.block.d;
const uint idx = coordInBlock[1]; const uint idx = coordInBlock[1];
const uint shift = (idx & 0x10) >> 2; const uint shift = (idx & 0x10) >> 2;
uint32_t qs = unpack8(uint32_t(bl.block.qs[(idx & 0xE) >> 1]))[idx & 1]; uint32_t qs = uint32_t(bl.block.qs[(idx & 0xE) >> 1]);
qs >>= shift; qs >>= shift;
qs &= 0xF; qs &= 0x0F0F;
qs = unpack8(qs)[idx & 1];
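// The element loaded above is 16 bits wide and holds two quantized bytes (four nibbles): the
// shift selects the low or high nibble of each byte, 0x0F0F keeps one nibble per byte, and
// unpack8()[idx & 1] picks the byte belonging to this element. E.g. qs == 0xA5C3 with shift == 4
// becomes 0x0A5C, then 0x0A0C, and finally 0x0C or 0x0A depending on idx & 1.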
float16_t ret = (float16_t(qs) - float16_t(8)) * d; float16_t ret = (float16_t(qs) - float16_t(8)) * d;
return ret; return ret;
} }
@ -152,15 +153,17 @@ layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4
block_q4_K block; block_q4_K block;
}; };
layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4_K_packed16 {
block_q4_K_packed16 block;
};
float16_t dequantFuncQ4_K(const in decodeBufQ4_K bl, const in uint blockCoords[2], const in uint coordInBlock[2]) float16_t dequantFuncQ4_K(const in decodeBufQ4_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{ {
decodeBufQ4_K_packed16 bl16 = decodeBufQ4_K_packed16(bl);
const uint idx = coordInBlock[1]; const uint idx = coordInBlock[1];
const uint iqs = idx;
const uint n = iqs / 64; // 0,1,2,3 const uint b = (idx & 0x20) >> 5; // 0,1
const uint b = (iqs % 64) / 32; // 0,1
const uint is = (idx & 0xE0) >> 5; // 0..7 const uint is = (idx & 0xE0) >> 5; // 0..7
const uint qsi = n * 32 + (iqs % 32); // 0..127
const f16vec2 loadd = bl.block.d; const f16vec2 loadd = bl.block.d;
@ -184,9 +187,11 @@ float16_t dequantFuncQ4_K(const in decodeBufQ4_K bl, const in uint blockCoords[2
const float16_t d = loadd.x * float16_t(sc); const float16_t d = loadd.x * float16_t(sc);
const float16_t m = loadd.y * float16_t(mbyte); const float16_t m = loadd.y * float16_t(mbyte);
uint32_t dmask = 0xF << (b * 4); uint qs = uint32_t(bl16.block.qs[((idx & 0xC0) >> 2) + ((idx & 0x1E) >> 1)]);
qs = (qs >> (b * 4)) & 0x0F0F;
qs = unpack8(qs)[idx & 1];
float16_t ret = d * float16_t((bl.block.qs[qsi ] & dmask) >> (b * 4)) - m; float16_t ret = d * float16_t(qs) - m;
return ret; return ret;
} }
@ -195,18 +200,19 @@ layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ5
block_q5_K block; block_q5_K block;
}; };
layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ5_K_packed16 {
block_q5_K_packed16 block;
};
float16_t dequantFuncQ5_K(const in decodeBufQ5_K bl, const in uint blockCoords[2], const in uint coordInBlock[2]) float16_t dequantFuncQ5_K(const in decodeBufQ5_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{ {
decodeBufQ5_K_packed16 bl16 = decodeBufQ5_K_packed16(bl);
const uint idx = coordInBlock[1]; const uint idx = coordInBlock[1];
const uint iqs = idx;
const uint n = iqs / 64; // 0,1,2,3 const uint b = (idx & 0x20) >> 5; // 0,1
const uint b = (iqs % 64) / 32; // 0,1
const uint is = (idx & 0xE0) >> 5; // 0..7 const uint is = (idx & 0xE0) >> 5; // 0..7
const uint qsi = n * 32 + (iqs % 32); // 0..127
const uint qhi = (iqs % 32); // 0..31
const uint8_t hm = uint8_t(1 << (iqs / 32)); const uint32_t hm = 0x0101 << is;
const f16vec2 loadd = bl.block.d; const f16vec2 loadd = bl.block.d;
@ -230,9 +236,15 @@ float16_t dequantFuncQ5_K(const in decodeBufQ5_K bl, const in uint blockCoords[2
const float16_t d = loadd.x * float16_t(sc); const float16_t d = loadd.x * float16_t(sc);
const float16_t m = loadd.y * float16_t(mbyte); const float16_t m = loadd.y * float16_t(mbyte);
uint32_t dmask = 0xF << (b * 4); uint qh = uint32_t(bl16.block.qh[(idx & 0x1E) >> 1]);
qh = qh & hm;
qh = unpack8(qh)[idx & 1];
float16_t ret = d * (float16_t((bl.block.qs[qsi ] & dmask) >> (b * 4)) + float16_t((bl.block.qh[qhi ] & hm) != 0 ? 16 : 0)) - m; uint qs = uint32_t(bl16.block.qs[((idx & 0xC0) >> 2) + ((idx & 0x1E) >> 1)]);
qs = (qs >> (b * 4)) & 0x0F0F;
qs = unpack8(qs)[idx & 1];
float16_t ret = d * (float16_t(qs) + (qh != 0 ? float16_t(16) : float16_t(0))) - m;
return ret; return ret;
} }
@ -241,22 +253,30 @@ layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ6_
block_q6_K block; block_q6_K block;
}; };
layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ6_K_packed16 {
block_q6_K_packed16 block;
};
float16_t dequantFuncQ6_K(const in decodeBufQ6_K bl, const in uint blockCoords[2], const in uint coordInBlock[2]) float16_t dequantFuncQ6_K(const in decodeBufQ6_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{ {
decodeBufQ6_K_packed16 bl16 = decodeBufQ6_K_packed16(bl);
const uint idx = coordInBlock[1]; const uint idx = coordInBlock[1];
const uint iqs = idx;
const uint n = iqs / 128; // 0,1 const uint b = (idx & 0x40) >> 6; // 0,1
const uint b = (iqs % 128) / 64; // 0,1 const uint qhshift = (idx & 0x60) >> 4; // 0,2,4,6
const uint is_b = (iqs % 32) / 16; // 0,1 const uint is = (idx & 0xF0) >> 4; // 0..15
const uint qhshift = ((iqs % 128) / 32) * 2;// 0,2,4,6
const uint is = 8 * n + qhshift + is_b; // 0..15
const uint qsi = n * 64 + (iqs % 64); // 0..127
const uint qhi = n * 32 + (iqs % 32); // 0..63
const float16_t dscale = bl.block.d * float16_t(bl.block.scales[is]); const float16_t dscale = bl.block.d * float16_t(bl.block.scales[is]);
float16_t ret = dscale * float16_t(int8_t(((bl.block.ql[qsi ] >> (b * 4)) & 0xF) | (((bl.block.qh[qhi ] >> qhshift) & 3) << 4)) - 32); uint ql = uint32_t(bl16.block.ql[((idx & 0x80) >> 2) + ((idx & 0x3E) >> 1)]);
ql = (ql >> (b * 4)) & 0x0F0F;
uint qh = uint32_t(bl16.block.qh[((idx & 0x80) >> 3) + ((idx & 0x1E) >> 1)]);
qh = ((qh >> qhshift) & 0x0303) << 4;
int q = unpack8(ql | qh)[idx & 1];
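// q combines the low nibble from ql with two bits from qh shifted into bit positions 4..5,
// giving a 6-bit value in 0..63; subtracting 32 below recenters it to the signed range -32..31
// before scaling.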
float16_t ret = dscale * float16_t(q - 32);
return ret; return ret;
} }


@ -20,7 +20,7 @@ void main() {
uint i00, i01, i02, i03; uint i00, i01, i02, i03;
get_indices(idx, i00, i01, i02, i03); get_indices(idx, i00, i01, i02, i03);
data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]) / FLOAT_TYPE(data_b[src1_idx(i00, i01, i02, i03)])); data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) / FLOAT_TYPE(data_b[get_boffset() + src1_idx(i00, i01, i02, i03)]));
idx += num_threads; idx += num_threads;
} }


@ -7,7 +7,7 @@ layout (push_constant) uniform parameter
uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03; uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03;
uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13; uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
uint ne20; uint ne21; uint ne22; uint ne23; uint nb20; uint nb21; uint nb22; uint nb23; uint ne20; uint ne21; uint ne22; uint ne23; uint nb20; uint nb21; uint nb22; uint nb23;
uint d_offset; uint misalign_offsets;
float param1; float param2; int param3; float param1; float param2; int param3;
} p; } p;
@ -22,6 +22,10 @@ uint get_idx() {
return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
} }
uint get_aoffset() { return p.misalign_offsets >> 16; }
uint get_boffset() { return (p.misalign_offsets >> 8) & 0xFF; }
uint get_doffset() { return p.misalign_offsets & 0xFF; }
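// Layout of p.misalign_offsets, filled in by init_pushconst_tensor_offsets() on the host:
// bits 16..31 = src0 element offset, bits 8..15 = src1, bits 0..7 = dst.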
// mod and div are expensive and coordinates/dimensions are often power of 2 or equal to 1 // mod and div are expensive and coordinates/dimensions are often power of 2 or equal to 1
uint fastmod(uint a, uint b) { uint fastmod(uint a, uint b) {
if ((b & (b-1)) == 0) { if ((b & (b-1)) == 0) {


@ -6,7 +6,7 @@ layout (push_constant) uniform parameter
uint ne; uint ne;
uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03; uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03;
uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13; uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
uint d_offset; uint misalign_offsets;
float param1; float param2; float param1; float param2;
uint ne0_012mp; uint ne0_012L; uint ne0_012mp; uint ne0_012L;
@ -24,6 +24,9 @@ uint get_idx() {
return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
} }
uint get_aoffset() { return p.misalign_offsets >> 16; }
uint get_doffset() { return p.misalign_offsets & 0xFFFF; }
// see init_fastdiv_values in ggml-vulkan.cpp // see init_fastdiv_values in ggml-vulkan.cpp
uint fastdiv(uint n, uint mp, uint L) { uint fastdiv(uint n, uint mp, uint L) {
uint msbs, lsbs; uint msbs, lsbs;


@ -15,10 +15,10 @@ void main() {
return; return;
} }
const uint i01 = data_b[i10*p.nb10 + i11*p.nb11 + i12*p.nb12]; const uint i01 = data_b[get_boffset() + i10*p.nb10 + i11*p.nb11 + i12*p.nb12];
const uint a_offset = i01*p.nb01 + i11*p.nb02 + i12*p.nb03; const uint a_offset = get_aoffset() + i01*p.nb01 + i11*p.nb02 + i12*p.nb03;
const uint d_offset = i10*p.nb21 + i11*p.nb22 + i12*p.nb23; const uint d_offset = get_doffset() + i10*p.nb21 + i11*p.nb22 + i12*p.nb23;
#ifndef OPTIMIZATION_ERROR_WORKAROUND #ifndef OPTIMIZATION_ERROR_WORKAROUND
data_d[d_offset + i00] = D_TYPE(data_a[a_offset + i00]); data_d[d_offset + i00] = D_TYPE(data_a[a_offset + i00]);


@ -2,6 +2,7 @@
#extension GL_EXT_shader_16bit_storage : require #extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_spirv_intrinsics: enable #extension GL_EXT_spirv_intrinsics: enable
#extension GL_EXT_control_flow_attributes : require
#if RTE16 #if RTE16
spirv_execution_mode(capabilities = [4467], 4462, 16); // RoundingModeRTE, 16 bits spirv_execution_mode(capabilities = [4467], 4462, 16); // RoundingModeRTE, 16 bits
@ -23,40 +24,64 @@ layout (push_constant) uniform parameter
#include "types.comp" #include "types.comp"
#define BLOCK_SIZE 256 layout(constant_id = 0) const uint BLOCK_SIZE = 32;
layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in; const uint NUM_ITER = 512 / BLOCK_SIZE;
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
void main() { void main() {
const uint i = gl_GlobalInvocationID.x; const uint gidx = gl_GlobalInvocationID.x;
if (i >= p.pelements) {
return; const uint oh = gl_GlobalInvocationID.y;
const uint batch = gl_GlobalInvocationID.z / p.IC;
const uint ic = gl_GlobalInvocationID.z % p.IC;
A_TYPE values[NUM_ITER];
uint offset_dst[NUM_ITER];
[[unroll]] for (uint idx = 0; idx < NUM_ITER; ++idx) {
values[idx] = A_TYPE(0);
} }
[[unroll]] for (uint idx = 0; idx < NUM_ITER; ++idx) {
const uint i = gidx * NUM_ITER + idx;
const uint ksize = p.OW * (p.KH > 1 ? p.KW : 1); const uint ksize = p.OW * (p.KH > 1 ? p.KW : 1);
const uint kx = i / ksize; const uint kx = i / ksize;
const uint kd = kx * ksize; const uint kd = kx * ksize;
const uint ky = (i - kd) / p.OW; const uint ky = (i - kd) / p.OW;
const uint ix = i % p.OW; const uint ix = i % p.OW;
const uint oh = gl_GlobalInvocationID.y;
const uint batch = gl_GlobalInvocationID.z / p.IC;
const uint ic = gl_GlobalInvocationID.z % p.IC;
const uint iiw = ix * p.s0 + kx * p.d0 - p.p0; const uint iiw = ix * p.s0 + kx * p.d0 - p.p0;
const uint iih = oh * p.s1 + ky * p.d1 - p.p1; const uint iih = oh * p.s1 + ky * p.d1 - p.p1;
const uint offset_dst = offset_dst[idx] =
((batch * p.OH + oh) * p.OW + ix) * p.CHW + ((batch * p.OH + oh) * p.OW + ix) * p.CHW +
(ic * (p.KW * p.KH) + ky * p.KW + kx); (ic * (p.KW * p.KH) + ky * p.KW + kx);
if (iih < 0 || iih >= p.IH || iiw < 0 || iiw >= p.IW) { if (i >= p.pelements) {
data_d[offset_dst] = D_TYPE(0.0f); continue;
} else {
const uint offset_src = ic * p.offset_delta + batch * p.batch_offset;
data_d[offset_dst] = D_TYPE(data_a[offset_src + iih * p.IW + iiw]);
} }
if (iih < p.IH && iiw < p.IW) {
const uint offset_src = ic * p.offset_delta + batch * p.batch_offset;
values[idx] = data_a[offset_src + iih * p.IW + iiw];
}
}
[[unroll]] for (uint idx = 0; idx < NUM_ITER; ++idx) {
const uint i = gidx * NUM_ITER + idx;
if (i >= p.pelements) {
continue;
}
data_d[offset_dst[idx]] = D_TYPE(values[idx]);
}
} }

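The im2col rewrite above makes BLOCK_SIZE a specialization constant and has each invocation produce NUM_ITER = 512 / BLOCK_SIZE outputs: one unrolled pass computes the destination offsets and gathers the source values, guarded by an unsigned iih < p.IH && iiw < p.IW test (coordinates that wrapped around from negative values fail it too and just leave the zero-initialized default), and a second unrolled pass writes the results. A rough CPU-side sketch of that gather-then-store pattern, with stand-in index math and made-up names:

#include <cstdint>

// Illustrative only: each "invocation" produces NUM_ITER outputs, gathering the inputs in
// one pass (unsigned bounds test) and storing them in a second, mirroring the shader above.
constexpr uint32_t BLOCK_SIZE = 32;               // specialization constant in the shader
constexpr uint32_t NUM_ITER   = 512 / BLOCK_SIZE; // outputs per invocation

void gather_then_store(const float * src, float * dst,
                       uint32_t gidx, uint32_t src_size, uint32_t pelements) {
    float    values[NUM_ITER]     = {};
    uint32_t offset_dst[NUM_ITER] = {};

    for (uint32_t idx = 0; idx < NUM_ITER; ++idx) {   // gather pass
        const uint32_t i = gidx * NUM_ITER + idx;
        if (i >= pelements) {
            continue;
        }
        offset_dst[idx] = i;                          // stand-in for the shader's destination index math
        const uint32_t src_idx = i;                   // likewise a stand-in for iih * IW + iiw
        if (src_idx < src_size) {                     // one unsigned compare covers "negative" and "too large"
            values[idx] = src[src_idx];
        }
    }
    for (uint32_t idx = 0; idx < NUM_ITER; ++idx) {   // store pass
        const uint32_t i = gidx * NUM_ITER + idx;
        if (i >= pelements) {
            continue;
        }
        dst[offset_dst[idx]] = values[idx];
    }
}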

@ -20,7 +20,7 @@ void main() {
uint i00, i01, i02, i03; uint i00, i01, i02, i03;
get_indices(idx, i00, i01, i02, i03); get_indices(idx, i00, i01, i02, i03);
data_d[p.d_offset + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(i00, i01, i02, i03)]) * FLOAT_TYPE(data_b[src1_idx(i00, i01, i02, i03)])); data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) * FLOAT_TYPE(data_b[get_boffset() + src1_idx(i00, i01, i02, i03)]));
idx += num_threads; idx += num_threads;
} }


@ -9,9 +9,6 @@
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
layout (constant_id = 0) const uint BLOCK_SIZE = 32;
layout (constant_id = 1) const uint NUM_ROWS = 1;
#if !defined(DATA_A_F32) && !defined(DATA_A_F16) #if !defined(DATA_A_F32) && !defined(DATA_A_F16)
#define K_PER_ITER 8 #define K_PER_ITER 8
#else #else
@ -21,23 +18,22 @@ layout (constant_id = 1) const uint NUM_ROWS = 1;
uint a_offset, b_offset, d_offset, y_offset; uint a_offset, b_offset, d_offset, y_offset;
shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE]; void iter(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const uint num_rows, const uint tid, const uint i, bool lastiter)
void iter(inout FLOAT_TYPE temp[NUM_ROWS], const uint first_row, const uint num_rows, const uint tid, const uint i, bool lastiter)
{ {
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
const uint col = i*BLOCK_SIZE + K_PER_ITER*tid; const uint col = i*BLOCK_SIZE + K_PER_ITER*tid;
const uint iqs = (col%QUANT_K)/QUANT_R; // quant index const uint iqs = (col%QUANT_K)/QUANT_R; // quant index
const uint iybs = col - col%QUANT_K; // y block start index const uint iybs = col - col%QUANT_K; // y block start index
#if K_PER_ITER == 8 #if K_PER_ITER == 8
#if QUANT_R == 2 #if QUANT_R == 2
const B_TYPE_VEC4 bv02 = data_b_v4[(b_offset + iybs + iqs) / 4]; const B_TYPE_VEC4 bv02 = data_b_v4[(j*p.batch_stride_b + b_offset + iybs + iqs) / 4];
const B_TYPE_VEC4 bv13 = data_b_v4[(b_offset + iybs + iqs + y_offset) / 4]; const B_TYPE_VEC4 bv13 = data_b_v4[(j*p.batch_stride_b + b_offset + iybs + iqs + y_offset) / 4];
const vec4 bv0 = vec4(bv02.x, bv13.x, bv02.y, bv13.y); const vec4 bv0 = vec4(bv02.x, bv13.x, bv02.y, bv13.y);
const vec4 bv1 = vec4(bv02.z, bv13.z, bv02.w, bv13.w); const vec4 bv1 = vec4(bv02.z, bv13.z, bv02.w, bv13.w);
#else #else
const vec4 bv0 = vec4(data_b_v4[(b_offset + iybs + iqs) / 4]); const vec4 bv0 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + iybs + iqs) / 4]);
const vec4 bv1 = vec4(data_b_v4[(b_offset + iybs + iqs) / 4 + 1]); const vec4 bv1 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + iybs + iqs) / 4 + 1]);
#endif #endif
#else #else
// Check if the second of the pair of elements is OOB, and don't fetch B or // Check if the second of the pair of elements is OOB, and don't fetch B or
@ -48,9 +44,9 @@ void iter(inout FLOAT_TYPE temp[NUM_ROWS], const uint first_row, const uint num_
const bool OOB = lastiter && (iybs + iqs + y_offset >= p.ncols); const bool OOB = lastiter && (iybs + iqs + y_offset >= p.ncols);
FLOAT_TYPE b0 = 0, b1 = 0; FLOAT_TYPE b0 = 0, b1 = 0;
b0 = FLOAT_TYPE(data_b[b_offset + iybs + iqs]); b0 = FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs]);
if (!OOB) { if (!OOB) {
b1 = FLOAT_TYPE(data_b[b_offset + iybs + iqs + y_offset]); b1 = FLOAT_TYPE(data_b[j*p.batch_stride_b + b_offset + iybs + iqs + y_offset]);
} }
#endif #endif
uint ibi = first_row*p.ncols; uint ibi = first_row*p.ncols;
@ -75,17 +71,18 @@ void iter(inout FLOAT_TYPE temp[NUM_ROWS], const uint first_row, const uint num_
if (dm.y == 0) if (dm.y == 0)
rowtmp *= dm.x; rowtmp *= dm.x;
temp[n] += rowtmp; temp[j][n] += rowtmp;
#else #else
const vec2 v = dequantize(ib, iqs, a_offset); const vec2 v = dequantize(ib, iqs, a_offset);
// matrix multiplication // matrix multiplication
temp[n] = fma(FLOAT_TYPE(v.x), b0, temp[n]); temp[j][n] = fma(FLOAT_TYPE(v.x), b0, temp[j][n]);
if (!OOB) { if (!OOB) {
temp[n] = fma(FLOAT_TYPE(v.y), b1, temp[n]); temp[j][n] = fma(FLOAT_TYPE(v.y), b1, temp[j][n]);
} }
#endif #endif
} }
}
} }
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
@ -96,10 +93,12 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
y_offset = QUANT_R == 1 ? 1 : QUANT_K/2; y_offset = QUANT_R == 1 ? 1 : QUANT_K/2;
FLOAT_TYPE temp[NUM_ROWS]; FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
for (uint i = 0; i < NUM_ROWS; ++i) { [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
temp[i] = FLOAT_TYPE(0); [[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
temp[j][i] = FLOAT_TYPE(0);
}
} }
uint num_iters = p.ncols / (K_PER_ITER * BLOCK_SIZE); uint num_iters = p.ncols / (K_PER_ITER * BLOCK_SIZE);
@ -131,24 +130,7 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
i++; i++;
} }
// sum up partial sums and write back result reduce_result(temp, d_offset, first_row, num_rows, tid);
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
tmpsh[n][tid] = temp[n];
}
barrier();
[[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) {
if (tid < s) {
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
tmpsh[n][tid] += tmpsh[n][tid + s];
}
}
barrier();
}
if (tid == 0) {
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
data_d[d_offset + first_row + n] = D_TYPE(tmpsh[n][0]);
}
}
} }
void main() { void main() {


@ -83,3 +83,36 @@ void get_offsets(out uint a_offset, out uint b_offset, out uint d_offset) {
batch_idx * p.batch_stride_d; batch_idx * p.batch_stride_d;
#endif #endif
} }
layout (constant_id = 0) const uint BLOCK_SIZE = 32;
layout (constant_id = 1) const uint NUM_ROWS = 1;
layout (constant_id = 2) const uint NUM_COLS = 1;
shared FLOAT_TYPE tmpsh[NUM_COLS][NUM_ROWS][BLOCK_SIZE];
void reduce_result(const in FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const in uint32_t d_offset, const in uint32_t first_row, const in uint32_t num_rows, const in uint32_t tid) {
// sum up partial sums and write back result
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
tmpsh[j][n][tid] = temp[j][n];
}
}
barrier();
[[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) {
if (tid < s) {
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
tmpsh[j][n][tid] += tmpsh[j][n][tid + s];
}
}
}
barrier();
}
if (tid == 0) {
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
data_d[j*p.batch_stride_d + d_offset + first_row + n] = D_TYPE(tmpsh[j][n][0]);
}
}
}
}

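reduce_result above factors the shared-memory reduction out of the individual mul_mat_vec shaders: every thread deposits its partial sums for all NUM_COLS x num_rows outputs into tmpsh, the active range is halved step by step with a barrier between steps, and thread 0 finally writes one result per column, spaced batch_stride_d apart. A simplified CPU analogue of that tree reduction for a single output (a sketch assuming a power-of-two thread count, not the shader itself):

#include <cstddef>
#include <vector>

// Simplified CPU analogue of the tree reduction in reduce_result: tmpsh holds one partial
// sum per "thread"; the active range is halved each step until index 0 holds the total.
float reduce_partial_sums(std::vector<float> tmpsh) {
    const size_t block_size = tmpsh.size();          // assumed non-empty and a power of two, like BLOCK_SIZE
    for (size_t s = block_size / 2; s > 0; s >>= 1) {
        for (size_t tid = 0; tid < s; ++tid) {       // in the shader this runs across threads, with a barrier per step
            tmpsh[tid] += tmpsh[tid + s];
        }
    }
    return tmpsh[0];
}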

@ -5,22 +5,11 @@
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
layout (constant_id = 0) const uint BLOCK_SIZE = 32; void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
shared FLOAT_TYPE tmp[BLOCK_SIZE];
void main() {
const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
if (row >= p.stride_d) {
return;
}
uint a_offset, b_offset, d_offset; uint a_offset, b_offset, d_offset;
get_offsets(a_offset, b_offset, d_offset); get_offsets(a_offset, b_offset, d_offset);
const uint num_blocks_per_row = p.ncols / QUANT_K; const uint num_blocks_per_row = p.ncols / QUANT_K;
const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row;
// 16 threads are used to process each block // 16 threads are used to process each block
const uint it_size = gl_WorkGroupSize.x/16; const uint it_size = gl_WorkGroupSize.x/16;
@ -38,24 +27,23 @@ void main() {
const uint s_offset = 8*v_im; const uint s_offset = 8*v_im;
const uint y_offset = 128*v_im + l0; const uint y_offset = 128*v_im + l0;
FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
temp[j][i] = FLOAT_TYPE(0);
}
}
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) { [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
const uint y_idx = i * QUANT_K + y_offset; const uint y_idx = i * QUANT_K + y_offset;
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
f16vec2 d = data_a[ib0 + i].d; f16vec2 d = data_a[ib0 + i].d;
const FLOAT_TYPE dall = d.x; const FLOAT_TYPE dall = d.x;
const FLOAT_TYPE dmin = d.y; const FLOAT_TYPE dmin = d.y;
B_TYPE_VEC2 b0 = data_b_v2[(b_offset + y_idx) / 2 + 0];
B_TYPE_VEC2 b16 = data_b_v2[(b_offset + y_idx) / 2 + 8];
B_TYPE_VEC2 b32 = data_b_v2[(b_offset + y_idx) / 2 + 16];
B_TYPE_VEC2 b48 = data_b_v2[(b_offset + y_idx) / 2 + 24];
B_TYPE_VEC2 b64 = data_b_v2[(b_offset + y_idx) / 2 + 32];
B_TYPE_VEC2 b80 = data_b_v2[(b_offset + y_idx) / 2 + 40];
B_TYPE_VEC2 b96 = data_b_v2[(b_offset + y_idx) / 2 + 48];
B_TYPE_VEC2 b112 = data_b_v2[(b_offset + y_idx) / 2 + 56];
uint32_t s0_u32 = data_a_packed32[ib0 + i].scales[s_offset / 4 + 0]; uint32_t s0_u32 = data_a_packed32[ib0 + i].scales[s_offset / 4 + 0];
uint32_t s4_u32 = data_a_packed32[ib0 + i].scales[s_offset / 4 + 1]; uint32_t s4_u32 = data_a_packed32[ib0 + i].scales[s_offset / 4 + 1];
@ -74,6 +62,16 @@ void main() {
uvec2 qs0 = uvec2(unpack8(qs0_u16)); uvec2 qs0 = uvec2(unpack8(qs0_u16));
uvec2 qs16 = uvec2(unpack8(qs16_u16)); uvec2 qs16 = uvec2(unpack8(qs16_u16));
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
B_TYPE_VEC2 b0 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 0];
B_TYPE_VEC2 b16 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 8];
B_TYPE_VEC2 b32 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 16];
B_TYPE_VEC2 b48 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 24];
B_TYPE_VEC2 b64 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 32];
B_TYPE_VEC2 b80 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 40];
B_TYPE_VEC2 b96 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 48];
B_TYPE_VEC2 b112 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 56];
FLOAT_TYPE sum1 = FLOAT_TYPE(0.0); FLOAT_TYPE sum1 = FLOAT_TYPE(0.0);
FLOAT_TYPE sum2 = FLOAT_TYPE(0.0); FLOAT_TYPE sum2 = FLOAT_TYPE(0.0);
[[unroll]] for (int l = 0; l < 2; ++l) { [[unroll]] for (int l = 0; l < 2; ++l) {
@ -94,20 +92,24 @@ void main() {
fma(FLOAT_TYPE(b96[l]), FLOAT_TYPE(s4_hi4[2]), fma(FLOAT_TYPE(b96[l]), FLOAT_TYPE(s4_hi4[2]),
fma(FLOAT_TYPE(b112[l]), FLOAT_TYPE(s4_hi4[3]), sum2)))))))); fma(FLOAT_TYPE(b112[l]), FLOAT_TYPE(s4_hi4[3]), sum2))))))));
} }
temp = fma(dall, sum1, fma(-dmin, sum2, temp)); temp[j][n] = fma(dall, sum1, fma(-dmin, sum2, temp[j][n]));
}
}
} }
tmp[gl_LocalInvocationID.x] = temp; reduce_result(temp, d_offset, first_row, num_rows, tid);
}
// sum up partial sums and write back result void main() {
barrier(); const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
[[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) {
if (tid < s) { // do NUM_ROWS at a time, unless there aren't enough remaining rows
tmp[tid] += tmp[tid + s]; if (first_row + NUM_ROWS <= p.stride_d) {
compute_outputs(first_row, NUM_ROWS);
} else {
if (first_row >= p.stride_d) {
return;
} }
barrier(); compute_outputs(first_row, p.stride_d - first_row);
}
if (tid == 0) {
data_d[d_offset + row] = D_TYPE(tmp[0]);
} }
} }

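Each converted mul_mat_vec variant now ends with the same main(): the workgroup index selects a tile of NUM_ROWS output rows starting at first_row, full tiles call compute_outputs(first_row, NUM_ROWS), and the final partial tile falls back to compute_outputs(first_row, p.stride_d - first_row). With NUM_ROWS = 2 and p.stride_d = 5, for example, workgroups 0 and 1 cover rows 0-1 and 2-3 and workgroup 2 covers the single remaining row. The loop below restates that dispatch on the CPU (illustrative only; the names are placeholders):

#include <cstdint>

// Illustrative restatement of the tail handling in main() above: num_rows_total plays the
// role of p.stride_d and num_rows_per_wg the per-workgroup row tile (NUM_ROWS).
void dispatch_row_tiles(uint32_t num_rows_total, uint32_t num_rows_per_wg,
                        void (*compute_outputs)(uint32_t first_row, uint32_t num_rows)) {
    for (uint32_t wg = 0; ; ++wg) {                                    // one iteration per workgroup
        const uint32_t first_row = num_rows_per_wg * wg;
        if (first_row >= num_rows_total) {
            return;                                                    // nothing left to do
        }
        if (first_row + num_rows_per_wg <= num_rows_total) {
            compute_outputs(first_row, num_rows_per_wg);               // full tile
        } else {
            compute_outputs(first_row, num_rows_total - first_row);    // partial tail tile
        }
    }
}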

@ -5,22 +5,11 @@
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
layout (constant_id = 0) const uint BLOCK_SIZE = 32; void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
shared FLOAT_TYPE tmp[BLOCK_SIZE];
void main() {
const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
if (row >= p.stride_d) {
return;
}
uint a_offset, b_offset, d_offset; uint a_offset, b_offset, d_offset;
get_offsets(a_offset, b_offset, d_offset); get_offsets(a_offset, b_offset, d_offset);
const uint num_blocks_per_row = p.ncols / QUANT_K; const uint num_blocks_per_row = p.ncols / QUANT_K;
const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row;
// 16 threads are used to process each block // 16 threads are used to process each block
const uint it_size = gl_WorkGroupSize.x/16; const uint it_size = gl_WorkGroupSize.x/16;
@ -39,24 +28,23 @@ void main() {
const uint q_offset = 32*v_im + l0; const uint q_offset = 32*v_im + l0;
const uint y_offset = 128*v_im + l0; const uint y_offset = 128*v_im + l0;
FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
temp[j][i] = FLOAT_TYPE(0);
}
}
const uint s_shift = 4 * v_im; const uint s_shift = 4 * v_im;
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) { [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
const uint y_idx = i * QUANT_K + y_offset; const uint y_idx = i * QUANT_K + y_offset;
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d); const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
B_TYPE_VEC2 b0 = data_b_v2[(b_offset + y_idx) / 2 + 0];
B_TYPE_VEC2 b16 = data_b_v2[(b_offset + y_idx) / 2 + 8];
B_TYPE_VEC2 b32 = data_b_v2[(b_offset + y_idx) / 2 + 16];
B_TYPE_VEC2 b48 = data_b_v2[(b_offset + y_idx) / 2 + 24];
B_TYPE_VEC2 b64 = data_b_v2[(b_offset + y_idx) / 2 + 32];
B_TYPE_VEC2 b80 = data_b_v2[(b_offset + y_idx) / 2 + 40];
B_TYPE_VEC2 b96 = data_b_v2[(b_offset + y_idx) / 2 + 48];
B_TYPE_VEC2 b112 = data_b_v2[(b_offset + y_idx) / 2 + 56];
uint16_t s0_16 = data_a_packed16[ib0 + i].scales[0]; uint16_t s0_16 = data_a_packed16[ib0 + i].scales[0];
uint16_t s2_16 = data_a_packed16[ib0 + i].scales[1]; uint16_t s2_16 = data_a_packed16[ib0 + i].scales[1];
uint16_t s4_16 = data_a_packed16[ib0 + i].scales[2]; uint16_t s4_16 = data_a_packed16[ib0 + i].scales[2];
@ -70,6 +58,17 @@ void main() {
u8vec2 s8 = unpack8(s8_16); u8vec2 s8 = unpack8(s8_16);
u8vec2 s10 = unpack8(s10_16); u8vec2 s10 = unpack8(s10_16);
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
B_TYPE_VEC2 b0 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 0];
B_TYPE_VEC2 b16 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 8];
B_TYPE_VEC2 b32 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 16];
B_TYPE_VEC2 b48 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 24];
B_TYPE_VEC2 b64 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 32];
B_TYPE_VEC2 b80 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 40];
B_TYPE_VEC2 b96 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 48];
B_TYPE_VEC2 b112 = data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 56];
FLOAT_TYPE sum = FLOAT_TYPE(0.0); FLOAT_TYPE sum = FLOAT_TYPE(0.0);
[[unroll]] for (int l = 0; l < 2; ++l) { [[unroll]] for (int l = 0; l < 2; ++l) {
sum = fma(FLOAT_TYPE(b0[l]) * FLOAT_TYPE(int8_t(((s0[0] >> s_shift) & 0xF) | ((s8[0] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 0)) != 0) ? 0 : 4)), sum = fma(FLOAT_TYPE(b0[l]) * FLOAT_TYPE(int8_t(((s0[0] >> s_shift) & 0xF) | ((s8[0] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 0)) != 0) ? 0 : 4)),
@ -81,20 +80,24 @@ void main() {
fma(FLOAT_TYPE(b80[l]) * FLOAT_TYPE(int8_t(((s4[1] >> s_shift) & 0xF) | ((s8[1] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 2)) != 0) ? 0 : 4)), fma(FLOAT_TYPE(b80[l]) * FLOAT_TYPE(int8_t(((s4[1] >> s_shift) & 0xF) | ((s8[1] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 2)) != 0) ? 0 : 4)),
fma(FLOAT_TYPE(b112[l]) * FLOAT_TYPE(int8_t(((s6[1] >> s_shift) & 0xF) | ((s10[1] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 3)) != 0) ? 0 : 4)), sum)))))))); fma(FLOAT_TYPE(b112[l]) * FLOAT_TYPE(int8_t(((s6[1] >> s_shift) & 0xF) | ((s10[1] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 3)) != 0) ? 0 : 4)), sum))))))));
} }
temp = fma(d, sum, temp); temp[j][n] = fma(d, sum, temp[j][n]);
}
}
} }
tmp[gl_LocalInvocationID.x] = temp; reduce_result(temp, d_offset, first_row, num_rows, tid);
}
// sum up partial sums and write back result void main() {
barrier(); const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
[[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) {
if (tid < s) { // do NUM_ROWS at a time, unless there aren't enough remaining rows
tmp[tid] += tmp[tid + s]; if (first_row + NUM_ROWS <= p.stride_d) {
compute_outputs(first_row, NUM_ROWS);
} else {
if (first_row >= p.stride_d) {
return;
} }
barrier(); compute_outputs(first_row, p.stride_d - first_row);
}
if (tid == 0) {
data_d[d_offset + row] = D_TYPE(tmp[0]);
} }
} }


@ -6,22 +6,11 @@
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
layout (constant_id = 0) const uint BLOCK_SIZE = 32; void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
shared FLOAT_TYPE tmp[BLOCK_SIZE];
void main() {
const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
if (row >= p.stride_d) {
return;
}
uint a_offset, b_offset, d_offset; uint a_offset, b_offset, d_offset;
get_offsets(a_offset, b_offset, d_offset); get_offsets(a_offset, b_offset, d_offset);
const uint num_blocks_per_row = p.ncols / QUANT_K; const uint num_blocks_per_row = p.ncols / QUANT_K;
const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row;
// 16 threads are used to process each block // 16 threads are used to process each block
const uint it_size = gl_WorkGroupSize.x/16; const uint it_size = gl_WorkGroupSize.x/16;
@ -42,12 +31,20 @@ void main() {
const uint q_offset = 32*v_im + l0; const uint q_offset = 32*v_im + l0;
const uint y_offset = 64*v_im + l0; const uint y_offset = 64*v_im + l0;
FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
temp[j][i] = FLOAT_TYPE(0);
}
}
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) { [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
const uint y1_idx = i * QUANT_K + y_offset; const uint y1_idx = i * QUANT_K + y_offset;
const uint y2_idx = y1_idx + 128; const uint y2_idx = y1_idx + 128;
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
f16vec2 d = data_a[ib0 + i].d; f16vec2 d = data_a[ib0 + i].d;
const FLOAT_TYPE dall = FLOAT_TYPE(d.x); const FLOAT_TYPE dall = FLOAT_TYPE(d.x);
const FLOAT_TYPE dmin = FLOAT_TYPE(d.y); const FLOAT_TYPE dmin = FLOAT_TYPE(d.y);
@ -98,10 +95,11 @@ void main() {
const uint32_t q4_14 = qs64_hi4.z; const uint32_t q4_14 = qs64_hi4.z;
const uint32_t q4_15 = qs64_hi4.w; const uint32_t q4_15 = qs64_hi4.w;
B_TYPE_VEC4 by10 = data_b_v4[(b_offset + y1_idx) / 4]; [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
B_TYPE_VEC4 by132 = data_b_v4[(b_offset + y1_idx) / 4 + 8]; B_TYPE_VEC4 by10 = data_b_v4[(j*p.batch_stride_b + b_offset + y1_idx) / 4];
B_TYPE_VEC4 by20 = data_b_v4[(b_offset + y2_idx) / 4]; B_TYPE_VEC4 by132 = data_b_v4[(j*p.batch_stride_b + b_offset + y1_idx) / 4 + 8];
B_TYPE_VEC4 by232 = data_b_v4[(b_offset + y2_idx) / 4 + 8]; B_TYPE_VEC4 by20 = data_b_v4[(j*p.batch_stride_b + b_offset + y2_idx) / 4];
B_TYPE_VEC4 by232 = data_b_v4[(j*p.batch_stride_b + b_offset + y2_idx) / 4 + 8];
const FLOAT_TYPE sx = fma(FLOAT_TYPE(by10.x), q4_0, fma(FLOAT_TYPE(by10.y), q4_1, fma(FLOAT_TYPE(by10.z), q4_2, FLOAT_TYPE(by10.w) * q4_3))); const FLOAT_TYPE sx = fma(FLOAT_TYPE(by10.x), q4_0, fma(FLOAT_TYPE(by10.y), q4_1, fma(FLOAT_TYPE(by10.z), q4_2, FLOAT_TYPE(by10.w) * q4_3)));
const FLOAT_TYPE sy = fma(FLOAT_TYPE(by132.x), q4_4, fma(FLOAT_TYPE(by132.y), q4_5, fma(FLOAT_TYPE(by132.z), q4_6, FLOAT_TYPE(by132.w) * q4_7))); const FLOAT_TYPE sy = fma(FLOAT_TYPE(by132.x), q4_4, fma(FLOAT_TYPE(by132.y), q4_5, fma(FLOAT_TYPE(by132.z), q4_6, FLOAT_TYPE(by132.w) * q4_7)));
@ -112,20 +110,24 @@ void main() {
fma(FLOAT_TYPE(by10.y), sc2, fma(FLOAT_TYPE(by132.y), sc3, fma(FLOAT_TYPE(by20.y), sc6, fma(FLOAT_TYPE(by232.y), sc7, fma(FLOAT_TYPE(by10.y), sc2, fma(FLOAT_TYPE(by132.y), sc3, fma(FLOAT_TYPE(by20.y), sc6, fma(FLOAT_TYPE(by232.y), sc7,
fma(FLOAT_TYPE(by10.z), sc2, fma(FLOAT_TYPE(by132.z), sc3, fma(FLOAT_TYPE(by20.z), sc6, fma(FLOAT_TYPE(by232.z), sc7, fma(FLOAT_TYPE(by10.z), sc2, fma(FLOAT_TYPE(by132.z), sc3, fma(FLOAT_TYPE(by20.z), sc6, fma(FLOAT_TYPE(by232.z), sc7,
fma(FLOAT_TYPE(by10.w), sc2, fma(FLOAT_TYPE(by132.w), sc3, fma(FLOAT_TYPE(by20.w), sc6, FLOAT_TYPE(by232.w) * sc7))))))))))))))); fma(FLOAT_TYPE(by10.w), sc2, fma(FLOAT_TYPE(by132.w), sc3, fma(FLOAT_TYPE(by20.w), sc6, FLOAT_TYPE(by232.w) * sc7)))))))))))))));
temp = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp)); temp[j][n] = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp[j][n]));
}
}
} }
tmp[gl_LocalInvocationID.x] = temp; reduce_result(temp, d_offset, first_row, num_rows, tid);
}
// sum up partial sums and write back result void main() {
barrier(); const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
[[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) {
if (tid < s) { // do NUM_ROWS at a time, unless there aren't enough remaining rows
tmp[tid] += tmp[tid + s]; if (first_row + NUM_ROWS <= p.stride_d) {
compute_outputs(first_row, NUM_ROWS);
} else {
if (first_row >= p.stride_d) {
return;
} }
barrier(); compute_outputs(first_row, p.stride_d - first_row);
}
if (tid == 0) {
data_d[d_offset + row] = D_TYPE(tmp[0]);
} }
} }


@ -6,22 +6,11 @@
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
layout (constant_id = 0) const uint BLOCK_SIZE = 32; void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
shared FLOAT_TYPE tmp[BLOCK_SIZE];
void main() {
const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
if (row >= p.stride_d) {
return;
}
uint a_offset, b_offset, d_offset; uint a_offset, b_offset, d_offset;
get_offsets(a_offset, b_offset, d_offset); get_offsets(a_offset, b_offset, d_offset);
const uint num_blocks_per_row = p.ncols / QUANT_K; const uint num_blocks_per_row = p.ncols / QUANT_K;
const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row;
// 16 threads are used to process each block // 16 threads are used to process each block
const uint it_size = gl_WorkGroupSize.x/16; const uint it_size = gl_WorkGroupSize.x/16;
@ -39,12 +28,20 @@ void main() {
const uint q_offset = 32*v_im + l0; const uint q_offset = 32*v_im + l0;
const uint y_offset = 64*v_im + l0; const uint y_offset = 64*v_im + l0;
FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
temp[j][i] = FLOAT_TYPE(0);
}
}
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) { [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
const uint y1_idx = i * QUANT_K + y_offset; const uint y1_idx = i * QUANT_K + y_offset;
const uint y2_idx = y1_idx + 128; const uint y2_idx = y1_idx + 128;
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
f16vec2 d = data_a[ib0 + i].d; f16vec2 d = data_a[ib0 + i].d;
const FLOAT_TYPE dall = FLOAT_TYPE(d.x); const FLOAT_TYPE dall = FLOAT_TYPE(d.x);
const FLOAT_TYPE dmin = FLOAT_TYPE(d.y); const FLOAT_TYPE dmin = FLOAT_TYPE(d.y);
@ -107,14 +104,15 @@ void main() {
const uint32_t q4_14 = qs64_80_hi4.z; const uint32_t q4_14 = qs64_80_hi4.z;
const uint32_t q4_15 = qs64_80_hi4.w; const uint32_t q4_15 = qs64_80_hi4.w;
B_TYPE_VEC2 by10 = data_b_v2[(b_offset + y1_idx) / 2]; [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
B_TYPE_VEC2 by116 = data_b_v2[(b_offset + y1_idx) / 2 + 8]; B_TYPE_VEC2 by10 = data_b_v2[(j*p.batch_stride_b + b_offset + y1_idx) / 2];
B_TYPE_VEC2 by132 = data_b_v2[(b_offset + y1_idx) / 2 + 16]; B_TYPE_VEC2 by116 = data_b_v2[(j*p.batch_stride_b + b_offset + y1_idx) / 2 + 8];
B_TYPE_VEC2 by148 = data_b_v2[(b_offset + y1_idx) / 2 + 24]; B_TYPE_VEC2 by132 = data_b_v2[(j*p.batch_stride_b + b_offset + y1_idx) / 2 + 16];
B_TYPE_VEC2 by20 = data_b_v2[(b_offset + y2_idx) / 2]; B_TYPE_VEC2 by148 = data_b_v2[(j*p.batch_stride_b + b_offset + y1_idx) / 2 + 24];
B_TYPE_VEC2 by216 = data_b_v2[(b_offset + y2_idx) / 2 + 8]; B_TYPE_VEC2 by20 = data_b_v2[(j*p.batch_stride_b + b_offset + y2_idx) / 2];
B_TYPE_VEC2 by232 = data_b_v2[(b_offset + y2_idx) / 2 + 16]; B_TYPE_VEC2 by216 = data_b_v2[(j*p.batch_stride_b + b_offset + y2_idx) / 2 + 8];
B_TYPE_VEC2 by248 = data_b_v2[(b_offset + y2_idx) / 2 + 24]; B_TYPE_VEC2 by232 = data_b_v2[(j*p.batch_stride_b + b_offset + y2_idx) / 2 + 16];
B_TYPE_VEC2 by248 = data_b_v2[(j*p.batch_stride_b + b_offset + y2_idx) / 2 + 24];
const FLOAT_TYPE sx = const FLOAT_TYPE sx =
fma(FLOAT_TYPE(by10.x), q4_0, fma(FLOAT_TYPE(by10.x), q4_0,
@ -141,20 +139,24 @@ void main() {
fma(FLOAT_TYPE(by132.x) + FLOAT_TYPE(by132.y) + FLOAT_TYPE(by148.x) + FLOAT_TYPE(by148.y), sc3, fma(FLOAT_TYPE(by132.x) + FLOAT_TYPE(by132.y) + FLOAT_TYPE(by148.x) + FLOAT_TYPE(by148.y), sc3,
fma(FLOAT_TYPE(by20.x) + FLOAT_TYPE(by20.y) + FLOAT_TYPE(by216.x) + FLOAT_TYPE(by216.y), sc6, fma(FLOAT_TYPE(by20.x) + FLOAT_TYPE(by20.y) + FLOAT_TYPE(by216.x) + FLOAT_TYPE(by216.y), sc6,
(FLOAT_TYPE(by232.x) + FLOAT_TYPE(by232.y) + FLOAT_TYPE(by248.x) + FLOAT_TYPE(by248.y)) * sc7))); (FLOAT_TYPE(by232.x) + FLOAT_TYPE(by232.y) + FLOAT_TYPE(by248.x) + FLOAT_TYPE(by248.y)) * sc7)));
temp = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp)); temp[j][n] = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp[j][n]));
}
}
} }
tmp[gl_LocalInvocationID.x] = temp; reduce_result(temp, d_offset, first_row, num_rows, tid);
}
// sum up partial sums and write back result void main() {
barrier(); const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
[[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) {
if (tid < s) { // do NUM_ROWS at a time, unless there aren't enough remaining rows
tmp[tid] += tmp[tid + s]; if (first_row + NUM_ROWS <= p.stride_d) {
compute_outputs(first_row, NUM_ROWS);
} else {
if (first_row >= p.stride_d) {
return;
} }
barrier(); compute_outputs(first_row, p.stride_d - first_row);
}
if (tid == 0) {
data_d[d_offset + row] = D_TYPE(tmp[0]);
} }
} }


@ -6,22 +6,11 @@
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
layout (constant_id = 0) const uint BLOCK_SIZE = 32; void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
shared FLOAT_TYPE tmp[BLOCK_SIZE];
void main() {
const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
if (row >= p.stride_d) {
return;
}
uint a_offset, b_offset, d_offset; uint a_offset, b_offset, d_offset;
get_offsets(a_offset, b_offset, d_offset); get_offsets(a_offset, b_offset, d_offset);
const uint num_blocks_per_row = p.ncols / QUANT_K; const uint num_blocks_per_row = p.ncols / QUANT_K;
const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row;
// 16 threads are used to process each block // 16 threads are used to process each block
const uint it_size = gl_WorkGroupSize.x/16; const uint it_size = gl_WorkGroupSize.x/16;
@ -42,11 +31,19 @@ void main() {
const uint s_offset = 8*v_im + is; const uint s_offset = 8*v_im + is;
const uint y_offset = 128*v_im + l0; const uint y_offset = 128*v_im + l0;
FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
temp[j][i] = FLOAT_TYPE(0);
}
}
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) { [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
const uint y_idx = i * QUANT_K + y_offset; const uint y_idx = i * QUANT_K + y_offset;
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d); const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
FLOAT_TYPE scales[4]; FLOAT_TYPE scales[4];
@ -79,10 +76,11 @@ void main() {
uvec4 q2 = uvec4(unpack8(q2_u32)); uvec4 q2 = uvec4(unpack8(q2_u32));
uvec4 q3 = uvec4(unpack8(q3_u32)); uvec4 q3 = uvec4(unpack8(q3_u32));
B_TYPE_VEC4 by0 = data_b_v4[(b_offset + y_idx) / 4]; [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
B_TYPE_VEC4 by32 = data_b_v4[(b_offset + y_idx) / 4 + 8]; B_TYPE_VEC4 by0 = data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4];
B_TYPE_VEC4 by64 = data_b_v4[(b_offset + y_idx) / 4 + 16]; B_TYPE_VEC4 by32 = data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 8];
B_TYPE_VEC4 by96 = data_b_v4[(b_offset + y_idx) / 4 + 24]; B_TYPE_VEC4 by64 = data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 16];
B_TYPE_VEC4 by96 = data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 24];
FLOAT_TYPE sum = FLOAT_TYPE(0.0); FLOAT_TYPE sum = FLOAT_TYPE(0.0);
[[unroll]] for (int l = 0; l < 4; ++l) { [[unroll]] for (int l = 0; l < 4; ++l) {
@ -91,20 +89,24 @@ void main() {
fma(FLOAT_TYPE(by64[l]) * scales[2], FLOAT_TYPE(int8_t(q2[l]) - 32), fma(FLOAT_TYPE(by64[l]) * scales[2], FLOAT_TYPE(int8_t(q2[l]) - 32),
fma(FLOAT_TYPE(by96[l]) * scales[3], FLOAT_TYPE(int8_t(q3[l]) - 32), sum)))); fma(FLOAT_TYPE(by96[l]) * scales[3], FLOAT_TYPE(int8_t(q3[l]) - 32), sum))));
} }
temp += sum * d; temp[j][n] += sum * d;
}
}
} }
tmp[gl_LocalInvocationID.x] = temp; reduce_result(temp, d_offset, first_row, num_rows, tid);
// sum up partial sums and write back result }
barrier(); void main() {
[[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) { const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);
if (tid < s) {
tmp[tid] += tmp[tid + s]; // do NUM_ROWS at a time, unless there aren't enough remaining rows
if (first_row + NUM_ROWS <= p.stride_d) {
compute_outputs(first_row, NUM_ROWS);
} else {
if (first_row >= p.stride_d) {
return;
} }
barrier(); compute_outputs(first_row, p.stride_d - first_row);
}
if (tid == 0) {
data_d[d_offset + row] = D_TYPE(tmp[0]);
} }
} }


@ -24,5 +24,5 @@ void main() {
const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03; const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;
data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : 0.0f); data_d[get_doffset() + dst_idx] = D_TYPE(is_src0 ? data_a[get_aoffset() + src0_idx] : 0.0f);
} }


@ -22,5 +22,5 @@ void main() {
return; return;
} }
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(data_a[src0_idx_mod(idx)]); data_d[get_doffset() + dst_idx(idx)] = D_TYPE(data_a[get_aoffset() + src0_idx_mod(idx)]);
} }


@ -18,7 +18,7 @@ void main() {
continue; continue;
} }
data_d[p.d_offset + idx] = D_TYPE(FLOAT_TYPE(data_a[idx]) * FLOAT_TYPE(p.param1)); data_d[get_doffset() + idx] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + idx]) * FLOAT_TYPE(p.param1));
idx += num_threads; idx += num_threads;
} }
} }


@ -12,6 +12,6 @@ void main() {
return; return;
} }
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]); const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(sin(val)); data_d[get_doffset() + dst_idx(idx)] = D_TYPE(sin(val));
} }


@ -12,6 +12,6 @@ void main() {
return; return;
} }
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]); const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val * val); data_d[get_doffset() + dst_idx(idx)] = D_TYPE(val * val);
} }


@ -2,7 +2,7 @@
layout (push_constant) uniform parameter layout (push_constant) uniform parameter
{ {
uint ne; uint d_offset; uint ne; uint a_offset; uint d_offset;
uint nb00; uint nb01; uint nb02; uint nb03; uint nb00; uint nb01; uint nb02; uint nb03;
uint ne10; uint ne11; uint ne12; uint ne13; uint ne10; uint ne11; uint ne12; uint ne13;
float sf0; float sf1; float sf2; float sf3; float sf0; float sf1; float sf2; float sf3;
@ -32,5 +32,5 @@ void main() {
const uint i02 = uint(i12 / p.sf2); const uint i02 = uint(i12 / p.sf2);
const uint i03 = uint(i13 / p.sf3); const uint i03 = uint(i13 / p.sf3);
data_d[p.d_offset + idx] = D_TYPE(data_a[i03 * p.nb03 + i02 * p.nb02 + i01 * p.nb01 + i00 * p.nb00]); data_d[p.d_offset + idx] = D_TYPE(data_a[p.a_offset + i03 * p.nb03 + i02 * p.nb02 + i01 * p.nb01 + i00 * p.nb00]);
} }


@ -78,7 +78,8 @@ void execute_command(const std::string& command, std::string& stdout_str, std::s
} }
PROCESS_INFORMATION pi; PROCESS_INFORMATION pi;
STARTUPINFOA si = { sizeof(STARTUPINFOA) }; STARTUPINFOA si = {};
si.cb = sizeof(STARTUPINFOA);
si.dwFlags = STARTF_USESTDHANDLES; si.dwFlags = STARTF_USESTDHANDLES;
si.hStdOutput = stdout_write; si.hStdOutput = stdout_write;
si.hStdError = stderr_write; si.hStdError = stderr_write;


@ -221,6 +221,7 @@ class GGUFType:
class MODEL_ARCH(IntEnum): class MODEL_ARCH(IntEnum):
LLAMA = auto() LLAMA = auto()
DECI = auto()
FALCON = auto() FALCON = auto()
BAICHUAN = auto() BAICHUAN = auto()
GROK = auto() GROK = auto()
@ -402,6 +403,7 @@ class MODEL_TENSOR(IntEnum):
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.LLAMA: "llama", MODEL_ARCH.LLAMA: "llama",
MODEL_ARCH.DECI: "deci",
MODEL_ARCH.FALCON: "falcon", MODEL_ARCH.FALCON: "falcon",
MODEL_ARCH.BAICHUAN: "baichuan", MODEL_ARCH.BAICHUAN: "baichuan",
MODEL_ARCH.GROK: "grok", MODEL_ARCH.GROK: "grok",
@ -602,6 +604,26 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP, MODEL_TENSOR.FFN_UP_EXP,
], ],
MODEL_ARCH.DECI: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.ATTN_ROT_EMBD,
MODEL_TENSOR.FFN_GATE_INP,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.FFN_GATE_EXP,
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
],
MODEL_ARCH.GROK: [ MODEL_ARCH.GROK: [
MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT_NORM,
@ -1448,6 +1470,10 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_ROT_EMBD, MODEL_TENSOR.ATTN_ROT_EMBD,
], ],
MODEL_ARCH.DECI: [
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_ROT_EMBD,
],
MODEL_ARCH.BAICHUAN: [ MODEL_ARCH.BAICHUAN: [
MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_ROT_EMBD, MODEL_TENSOR.ATTN_ROT_EMBD,


@ -198,6 +198,7 @@ class TensorNameMap:
"transformer.h.{bid}.self_attention.dense", # falcon "transformer.h.{bid}.self_attention.dense", # falcon
"h.{bid}.self_attention.dense", # bloom "h.{bid}.self_attention.dense", # bloom
"model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe olmo2 "model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe olmo2
"model.layers.{bid}.self_attn.linear_attn", # deci
"layers.{bid}.attention.wo", # llama-pth "layers.{bid}.attention.wo", # llama-pth
"encoder.layer.{bid}.attention.output.dense", # bert "encoder.layer.{bid}.attention.output.dense", # bert
"transformer.h.{bid}.attn.out_proj", # gpt-j "transformer.h.{bid}.attn.out_proj", # gpt-j


@ -126,6 +126,8 @@ connection = sqlite3.connect(input_file)
cursor = connection.cursor() cursor = connection.cursor()
builds = cursor.execute("SELECT DISTINCT build_commit FROM test;").fetchall() builds = cursor.execute("SELECT DISTINCT build_commit FROM test;").fetchall()
commit_short_len = len(builds[0][0])
try: try:
repo = git.Repo(".", search_parent_directories=True) repo = git.Repo(".", search_parent_directories=True)
except git.InvalidGitRepositoryError: except git.InvalidGitRepositoryError:
@ -138,11 +140,11 @@ def find_parent_in_data(commit: git.Commit):
seen_hexsha8 = set() seen_hexsha8 = set()
while heap: while heap:
depth, current_commit = heapq.heappop(heap) depth, current_commit = heapq.heappop(heap)
current_hexsha8 = commit.hexsha[:8] current_hexsha8 = commit.hexsha[:commit_short_len]
if (current_hexsha8,) in builds: if (current_hexsha8,) in builds:
return current_hexsha8 return current_hexsha8
for parent in commit.parents: for parent in commit.parents:
parent_hexsha8 = parent.hexsha[:8] parent_hexsha8 = parent.hexsha[:commit_short_len]
if parent_hexsha8 not in seen_hexsha8: if parent_hexsha8 not in seen_hexsha8:
seen_hexsha8.add(parent_hexsha8) seen_hexsha8.add(parent_hexsha8)
heapq.heappush(heap, (depth + 1, parent)) heapq.heappush(heap, (depth + 1, parent))
@ -156,9 +158,9 @@ def get_all_parent_hexsha8s(commit: git.Commit):
while unvisited: while unvisited:
current_commit = unvisited.pop(0) current_commit = unvisited.pop(0)
visited.append(current_commit.hexsha[:8]) visited.append(current_commit.hexsha[:commit_short_len])
for parent in current_commit.parents: for parent in current_commit.parents:
if parent.hexsha[:8] not in visited: if parent.hexsha[:commit_short_len] not in visited:
unvisited.append(parent) unvisited.append(parent)
return visited return visited
@ -169,10 +171,10 @@ def get_commit_name(hexsha8):
if repo is None: if repo is None:
return hexsha8 return hexsha8
for h in repo.heads: for h in repo.heads:
if h.commit.hexsha[:8] == hexsha8: if h.commit.hexsha[:commit_short_len] == hexsha8:
return h.name return h.name
for t in repo.tags: for t in repo.tags:
if t.commit.hexsha[:8] == hexsha8: if t.commit.hexsha[:commit_short_len] == hexsha8:
return t.name return t.name
return hexsha8 return hexsha8
@ -183,13 +185,13 @@ def get_commit_hexsha8(name):
return None return None
for h in repo.heads: for h in repo.heads:
if h.name == name: if h.name == name:
return h.commit.hexsha[:8] return h.commit.hexsha[:commit_short_len]
for t in repo.tags: for t in repo.tags:
if t.name == name: if t.name == name:
return t.commit.hexsha[:8] return t.commit.hexsha[:commit_short_len]
for c in repo.iter_commits("--all"): for c in repo.iter_commits("--all"):
if c.hexsha[:8] == name[:8]: if c.hexsha[:commit_short_len] == name[:commit_short_len]:
return c.hexsha[:8] return c.hexsha[:commit_short_len]
return None return None


@ -26,7 +26,7 @@ function has_cmd {
} }
if has_cmd wget; then if has_cmd wget; then
cmd="wget -q --show-progress -c -O %s/%s %s" cmd="wget -q -c -O %s/%s %s"
elif has_cmd curl; then elif has_cmd curl; then
cmd="curl -C - -f --output-dir %s -o %s -L %s" cmd="curl -C - -f --output-dir %s -o %s -L %s"
else else


@ -822,15 +822,11 @@ llama_grammar_stacks & llama_grammar_get_stacks(struct llama_grammar * grammar)
return grammar->stacks; return grammar->stacks;
} }
void llama_grammar_accept( void llama_grammar_accept(struct llama_grammar * grammar, uint32_t chr) {
const llama_grammar_rules & rules, llama_grammar_stacks stacks_new;
const llama_grammar_stacks & stacks, stacks_new.reserve(grammar->stacks.size());
const uint32_t chr,
llama_grammar_stacks & stacks_new) {
stacks_new.clear();
stacks_new.reserve(stacks.size());
for (const auto & stack : stacks) { for (const auto & stack : grammar->stacks) {
if (stack.empty()) { if (stack.empty()) {
continue; continue;
} }
@ -844,9 +840,11 @@ void llama_grammar_accept(
if (!llama_grammar_is_end_of_sequence(pos)) { if (!llama_grammar_is_end_of_sequence(pos)) {
new_stack.push_back(pos); new_stack.push_back(pos);
} }
llama_grammar_advance_stack(rules, new_stack, stacks_new); llama_grammar_advance_stack(grammar->rules, new_stack, stacks_new);
} }
} }
grammar->stacks = std::move(stacks_new);
} }
llama_grammar_candidates llama_grammar_reject_candidates_for_stack( llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
@ -1051,7 +1049,12 @@ void llama_grammar_free_impl(struct llama_grammar * grammar) {
} }
struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & grammar) { struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & grammar) {
llama_grammar * result = new llama_grammar { grammar.vocab, grammar.rules, grammar.stacks, grammar.partial_utf8, }; llama_grammar * result = new llama_grammar {
grammar.vocab,
grammar.rules,
grammar.stacks,
grammar.partial_utf8,
};
// redirect elements in stacks to point to new rules // redirect elements in stacks to point to new rules
for (size_t is = 0; is < result->stacks.size(); is++) { for (size_t is = 0; is < result->stacks.size(); is++) {
@ -1126,11 +1129,8 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
const auto decoded = decode_utf8(piece, grammar.partial_utf8); const auto decoded = decode_utf8(piece, grammar.partial_utf8);
const auto & code_points = decoded.first; const auto & code_points = decoded.first;
llama_grammar_stacks stacks_new;
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) { for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
llama_grammar_accept(grammar.rules, grammar.stacks, *it, stacks_new); llama_grammar_accept(&grammar, *it);
grammar.stacks = std::move(stacks_new);
} }
grammar.partial_utf8 = decoded.second; grammar.partial_utf8 = decoded.second;


@ -58,6 +58,7 @@ using llama_grammar_rules = std::vector<llama_grammar_rule>;
using llama_grammar_stacks = std::vector<llama_grammar_stack>; using llama_grammar_stacks = std::vector<llama_grammar_stack>;
using llama_grammar_candidates = std::vector<llama_grammar_candidate>; using llama_grammar_candidates = std::vector<llama_grammar_candidate>;
// TODO: remove, needed for tests atm
const llama_grammar_rules & llama_grammar_get_rules (const struct llama_grammar * grammar); const llama_grammar_rules & llama_grammar_get_rules (const struct llama_grammar * grammar);
llama_grammar_stacks & llama_grammar_get_stacks( struct llama_grammar * grammar); llama_grammar_stacks & llama_grammar_get_stacks( struct llama_grammar * grammar);
@ -65,11 +66,7 @@ const llama_grammar_rules & llama_grammar_get_rules (const struct llama_grammar
// be positioned at a character range (see `llama_grammar_advance_stack`), and // be positioned at a character range (see `llama_grammar_advance_stack`), and
// produces the N possible stacks if the given char is accepted at those // produces the N possible stacks if the given char is accepted at those
// positions // positions
void llama_grammar_accept( void llama_grammar_accept(struct llama_grammar * grammar, uint32_t chr);
const llama_grammar_rules & rules,
const llama_grammar_stacks & stacks,
uint32_t chr,
llama_grammar_stacks & stacks_new);
std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack( std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
const llama_grammar_rules & rules, const llama_grammar_rules & rules,

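The grammar change above turns llama_grammar_accept into an in-place update: it builds the new stack set internally and std::moves it over grammar->stacks, so callers no longer thread a separate stacks_new vector through the call. A hedged caller sketch against the new declaration (the real caller, llama_grammar_accept_impl shown earlier, additionally skips the terminating 0 code point):

#include <cstdint>
#include <vector>

struct llama_grammar;                                                     // opaque handle, as in the header above
void llama_grammar_accept(struct llama_grammar * grammar, uint32_t chr);  // new signature from the hunk above

// Hedged caller sketch: each decoded code point is fed to the grammar directly; the stack
// update now happens inside llama_grammar_accept instead of through a stacks_new out-parameter.
void accept_code_points(llama_grammar * grammar, const std::vector<uint32_t> & code_points) {
    for (const uint32_t cpt : code_points) {
        llama_grammar_accept(grammar, cpt);
    }
}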

@ -1657,7 +1657,7 @@ bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token t
} }
llama_token llama_token_bos_impl(const struct llama_vocab & vocab) { llama_token llama_token_bos_impl(const struct llama_vocab & vocab) {
return vocab.special_bos_id; return vocab.type != LLAMA_VOCAB_TYPE_WPM ? vocab.special_bos_id : vocab.special_cls_id;
} }
llama_token llama_token_eos_impl(const struct llama_vocab & vocab) { llama_token llama_token_eos_impl(const struct llama_vocab & vocab) {


@ -45,7 +45,7 @@ struct llama_vocab {
id special_unk_id = 0; id special_unk_id = 0;
id special_sep_id = LLAMA_TOKEN_NULL; id special_sep_id = LLAMA_TOKEN_NULL;
id special_pad_id = LLAMA_TOKEN_NULL; id special_pad_id = LLAMA_TOKEN_NULL;
id special_cls_id = LLAMA_TOKEN_NULL; id special_cls_id = LLAMA_TOKEN_NULL; // TODO: revisit if this is really needed https://github.com/ggerganov/llama.cpp/pull/10930
id special_mask_id = LLAMA_TOKEN_NULL; id special_mask_id = LLAMA_TOKEN_NULL;
id linefeed_id = 13; id linefeed_id = 13;


@ -146,6 +146,7 @@ static std::string format(const char * fmt, ...) {
enum llm_arch { enum llm_arch {
LLM_ARCH_LLAMA, LLM_ARCH_LLAMA,
LLM_ARCH_DECI,
LLM_ARCH_FALCON, LLM_ARCH_FALCON,
LLM_ARCH_BAICHUAN, LLM_ARCH_BAICHUAN,
LLM_ARCH_GROK, LLM_ARCH_GROK,
@ -203,6 +204,7 @@ enum llm_arch {
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = { static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_LLAMA, "llama" }, { LLM_ARCH_LLAMA, "llama" },
{ LLM_ARCH_DECI, "deci" },
{ LLM_ARCH_FALCON, "falcon" }, { LLM_ARCH_FALCON, "falcon" },
{ LLM_ARCH_GROK, "grok" }, { LLM_ARCH_GROK, "grok" },
{ LLM_ARCH_GPT2, "gpt2" }, { LLM_ARCH_GPT2, "gpt2" },
@ -674,6 +676,32 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
}, },
}, },
{
LLM_ARCH_DECI,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
{ LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
{ LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
},
},
{ {
LLM_ARCH_BAICHUAN, LLM_ARCH_BAICHUAN,
{ {
@ -1673,6 +1701,7 @@ enum llm_chat_template {
LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN, LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
LLM_CHAT_TEMPLATE_MISTRAL_V7, LLM_CHAT_TEMPLATE_MISTRAL_V7,
LLM_CHAT_TEMPLATE_PHI_3, LLM_CHAT_TEMPLATE_PHI_3,
LLM_CHAT_TEMPLATE_FALCON_3,
LLM_CHAT_TEMPLATE_ZEPHYR, LLM_CHAT_TEMPLATE_ZEPHYR,
LLM_CHAT_TEMPLATE_MONARCH, LLM_CHAT_TEMPLATE_MONARCH,
LLM_CHAT_TEMPLATE_GEMMA, LLM_CHAT_TEMPLATE_GEMMA,
@ -1691,6 +1720,7 @@ enum llm_chat_template {
LLM_CHAT_TEMPLATE_RWKV_WORLD, LLM_CHAT_TEMPLATE_RWKV_WORLD,
LLM_CHAT_TEMPLATE_GRANITE, LLM_CHAT_TEMPLATE_GRANITE,
LLM_CHAT_TEMPLATE_GIGACHAT, LLM_CHAT_TEMPLATE_GIGACHAT,
LLM_CHAT_TEMPLATE_MEGREZ,
LLM_CHAT_TEMPLATE_UNKNOWN, LLM_CHAT_TEMPLATE_UNKNOWN,
}; };
@ -1705,6 +1735,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
{ "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN }, { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
{ "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 }, { "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 },
{ "phi3", LLM_CHAT_TEMPLATE_PHI_3 }, { "phi3", LLM_CHAT_TEMPLATE_PHI_3 },
{ "falcon3", LLM_CHAT_TEMPLATE_FALCON_3 },
{ "zephyr", LLM_CHAT_TEMPLATE_ZEPHYR }, { "zephyr", LLM_CHAT_TEMPLATE_ZEPHYR },
{ "monarch", LLM_CHAT_TEMPLATE_MONARCH }, { "monarch", LLM_CHAT_TEMPLATE_MONARCH },
{ "gemma", LLM_CHAT_TEMPLATE_GEMMA }, { "gemma", LLM_CHAT_TEMPLATE_GEMMA },
@ -1723,6 +1754,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
{ "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD }, { "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
{ "granite", LLM_CHAT_TEMPLATE_GRANITE }, { "granite", LLM_CHAT_TEMPLATE_GRANITE },
{ "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT }, { "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT },
{ "megrez", LLM_CHAT_TEMPLATE_MEGREZ },
}; };
static llm_arch llm_arch_from_string(const std::string & name) { static llm_arch llm_arch_from_string(const std::string & name) {
@ -5692,7 +5724,7 @@ static void llm_load_hparams(
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false); ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) { if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_DECI || model.arch == LLM_ARCH_FALCON) {
if (hparams.n_rot != hparams.n_embd_head_k) { if (hparams.n_rot != hparams.n_embd_head_k) {
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k)); throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
} }
@ -5732,6 +5764,15 @@ static void llm_load_hparams(
} }
} }
} break; } break;
case LLM_ARCH_DECI:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
case 32: model.type = e_model::MODEL_7B; break;
case 80: model.type = e_model::MODEL_70B; break;
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
case LLM_ARCH_MINICPM: case LLM_ARCH_MINICPM:
{ {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@ -6562,7 +6603,8 @@ static void llm_load_vocab(
} else if ( } else if (
tokenizer_pre == "llama3" || tokenizer_pre == "llama3" ||
tokenizer_pre == "llama-v3" || tokenizer_pre == "llama-v3" ||
tokenizer_pre == "llama-bpe") { tokenizer_pre == "llama-bpe"||
tokenizer_pre == "falcon3") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3; vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
vocab.tokenizer_ignore_merges = true; vocab.tokenizer_ignore_merges = true;
vocab.tokenizer_add_bos = true; vocab.tokenizer_add_bos = true;
@ -6592,7 +6634,8 @@ static void llm_load_vocab(
tokenizer_pre == "jina-v1-en" || tokenizer_pre == "jina-v1-en" ||
tokenizer_pre == "jina-v2-es" || tokenizer_pre == "jina-v2-es" ||
tokenizer_pre == "jina-v2-de" || tokenizer_pre == "jina-v2-de" ||
tokenizer_pre == "jina-v2-code") { tokenizer_pre == "jina-v2-code" ||
tokenizer_pre == "roberta-bpe") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2; vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
} else if ( } else if (
tokenizer_pre == "refact") { tokenizer_pre == "refact") {
@ -6662,6 +6705,9 @@ static void llm_load_vocab(
} else if ( } else if (
tokenizer_pre == "minerva-7b") { tokenizer_pre == "minerva-7b") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA; vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA;
} else if (
tokenizer_pre == "megrez") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
} else { } else {
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
} }
@ -7935,6 +7981,68 @@ static bool llm_load_tensors(
} }
} }
} break; } break;
case LLM_ARCH_DECI:
{
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
// output
model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
// if output is NULL, init from the input tok embed
if (model.output == NULL) {
model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
}
for (int i = 0; i < n_layer; ++i) {
auto & layer = model.layers[i];
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
const int64_t n_ff = hparams.n_ff(i);
const int64_t n_head = hparams.n_head(i);
const int64_t n_head_kv = hparams.n_head_kv(i);
if (n_head_kv == 0 && n_head > 0) {
// linear attention for DeciLMCausalModel
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
}
else if (n_head_kv > 0) {
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
}
// optional bias tensors
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
}
else {
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
}
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
// optional MLP bias
layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
}
} break;
case LLM_ARCH_MINICPM3:
{
const int64_t n_embd_head_qk_rope = hparams.n_rot;
@ -11304,6 +11412,167 @@ struct llm_build_context {
return gf;
}
struct ggml_cgraph * build_deci() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
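// default 1/sqrt(n_embd_head) attention scaling unless the model supplies an explicit attention scale (hparams.f_attention_scale != 0)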
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
const int64_t n_head_kv = hparams.n_head_kv(il);
const int64_t n_head = hparams.n_head(il);
if (n_head == 0) {
// attention-free layer of Llama-3_1-Nemotron-51B
cur = inpL;
} else {
// norm
cur = llm_build_norm(ctx0, inpL, hparams,
model.layers[il].attn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "attn_norm", il);
}
if (n_head > 0 && n_head_kv == 0) {
// "linear attention" of Llama-3_1-Nemotron-51B
cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
cb(cur, "wo", il);
} else if (n_head > 0) {
// self-attention
// rope freq factors for llama3; may return nullptr for llama2 and other models
struct ggml_tensor * rope_factors = build_rope_factors(il);
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
if (model.layers[il].bq) {
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
cb(Qcur, "Qcur", il);
}
struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
if (model.layers[il].bk) {
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
cb(Kcur, "Kcur", il);
}
struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
if (model.layers[il].bv) {
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
cb(Vcur, "Vcur", il);
}
Qcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
Kcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Kcur, "Kcur", il);
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
}
if (il == n_layer - 1) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
n_tokens = n_outputs;
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
// For Granite architecture
if (hparams.f_residual_scale) {
cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
}
// modified to support attention-free layer of Llama-3_1-Nemotron-51B
struct ggml_tensor * ffn_inp = cur;
if (n_head > 0) {
ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
}
// feed-forward network
if (model.layers[il].ffn_gate_inp == nullptr) {
cur = llm_build_norm(ctx0, ffn_inp, hparams,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, lctx, cur,
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
}
// For Granite architecture
if (hparams.f_residual_scale) {
cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
}
cur = ggml_add(ctx0, cur, ffn_inp);
cb(cur, "ffn_out", il);
cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = llm_build_norm(ctx0, cur, hparams,
model.output_norm, NULL,
LLM_NORM_RMS, cb, -1);
cb(cur, "result_norm", -1);
// lm_head
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
// For Granite architecture
if (hparams.f_logit_scale) {
cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
}
cb(cur, "result_output", -1);
ggml_build_forward_expand(gf, cur);
return gf;
}
struct ggml_cgraph * build_baichuan() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
@ -13333,7 +13602,13 @@ struct llm_build_context {
struct ggml_tensor * inp_pos = build_inp_pos();
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = nullptr;
if (hparams.n_swa == 0) {
// Phi-4 doesn't use sliding window attention
KQ_mask = build_inp_KQ_mask();
} else {
KQ_mask = build_inp_KQ_mask_swa();
}
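// the mask selected above is used for all layers in the llm_build_kv call below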
for (int il = 0; il < n_layer; ++il) {
auto residual = inpL;
@ -13391,7 +13666,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
}
if (il == n_layer - 1) {
@ -17412,6 +17687,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_llama();
} break;
case LLM_ARCH_DECI:
{
result = llm.build_deci();
} break;
case LLM_ARCH_BAICHUAN:
{
result = llm.build_baichuan();
@ -20787,6 +21066,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
// use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA:
case LLM_ARCH_DECI:
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER:
case LLM_ARCH_PLAMO:
@ -22608,6 +22888,8 @@ static llm_chat_template llama_chat_detect_template(const std::string & tmpl) {
}
} else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
return LLM_CHAT_TEMPLATE_PHI_3;
} else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
return LLM_CHAT_TEMPLATE_FALCON_3;
} else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) { } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
return LLM_CHAT_TEMPLATE_ZEPHYR; return LLM_CHAT_TEMPLATE_ZEPHYR;
} else if (tmpl_contains("bos_token + message['role']")) { } else if (tmpl_contains("bos_token + message['role']")) {
@ -22654,6 +22936,8 @@ static llm_chat_template llama_chat_detect_template(const std::string & tmpl) {
return LLM_CHAT_TEMPLATE_GRANITE;
} else if (tmpl_contains("message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1]")) {
return LLM_CHAT_TEMPLATE_GIGACHAT;
} else if (tmpl_contains("<|role_start|>")) {
return LLM_CHAT_TEMPLATE_MEGREZ;
}
return LLM_CHAT_TEMPLATE_UNKNOWN;
}
@ -22760,6 +23044,15 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) {
ss << "<|assistant|>\n";
}
} else if (tmpl == LLM_CHAT_TEMPLATE_FALCON_3) {
// Falcon 3
for (auto message : chat) {
std::string role(message->role);
ss << "<|" << role << "|>\n" << message->content << "\n";
}
if (add_ass) {
ss << "<|assistant|>\n";
}
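// e.g. a single user message "Hello" with add_ass renders as "<|user|>\nHello\n<|assistant|>\n"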
} else if (tmpl == LLM_CHAT_TEMPLATE_ZEPHYR) {
// zephyr template
for (auto message : chat) {
@ -23003,6 +23296,16 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) {
ss << "assistant<|role_sep|>";
}
} else if (tmpl == LLM_CHAT_TEMPLATE_MEGREZ) {
// Megrez template
for (auto message : chat) {
std::string role(message->role);
ss << "<|role_start|>" << role << "<|role_end|>" << message->content << "<|turn_end|>";
}
if (add_ass) {
ss << "<|role_start|>assistant<|role_end|>";
}
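// e.g. a single user message "Hello" with add_ass renders as "<|role_start|>user<|role_end|>Hello<|turn_end|><|role_start|>assistant<|role_end|>"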
} else {
// template not supported
return -1;

View file

@ -3937,7 +3937,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {1024, 10, 1, 1}));
test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {32000, 512, 1, 1}));
for (int bs : {1, 2, 3, 4, 5, 8, 512}) {
for (ggml_type type_a : all_types) {
for (ggml_type type_b : {GGML_TYPE_F32}) {
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 4096, bs, 14336, {1, 1}, {1, 1}));
@ -3945,6 +3945,18 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
}
}
for (int K : {3, 5}) {
for (int IC : {256, 2560}) {
for (int IW_IH : {32, 64, 256}) {
if (IC == 2560 && IW_IH == 256) {
// too big
continue;
}
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {IW_IH, IW_IH, IC, 1}, {K, K, IC, 1}, 1, 1, 1, 1, 1, 1, true));
}
}
}
return test_cases;
}

View file

@ -77,6 +77,8 @@ int main(void) {
"{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + '[/INST]' }}{% elif message['role'] == 'system' %}{{ '[SYSTEM_PROMPT] ' + message['content'] + '[/SYSTEM_PROMPT]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + eos_token }}{% else %}{{ raise_exception('Only user, system and assistant roles are supported!') }}{% endif %}{% endfor %}", "{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + '[/INST]' }}{% elif message['role'] == 'system' %}{{ '[SYSTEM_PROMPT] ' + message['content'] + '[/SYSTEM_PROMPT]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + eos_token }}{% else %}{{ raise_exception('Only user, system and assistant roles are supported!') }}{% endif %}{% endfor %}",
// ai-sage/GigaChat-20B-A3B-instruct // ai-sage/GigaChat-20B-A3B-instruct
"{% if messages[0]['role'] == 'system' -%}\n {%- set loop_messages = messages[1:] -%}\n {%- set system_message = bos_token + messages[0]['content'] + additional_special_tokens[1] -%}\n{%- else -%}\n {%- set loop_messages = messages -%}\n {%- set system_message = bos_token + '' -%}\n{%- endif -%}\n{%- for message in loop_messages %}\n {% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}\n {% endif %}\n \n {%- if loop.index0 == 0 -%}\n {{ system_message -}}\n {%- endif -%}\n {%- if message['role'] == 'user' -%}\n {{ message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1] -}}\n {{ 'available functions' + additional_special_tokens[0] + additional_special_tokens[2] + additional_special_tokens[3] + additional_special_tokens[1] -}}\n {%- endif -%}\n {%- if message['role'] == 'assistant' -%}\n {{ message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1] -}}\n {%- endif -%}\n {%- if loop.last and add_generation_prompt -%}\n {{ 'assistant' + additional_special_tokens[0] -}}\n {%- endif -%}\n{%- endfor %}", "{% if messages[0]['role'] == 'system' -%}\n {%- set loop_messages = messages[1:] -%}\n {%- set system_message = bos_token + messages[0]['content'] + additional_special_tokens[1] -%}\n{%- else -%}\n {%- set loop_messages = messages -%}\n {%- set system_message = bos_token + '' -%}\n{%- endif -%}\n{%- for message in loop_messages %}\n {% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}\n {% endif %}\n \n {%- if loop.index0 == 0 -%}\n {{ system_message -}}\n {%- endif -%}\n {%- if message['role'] == 'user' -%}\n {{ message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1] -}}\n {{ 'available functions' + additional_special_tokens[0] + additional_special_tokens[2] + additional_special_tokens[3] + additional_special_tokens[1] -}}\n {%- endif -%}\n {%- if message['role'] == 'assistant' -%}\n {{ message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1] -}}\n {%- endif -%}\n {%- if loop.last and add_generation_prompt -%}\n {{ 'assistant' + additional_special_tokens[0] -}}\n {%- endif -%}\n{%- endfor %}",
// Infinigence/Megrez-3B-Instruct
u8"{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|role_start|>system<|role_end|>你是Megrez-3B-Instruct将针对用户的问题给出详细的、积极的回答。<|turn_end|>' }}{% endif %}{{ '<|role_start|>' + message['role'] + '<|role_end|>' + message['content'] + '<|turn_end|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|role_start|>assistant<|role_end|>' }}{% endif %}"
};
std::vector<std::string> expected_output = {
// teknium/OpenHermes-2.5-Mistral-7B
@ -133,6 +135,8 @@ int main(void) {
"[SYSTEM_PROMPT] You are a helpful assistant[/SYSTEM_PROMPT][INST] Hello[/INST] Hi there</s>[INST] Who are you[/INST] I am an assistant </s>[INST] Another question[/INST]", "[SYSTEM_PROMPT] You are a helpful assistant[/SYSTEM_PROMPT][INST] Hello[/INST] Hi there</s>[INST] Who are you[/INST] I am an assistant </s>[INST] Another question[/INST]",
// ai-sage/GigaChat-20B-A3B-instruct // ai-sage/GigaChat-20B-A3B-instruct
"<s>You are a helpful assistant<|message_sep|>user<|role_sep|>Hello<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|>Hi there<|message_sep|>user<|role_sep|>Who are you<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|> I am an assistant <|message_sep|>user<|role_sep|>Another question<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|>", "<s>You are a helpful assistant<|message_sep|>user<|role_sep|>Hello<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|>Hi there<|message_sep|>user<|role_sep|>Who are you<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|> I am an assistant <|message_sep|>user<|role_sep|>Another question<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|>",
// Infinigence/Megrez-3B-Instruct
"<|role_start|>system<|role_end|>You are a helpful assistant<|turn_end|><|role_start|>user<|role_end|>Hello<|turn_end|><|role_start|>assistant<|role_end|>Hi there<|turn_end|><|role_start|>user<|role_end|>Who are you<|turn_end|><|role_start|>assistant<|role_end|> I am an assistant <|turn_end|><|role_start|>user<|role_end|>Another question<|turn_end|><|role_start|>assistant<|role_end|>",
};
std::vector<char> formatted_chat(1024);
int32_t res;

View file

@ -634,7 +634,7 @@ static std::pair<int, int> test_handcrafted_file(const unsigned int seed) {
HANDCRAFTED_KV_BAD_KEY_SIZE,
HANDCRAFTED_KV_BAD_TYPE,
// HANDCRAFTED_KV_BAD_VALUE_SIZE, // FIXME sanitizer limit
// HANDCRAFTED_FILE_TYPE_DUPLICATE_KEY, // FIXME
HANDCRAFTED_KV_SUCCESS,

View file

@ -32,13 +32,10 @@ static bool test_build_grammar_fails(const std::string & grammar_str) {
static bool match_string(const std::string & input, llama_grammar * grammar) {
const auto cpts = unicode_cpts_from_utf8(input);
auto & stacks_cur = llama_grammar_get_stacks(grammar);
for (const auto & cpt : cpts) {
llama_grammar_accept(grammar, cpt);
if (stacks_cur.empty()) {
// no stacks means that the grammar failed to match at this point
@ -63,7 +60,7 @@ static void test(const std::string & test_desc, const std::string & grammar_str,
auto * grammar = build_grammar(grammar_str);
// Save the original grammar stacks so that we can reset after every new string we want to test
const llama_grammar_stacks stacks_org = llama_grammar_get_stacks(grammar); // copy
llama_grammar_stacks & stacks_cur = llama_grammar_get_stacks(grammar);

View file

@ -113,12 +113,10 @@ int main()
}
}
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
if (grammar == nullptr) {
throw std::runtime_error("Failed to initialize llama_grammar");
}