Merge commit 'f8c4c073' into detokenizer

2024-07-04 18:50:04 +02:00 · 2024-07-04 18:50:04 +02:00 · 8072089c4e
commit 8072089c4e
parent 98fc182312 f8c4c0738d
456 changed files with 35475 additions and 29355 deletions
--- a/.devops/full-cuda.Dockerfile
+++ b/.devops/full-cuda.Dockerfile
@ -27,7 +27,7 @@ COPY . .
 # Set nvcc architecture
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 # Enable CUDA
-ENV LLAMA_CUDA=1
+ENV GGML_CUDA=1
 # Enable cURL
 ENV LLAMA_CURL=1
--- a/.devops/full-rocm.Dockerfile
+++ b/.devops/full-rocm.Dockerfile
@ -36,7 +36,7 @@ COPY . .
 # Set nvcc architecture
 ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
 # Enable ROCm
-ENV LLAMA_HIPBLAS=1
+ENV GGML_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++
--- a/.devops/llama-cli-cuda.Dockerfile
+++ b/.devops/llama-cli-cuda.Dockerfile
@ -21,7 +21,7 @@ COPY . .
 # Set nvcc architecture
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 # Enable CUDA
-ENV LLAMA_CUDA=1
+ENV GGML_CUDA=1
 RUN make -j$(nproc) llama-cli
--- a/.devops/llama-cli-intel.Dockerfile
+++ b/.devops/llama-cli-intel.Dockerfile
@ -2,7 +2,7 @@ ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
 FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
-ARG LLAMA_SYCL_F16=OFF
+ARG GGML_SYCL_F16=OFF
 RUN apt-get update && \
    apt-get install -y git
@ -10,11 +10,11 @@ WORKDIR /app
 COPY . .
-RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
+RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
-        echo "LLAMA_SYCL_F16 is set" && \
+        echo "GGML_SYCL_F16 is set" && \
-        export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
+        export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
    fi && \
-    cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
+    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
    cmake --build build --config Release --target llama-cli
 FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
--- a/.devops/llama-cli-rocm.Dockerfile
+++ b/.devops/llama-cli-rocm.Dockerfile
@ -36,7 +36,7 @@ COPY . .
 # Set nvcc architecture
 ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
 # Enable ROCm
-ENV LLAMA_HIPBLAS=1
+ENV GGML_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++
--- a/.devops/llama-cli-vulkan.Dockerfile
+++ b/.devops/llama-cli-vulkan.Dockerfile
@ -14,7 +14,7 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
 # Build it
 WORKDIR /app
 COPY . .
-RUN cmake -B build -DLLAMA_VULKAN=1 && \
+RUN cmake -B build -DGGML_VULKAN=1 && \
    cmake --build build --config Release --target llama-cli
 # Clean up
--- a/.devops/llama-cpp-clblast.srpm.spec
+++ b/.devops/llama-cpp-clblast.srpm.spec
@ -1,84 +0,0 @@
 # SRPM for building from source and packaging an RPM for RPM-based distros.
 # https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
 # Built and maintained by John Boero - boeroboy@gmail.com
 # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
 # Notes for llama.cpp:
 # 1. Tags are currently based on hash - which will not sort asciibetically.
 #    We need to declare standard versioning if people want to sort latest releases.
 # 2. Builds for CUDA/OpenCL support are separate, with different depenedencies.
 # 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
 #    Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
 # 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
 #    It is up to the user to install the correct vendor-specific support.
 Name:           llama.cpp-clblast
 Version:        %( date "+%%Y%%m%%d" )
 Release:        1%{?dist}
 Summary:        OpenCL Inference of LLaMA model in C/C++
 License:        MIT
 Source0:        https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
 BuildRequires:  coreutils make gcc-c++ git mesa-libOpenCL-devel clblast-devel
 Requires:       clblast
 URL:            https://github.com/ggerganov/llama.cpp
 %define debug_package %{nil}
 %define source_date_epoch_from_changelog 0
 %description
 CPU inference for Meta's Lllama2 models using default options.
 %prep
 %setup -n llama.cpp-master
 %build
 make -j LLAMA_CLBLAST=1
 %install
 mkdir -p %{buildroot}%{_bindir}/
 cp -p llama-cli %{buildroot}%{_bindir}/llama-clblast-cli
 cp -p llama-server %{buildroot}%{_bindir}/llama-clblast-server
 cp -p llama-simple %{buildroot}%{_bindir}/llama-clblast-simple
 mkdir -p %{buildroot}/usr/lib/systemd/system
 %{__cat} <<EOF  > %{buildroot}/usr/lib/systemd/system/llamaclblast.service
 [Unit]
 Description=Llama.cpp server, CPU only (no GPU support in this build).
 After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
 [Service]
 Type=simple
 EnvironmentFile=/etc/sysconfig/llama
 ExecStart=/usr/bin/llama-clblast-server $LLAMA_ARGS
 ExecReload=/bin/kill -s HUP $MAINPID
 Restart=never
 [Install]
 WantedBy=default.target
 EOF
 mkdir -p %{buildroot}/etc/sysconfig
 %{__cat} <<EOF  > %{buildroot}/etc/sysconfig/llama
 LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
 EOF
 %clean
 rm -rf %{buildroot}
 rm -rf %{_builddir}/*
 %files
 %{_bindir}/llama-clblast-cli
 %{_bindir}/llama-clblast-server
 %{_bindir}/llama-clblast-simple
 /usr/lib/systemd/system/llamaclblast.service
 %config /etc/sysconfig/llama
 %pre
 %post
 %preun
 %postun
 %changelog
--- a/.devops/llama-cpp-cuda.srpm.spec
+++ b/.devops/llama-cpp-cuda.srpm.spec
@ -32,7 +32,7 @@ CPU inference for Meta's Lllama2 models using default options.
 %setup -n llama.cpp-master
 %build
-make -j LLAMA_CUDA=1
+make -j GGML_CUDA=1
 %install
 mkdir -p %{buildroot}%{_bindir}/
--- a/.devops/llama-server-cuda.Dockerfile
+++ b/.devops/llama-server-cuda.Dockerfile
@ -21,7 +21,7 @@ COPY . .
 # Set nvcc architecture
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 # Enable CUDA
-ENV LLAMA_CUDA=1
+ENV GGML_CUDA=1
 # Enable cURL
 ENV LLAMA_CURL=1
@ -30,8 +30,10 @@ RUN make -j$(nproc) llama-server
 FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
 RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev libgomp1
+    apt-get install -y libcurl4-openssl-dev libgomp1 curl
 COPY --from=build /app/llama-server /llama-server
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
 ENTRYPOINT [ "/llama-server" ]
--- a/.devops/llama-server-intel.Dockerfile
+++ b/.devops/llama-server-intel.Dockerfile
@ -2,7 +2,7 @@ ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
 FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
-ARG LLAMA_SYCL_F16=OFF
+ARG GGML_SYCL_F16=OFF
 RUN apt-get update && \
    apt-get install -y git libcurl4-openssl-dev
@ -10,20 +10,22 @@ WORKDIR /app
 COPY . .
-RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
+RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
-        echo "LLAMA_SYCL_F16 is set" && \
+        echo "GGML_SYCL_F16 is set" && \
-        export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
+        export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
    fi && \
-    cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
+    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
    cmake --build build --config Release --target llama-server
 FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
 RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev
+    apt-get install -y libcurl4-openssl-dev curl
 COPY --from=build /app/build/bin/llama-server /llama-server
 ENV LC_ALL=C.utf8
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
 ENTRYPOINT [ "/llama-server" ]
--- a/.devops/llama-server-rocm.Dockerfile
+++ b/.devops/llama-server-rocm.Dockerfile
@ -36,15 +36,17 @@ COPY . .
 # Set nvcc architecture
 ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
 # Enable ROCm
-ENV LLAMA_HIPBLAS=1
+ENV GGML_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++
 # Enable cURL
 ENV LLAMA_CURL=1
 RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev
+    apt-get install -y libcurl4-openssl-dev curl
 RUN make -j$(nproc) llama-server
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
 ENTRYPOINT [ "/app/llama-server" ]
--- a/.devops/llama-server-vulkan.Dockerfile
+++ b/.devops/llama-server-vulkan.Dockerfile
@ -5,20 +5,16 @@ FROM ubuntu:$UBUNTU_VERSION as build
 # Install build tools
 RUN apt update && apt install -y git build-essential cmake wget
-# Install Vulkan SDK
+# Install Vulkan SDK and cURL
 RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
    apt update -y && \
-    apt-get install -y vulkan-sdk
+    apt-get install -y vulkan-sdk libcurl4-openssl-dev curl
 # Install cURL
 RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev
 # Build it
 WORKDIR /app
 COPY . .
-RUN cmake -B build -DLLAMA_VULKAN=1 -DLLAMA_CURL=1 && \
+RUN cmake -B build -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
    cmake --build build --config Release --target llama-server
 # Clean up
@ -28,4 +24,6 @@ RUN cp /app/build/bin/llama-server /llama-server && \
 ENV LC_ALL=C.utf8
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
 ENTRYPOINT [ "/llama-server" ]
--- a/.devops/llama-server.Dockerfile
+++ b/.devops/llama-server.Dockerfile
@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
 FROM ubuntu:$UBUNTU_VERSION as build
 RUN apt-get update && \
-    apt-get install -y build-essential git libcurl4-openssl-dev
+    apt-get install -y build-essential git libcurl4-openssl-dev curl
 WORKDIR /app
@ -22,4 +22,6 @@ COPY --from=build /app/llama-server /llama-server
 ENV LC_ALL=C.utf8
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
 ENTRYPOINT [ "/llama-server" ]
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@ -17,19 +17,18 @@
  rocmPackages,
  vulkan-headers,
  vulkan-loader,
-  clblast,
+  curl,
  useBlas ? builtins.all (x: !x) [
    useCuda
    useMetalKit
    useOpenCL
    useRocm
    useVulkan
  ] && blas.meta.available,
  useCuda ? config.cudaSupport,
-  useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
+  useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin,
  useMpi ? false, # Increases the runtime closure size by ~700M
  useOpenCL ? false,
  useRocm ? config.rocmSupport,
  enableCurl ? true,
  useVulkan ? false,
  llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
@ -56,7 +55,6 @@ let
    ++ lib.optionals useCuda [ "CUDA" ]
    ++ lib.optionals useMetalKit [ "MetalKit" ]
    ++ lib.optionals useMpi [ "MPI" ]
    ++ lib.optionals useOpenCL [ "OpenCL" ]
    ++ lib.optionals useRocm [ "ROCm" ]
    ++ lib.optionals useVulkan [ "Vulkan" ];
@ -160,9 +158,9 @@ effectiveStdenv.mkDerivation (
    };
    postPatch = ''
-      substituteInPlace ./ggml-metal.m \
+      substituteInPlace ./ggml/src/ggml-metal.m \
        --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
-      substituteInPlace ./ggml-metal.m \
+      substituteInPlace ./ggml/src/ggml-metal.m \
        --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
    '';
@ -198,24 +196,24 @@ effectiveStdenv.mkDerivation (
      optionals effectiveStdenv.isDarwin darwinBuildInputs
      ++ optionals useCuda cudaBuildInputs
      ++ optionals useMpi [ mpi ]
      ++ optionals useOpenCL [ clblast ]
      ++ optionals useRocm rocmBuildInputs
      ++ optionals useBlas [ blas ]
-      ++ optionals useVulkan vulkanBuildInputs;
+      ++ optionals useVulkan vulkanBuildInputs
      ++ optionals enableCurl [ curl ];
    cmakeFlags =
      [
        (cmakeBool "LLAMA_NATIVE" false)
        (cmakeBool "LLAMA_BUILD_SERVER" true)
        (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
        (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
-        (cmakeBool "LLAMA_BLAS" useBlas)
+        (cmakeBool "LLAMA_CURL" enableCurl)
-        (cmakeBool "LLAMA_CLBLAST" useOpenCL)
+        (cmakeBool "GGML_NATIVE" false)
-        (cmakeBool "LLAMA_CUDA" useCuda)
+        (cmakeBool "GGML_BLAS" useBlas)
-        (cmakeBool "LLAMA_HIPBLAS" useRocm)
+        (cmakeBool "GGML_CUDA" useCuda)
-        (cmakeBool "LLAMA_METAL" useMetalKit)
+        (cmakeBool "GGML_HIPBLAS" useRocm)
-        (cmakeBool "LLAMA_VULKAN" useVulkan)
+        (cmakeBool "GGML_METAL" useMetalKit)
-        (cmakeBool "LLAMA_STATIC" enableStatic)
+        (cmakeBool "GGML_VULKAN" useVulkan)
        (cmakeBool "GGML_STATIC" enableStatic)
      ]
      ++ optionals useCuda [
        (
@ -231,7 +229,7 @@ effectiveStdenv.mkDerivation (
      ]
      ++ optionals useMetalKit [
        (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
-        (cmakeBool "LLAMA_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
+        (cmakeBool "GGML_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
      ];
    # Environment variables needed for ROCm
@ -244,7 +242,7 @@ effectiveStdenv.mkDerivation (
    # if they haven't been added yet.
    postInstall = ''
      mkdir -p $out/include
-      cp $src/llama.h $out/include/
+      cp $src/include/llama.h $out/include/
    '';
    # Define the shells here, but don't add in the inputsFrom to avoid recursion.
@ -254,7 +252,6 @@ effectiveStdenv.mkDerivation (
        useCuda
        useMetalKit
        useMpi
        useOpenCL
        useRocm
        useVulkan
        ;
@ -281,7 +278,7 @@ effectiveStdenv.mkDerivation (
      # Configurations we don't want even the CI to evaluate. Results in the
      # "unsupported platform" messages. This is mostly a no-op, because
      # cudaPackages would've refused to evaluate anyway.
-      badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin;
+      badPlatforms = optionals useCuda lib.platforms.darwin;
      # Configurations that are known to result in build failures. Can be
      # overridden by importing Nixpkgs with `allowBroken = true`.
--- a/.editorconfig
+++ b/.editorconfig
@ -28,4 +28,5 @@ indent_size = 2
 indent_style = tab
 [examples/cvector-generator/*.txt]
 trim_trailing_whitespace = unset
 insert_final_newline = unset
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@ -9,5 +9,3 @@ contact_links:
  - name: Want to contribute?
    url: https://github.com/ggerganov/llama.cpp/wiki/contribute
    about: Head to the contribution guide page of the wiki for areas you can help with
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@ -2,31 +2,31 @@
 Kompute:
    - changed-files:
        - any-glob-to-any-file:
-            - ggml-kompute.h
+            - ggml/include/ggml-kompute.h
-            - ggml-kompute.cpp
+            - ggml/src/ggml-kompute.cpp
            - README-kompute.md
 Apple Metal:
    - changed-files:
        - any-glob-to-any-file:
-            - ggml-metal.h
+            - ggml/include/ggml-metal.h
-            - ggml-metal.cpp
+            - ggml/src/ggml-metal.cpp
            - README-metal.md
 SYCL:
    - changed-files:
        - any-glob-to-any-file:
-            - ggml-sycl.h
+            - ggml/include/ggml-sycl.h
-            - ggml-sycl.cpp
+            - ggml/src/ggml-sycl.cpp
            - README-sycl.md
 Nvidia GPU:
    - changed-files:
        - any-glob-to-any-file:
-            - ggml-cuda.h
+            - ggml/include/ggml-cuda.h
-            - ggml-cuda/**
+            - ggml/src/ggml-cuda/**
 Vulkan:
    - changed-files:
        - any-glob-to-any-file:
-            - ggml_vk_generate_shaders.py
+            - ggml/ggml_vk_generate_shaders.py
-            - ggml-vulkan*
+            - ggml/src/ggml-vulkan*
 documentation:
    - changed-files:
        - any-glob-to-any-file:
@ -73,10 +73,10 @@ server:
 ggml:
    - changed-files:
        - any-glob-to-any-file:
-            - ggml.c
+            - ggml/include/ggml*.h
-            - ggml.h
+            - ggml/src/ggml*.c
-            - ggml-*.c
+            - ggml/src/ggml*.cpp
-            - ggml-*.h
+            - ggml/src/ggml*.h
            - ggml-cuda/**
 nix:
    - changed-files:
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@ -109,7 +109,7 @@ jobs:
        run: |
          set -eux
          cmake -B build \
-              -DLLAMA_NATIVE=OFF \
+              -DGGML_NATIVE=OFF \
              -DLLAMA_BUILD_SERVER=ON \
              -DLLAMA_CURL=ON \
              -DLLAMA_CUBLAS=ON \
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -10,10 +10,10 @@ on:
  push:
    branches:
      - master
-    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
+    paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
  pull_request:
    types: [opened, synchronize, reopened]
-    paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m']
+    paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@ -47,7 +47,7 @@ jobs:
          sysctl -a
          mkdir build
          cd build
-          cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
+          cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF ..
          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
      - name: Test
@ -105,7 +105,7 @@ jobs:
          sysctl -a
          # Metal is disabled due to intermittent failures with Github runners not having a GPU:
          # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
-          cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON
+          cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
      - name: Test
@ -222,7 +222,7 @@ jobs:
        run: |
          mkdir build
          cd build
-          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON
+          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
          cmake --build . --config Release -j $(nproc)
      - name: Test
@ -305,7 +305,7 @@ jobs:
        run: |
          mkdir build
          cd build
-          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DLLAMA_OPENMP=OFF
+          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DGGML_OPENMP=OFF
          cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
      - name: Test
@ -335,7 +335,7 @@ jobs:
        run: |
          mkdir build
          cd build
-          cmake -DLLAMA_RPC=ON ..
+          cmake -DGGML_RPC=ON ..
          cmake --build . --config Release -j $(nproc)
      - name: Test
@ -363,7 +363,7 @@ jobs:
        run: |
          mkdir build
          cd build
-          cmake -DLLAMA_VULKAN=ON ..
+          cmake -DGGML_VULKAN=ON ..
          cmake --build . --config Release -j $(nproc)
  ubuntu-22-cmake-hip:
@ -384,13 +384,13 @@ jobs:
      - name: Build with native CMake HIP support
        id: cmake_build
        run: |
-          cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DLLAMA_HIPBLAS=ON
+          cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DGGML_HIPBLAS=ON
          cmake --build build --config Release -j $(nproc)
      - name: Build with legacy HIP support
        id: cmake_build_legacy_hip
        run: |
-          cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DLLAMA_HIPBLAS=ON
+          cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIPBLAS=ON
          cmake --build build2 --config Release -j $(nproc)
  ubuntu-22-cmake-sycl:
@ -431,7 +431,7 @@ jobs:
          source /opt/intel/oneapi/setvars.sh
          mkdir build
          cd build
-          cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
+          cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
          cmake --build . --config Release -j $(nproc)
  ubuntu-22-cmake-sycl-fp16:
@ -472,10 +472,10 @@ jobs:
          source /opt/intel/oneapi/setvars.sh
          mkdir build
          cd build
-          cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON ..
+          cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON ..
          cmake --build . --config Release -j $(nproc)
-  # TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
+  # TODO: build with GGML_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
  #       how to debug it.
  #       ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
  macOS-latest-make:
@ -497,15 +497,15 @@ jobs:
        env:
            LLAMA_FATAL_WARNINGS: 1
        run: |
-          LLAMA_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu)
+          GGML_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu)
      - name: Test
        id: make_test
        run: |
-          LLAMA_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu)
+          GGML_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu)
-          LLAMA_NO_METAL=1 make test  -j $(sysctl -n hw.logicalcpu)
+          GGML_NO_METAL=1 make test  -j $(sysctl -n hw.logicalcpu)
-  # TODO: build with LLAMA_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
+  # TODO: build with GGML_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
  #       how to debug it.
  #       ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
  #       would be great if we fix these
@ -529,7 +529,7 @@ jobs:
          sysctl -a
          mkdir build
          cd build
-          cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF ..
+          cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF ..
          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
      - name: Test
@ -559,13 +559,14 @@ jobs:
          mkdir build
          cd build
          cmake -G Xcode .. \
-            -DLLAMA_METAL_EMBED_LIBRARY=ON \
+            -DGGML_METAL_EMBED_LIBRARY=ON \
            -DLLAMA_BUILD_EXAMPLES=OFF \
            -DLLAMA_BUILD_TESTS=OFF \
            -DLLAMA_BUILD_SERVER=OFF \
            -DCMAKE_SYSTEM_NAME=iOS \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
+            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
-          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
+            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
  macOS-latest-cmake-tvos:
    runs-on: macos-latest
@ -588,13 +589,14 @@ jobs:
          mkdir build
          cd build
          cmake -G Xcode .. \
-            -DLLAMA_METAL_EMBED_LIBRARY=ON \
+            -DGGML_METAL_EMBED_LIBRARY=ON \
            -DLLAMA_BUILD_EXAMPLES=OFF \
            -DLLAMA_BUILD_TESTS=OFF \
            -DLLAMA_BUILD_SERVER=OFF \
            -DCMAKE_SYSTEM_NAME=tvOS \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
+            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
-          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
+            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
  macOS-latest-swift:
    runs-on: macos-latest
@ -662,7 +664,7 @@ jobs:
      - name: Build using make w/ OpenBLAS
        shell: msys2 {0}
        run: |
-            make LLAMA_OPENBLAS=1 -j $(nproc)
+            make GGML_OPENBLAS=1 -j $(nproc)
      - name: Build using CMake
        shell: msys2 {0}
@ -678,7 +680,7 @@ jobs:
      - name: Build using CMake w/ OpenBLAS
        shell: msys2 {0}
        run: |
-            cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
+            cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
            cmake --build build --config ${{ matrix.build }} -j $(nproc)
  windows-latest-cmake:
@ -693,25 +695,25 @@ jobs:
      matrix:
        include:
          - build: 'rpc-x64'
-            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_RPC=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON'
          - build: 'noavx-x64'
-            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON'
          - build: 'avx2-x64'
-            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
          - build: 'avx-x64'
-            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
          - build: 'avx512-x64'
-            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON'
          - build: 'openblas-x64'
-            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
          - build: 'kompute-x64'
-            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
          - build: 'vulkan-x64'
-            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
          - build: 'llvm-arm64'
-            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
          - build: 'msvc-arm64'
-            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
    steps:
      - name: Clone
@ -724,7 +726,7 @@ jobs:
        id: clone_kompute
        if: ${{ matrix.build == 'kompute-x64' }}
        run: |
-          git submodule update --init kompute
+          git submodule update --init ggml/src/kompute
      - name: Download OpenBLAS
        id: get_openblas
@ -797,6 +799,7 @@ jobs:
          7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
          $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
          cd build
          $env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1
          & $sde -future -- ctest -L main -C Release --verbose --timeout 900
      - name: Determine tag name
@ -854,7 +857,7 @@ jobs:
        run: |
          mkdir build
          cd build
-          cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=ON
+          cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON
          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
      - name: Determine tag name
@ -987,7 +990,7 @@ jobs:
        run: |
          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
-          cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DLLAMA_HIPBLAS=ON
+          cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON
          cmake --build build --config Release
  ios-xcode-build:
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@ -10,10 +10,11 @@
 name: Publish Docker image
 on:
-  pull_request:
+  #pull_request:
  push:
    branches:
      - master
    paths: ['.github/workflows/docker.yml', '.devops/*.Dockerfile', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@ -22,7 +23,7 @@ concurrency:
 jobs:
  push_to_registry:
    name: Push Docker image to Docker Hub
-    if: github.event.pull_request.draft == false
+    #if: github.event.pull_request.draft == false
    runs-on: ubuntu-latest
    env:
@ -33,15 +34,13 @@ jobs:
          - { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          # NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
          #                     have disabled them for now until the reason why
          #                     is understood.
          - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
          - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
          - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
          - { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          # Note: the full-rocm image is failing due to a "no space left on device" error. It is disabled for now to allow the workflow to complete.
          #- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
          - { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
    steps:
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@ -30,7 +30,7 @@ jobs:
    strategy:
      matrix:
-        sanitizer: [ADDRESS, THREAD, UNDEFINED]
+        sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
        build_type: [RelWithDebInfo]
        include:
          - build_type: Release
@ -92,12 +92,12 @@ jobs:
        if: ${{ matrix.sanitizer == 'THREAD' }}
        run: |
          cmake -B build \
-              -DLLAMA_NATIVE=OFF \
+              -DGGML_NATIVE=OFF \
              -DLLAMA_BUILD_SERVER=ON \
              -DLLAMA_CURL=ON \
              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
              -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-              -DLLAMA_OPENMP=OFF ;
+              -DGGML_OPENMP=OFF ;
          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
      - name: Build
@ -105,7 +105,7 @@ jobs:
        if: ${{ matrix.sanitizer != 'THREAD' }}
        run: |
          cmake -B build \
-              -DLLAMA_NATIVE=OFF \
+              -DGGML_NATIVE=OFF \
              -DLLAMA_BUILD_SERVER=ON \
              -DLLAMA_CURL=ON \
              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
--- a/.gitignore
+++ b/.gitignore
@ -56,6 +56,7 @@ CMakeSettings.json
 compile_commands.json
 ggml-metal-embed.metal
 llama-batched-swift
 /rpc-server
 out/
 tmp/
--- a/.gitmodules
+++ b/.gitmodules
@ -1,3 +1,3 @@
 [submodule "kompute"]
-	path = kompute
+	path = ggml/src/kompute
 	url = https://github.com/nomic-ai/kompute.git
--- a/129
+++ b/129
@ -1,8 +1,9 @@
-# date: Tue Apr  9 09:17:14 EEST 2024
+# date: Wed Jun 26 19:36:34 EEST 2024
 # this file is auto-generated by scripts/gen-authors.sh
 0cc4m <picard12@live.de>
 0xspringtime <110655352+0xspringtime@users.noreply.github.com>
 20kdc <asdd2808@gmail.com>
 2f38b454 <dxf@protonmail.com>
 3ooabkhxtn <31479382+3ooabkhxtn@users.noreply.github.com>
 44670 <44670@users.noreply.github.com>
@ -11,14 +12,18 @@ AT <manyoso@users.noreply.github.com>
 Aarni Koskela <akx@iki.fi>
 Aaron Miller <apage43@ninjawhale.com>
 Aaryaman Vasishta <aaryaman.vasishta@amd.com>
 Abheek Gulati <abheekg@hotmail.com>
 Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
 Abhishek Gopinath K <31348521+overtunned@users.noreply.github.com>
 Adithya Balaji <adithya.b94@gmail.com>
 AdithyanI <adithyan.i4internet@gmail.com>
 Adrian <smith.adriane@gmail.com>
 Adrian Hesketh <a-h@users.noreply.github.com>
 Ahmet Zeer <ahmed.zeer@std.yildiz.edu.tr>
 AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>
 Aisuko <urakiny@gmail.com>
 Akarshan Biswas <akarshanbiswas@fedoraproject.org>
 Albert Jin <albert.jin@gmail.com>
 Alberto <57916483+albbus-stack@users.noreply.github.com>
 Alex <awhill19@icloud.com>
 Alex Azarov <alex@azarov.by>
@ -35,19 +40,24 @@ Ali Nehzat <ali.nehzat@thanks.dev>
 Ali Tariq <ali.tariq@10xengineers.ai>
 Alon <alonfaraj@gmail.com>
 AlpinDale <52078762+AlpinDale@users.noreply.github.com>
 Amir <amir_zia@outlook.com>
 AmirAli Mirian <37371367+amiralimi@users.noreply.github.com>
 Ananta Bastola <anantarajbastola@gmail.com>
 Anas Ahouzi <112881240+aahouzi@users.noreply.github.com>
 András Salamon <ott2@users.noreply.github.com>
 Andrei <abetlen@gmail.com>
 Andrew Canis <andrew.canis@gmail.com>
 Andrew Downing <andrew2085@gmail.com>
 Andrew Duffy <a10y@users.noreply.github.com>
 Andrew Godfrey <AndrewGodfrey@users.noreply.github.com>
 Andy Tai <andy-tai@users.noreply.github.com>
 Arik Poznanski <arikpoz@users.noreply.github.com>
 Artem <guinmoon@gmail.com>
 Artem Zinnatullin <ceo@abstractny.gay>
 Artyom Lebedev <vagran.ast@gmail.com>
 Asbjørn Olling <asbjornolling@gmail.com>
 Ásgeir Bjarni Ingvarsson <asgeir@fundinn.org>
 Ashish <1856117+ashishdatta@users.noreply.github.com>
 Ashok Gelal <401055+ashokgelal@users.noreply.github.com>
 Ashraful Islam <ashraful.meche@gmail.com>
 Atsushi Tatsuma <yoshoku@outlook.com>
@ -57,35 +67,46 @@ BADR <contact@pythops.com>
 Bach Le <bach@bullno1.com>
 Bailey Chittle <39804642+bachittle@users.noreply.github.com>
 BarfingLemurs <128182951+BarfingLemurs@users.noreply.github.com>
 Bartowski <ckealty1182@gmail.com>
 Behnam M <58621210+ibehnam@users.noreply.github.com>
 Ben Ashbaugh <ben.ashbaugh@intel.com>
 Ben Garney <bengarney@users.noreply.github.com>
 Ben Siraphob <bensiraphob@gmail.com>
 Ben Williams <ben@719ben.com>
 Benjamin Findley <39356821+Kartoffelsaft@users.noreply.github.com>
 Benjamin Lecaillon <84293038+blecaillon@users.noreply.github.com>
 Bernat Vadell <hounter.caza@gmail.com>
 Bingan <70050083+binganao@users.noreply.github.com>
 Bodo Graumann <mail@bodograumann.de>
 Bono Lv <lvscar@users.noreply.github.com>
 Borislav Stanimirov <b.stanimirov@abv.bg>
 Branden Butler <bwtbutler@hotmail.com>
 Brian <mofosyne@gmail.com>
 Bruce MacDonald <brucewmacdonald@gmail.com>
 Bryan Honof <bryanhonof@gmail.com>
 CJ Pais <cj@cjpais.com>
 CRD716 <crd716@gmail.com>
 Calvin Laurenson <calvin@laurenson.dev>
 Cameron <csteele@steelecameron.com>
 Cameron Kaiser <classilla@users.noreply.github.com>
 Carolinabanana <140120812+Carolinabanana@users.noreply.github.com>
 Casey Primozic <casey@cprimozic.net>
 Casey Primozic <me@ameo.link>
 CausalLM <148736309+CausalLM@users.noreply.github.com>
 Cebtenzzre <cebtenzzre@gmail.com>
 Chad Brewbaker <crb002@gmail.com>
 Chao Jiang <jc19chaoj@zoho.com>
 Cheng Shao <terrorjack@type.dance>
 Chris Elrod <elrodc@gmail.com>
 Chris Kuehl <ckuehl@ckuehl.me>
 Christian Demsar <christian@github.email.demsar.us>
 Christian Demsar <crasm@git.vczf.us>
 Christian Falch <875252+chrfalch@users.noreply.github.com>
 Christian Kögler <ck3d@gmx.de>
 Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com>
 Clark Saben <76020733+csaben@users.noreply.github.com>
 Clint Herron <hanclinto@gmail.com>
 CrispStrobe <154636388+CrispStrobe@users.noreply.github.com>
 Cuong Trinh Manh <nguoithichkhampha@gmail.com>
 DAN™ <dranger003@gmail.com>
 Damian Stewart <d@damianstewart.com>
@ -95,8 +116,12 @@ Daniel Bevenius <daniel.bevenius@gmail.com>
 Daniel Drake <drake@endlessos.org>
 Daniel Hiltgen <dhiltgen@users.noreply.github.com>
 Daniel Illescas Romero <illescas.daniel@protonmail.com>
 Daniele <57776841+daniandtheweb@users.noreply.github.com>
 DannyDaemonic <DannyDaemonic@gmail.com>
 Dat Quoc Nguyen <2412555+datquocnguyen@users.noreply.github.com>
 Dave <dave-fl@users.noreply.github.com>
 Dave Airlie <airlied@gmail.com>
 Dave Airlie <airlied@redhat.com>
 Dave Della Costa <ddellacosta+github@gmail.com>
 David Friehs <david@friehs.info>
 David Kennedy <dakennedyd@gmail.com>
@ -104,10 +129,13 @@ David Pflug <david@pflug.email>
 David Renshaw <dwrenshaw@gmail.com>
 David Sommers <12738+databyte@users.noreply.github.com>
 David Yang <davidyang6us@gmail.com>
 Dawid Potocki <github@dawidpotocki.com>
 Dawid Wysocki <62249621+TortillaZHawaii@users.noreply.github.com>
 Dean <Dean.Sinaean@gmail.com>
 Deins <deinsegle@gmail.com>
 Deven Mistry <31466137+deven367@users.noreply.github.com>
 Didzis Gosko <didzis@users.noreply.github.com>
 Djip007 <djip.perois@free.fr>
 Don Mahurin <dmahurin@users.noreply.github.com>
 DooWoong Lee (David) <manics99@naver.com>
 Doomsdayrs <38189170+Doomsdayrs@users.noreply.github.com>
@ -116,8 +144,11 @@ Dr. Tom Murphy VII Ph.D <499244+tom7@users.noreply.github.com>
 Ebey Abraham <ebey97@gmail.com>
 Ed Lee <edilee@mozilla.com>
 Ed Lepedus <ed.lepedus@googlemail.com>
 Eddie-Wang <wangjinheng1120@163.com>
 Edward Taylor <edeetee@gmail.com>
 Elaine <elaine.zosa@gmail.com>
 Elbios <141279586+Elbios@users.noreply.github.com>
 Elton Kola <eltonkola@gmail.com>
 Engininja2 <139037756+Engininja2@users.noreply.github.com>
 Equim <sayaka@ekyu.moe>
 Eric Sommerlade <es0m@users.noreply.github.com>
@ -143,37 +174,47 @@ Firat <firatkiral@gmail.com>
 Folko-Ven <71110216+Folko-Ven@users.noreply.github.com>
 Foul-Tarnished <107711110+Foul-Tarnished@users.noreply.github.com>
 Francisco Melo <43780565+francis2tm@users.noreply.github.com>
 Frank Mai <thxcode0824@gmail.com>
 FrankHB <frankhb1989@gmail.com>
 Fred Douglas <43351173+fredlas@users.noreply.github.com>
 Frederik Vogel <Schaltfehler@users.noreply.github.com>
 Gabe Goodhart <gabe.l.hart@gmail.com>
 GainLee <perfecter.gen@gmail.com>
 Galunid <karolek1231456@gmail.com>
 Gary Linscott <glinscott@gmail.com>
 Gary Mulder <gjmulder@gmail.com>
 Gavin Zhao <gavinzhaojw@protonmail.com>
 Genkagaku.GPT <hlhr202@163.com>
 Georgi Gerganov <ggerganov@gmail.com>
 Gilad S <giladgd@users.noreply.github.com>
 Giuseppe Scrivano <giuseppe@scrivano.org>
 GiviMAD <GiviMAD@users.noreply.github.com>
 Govlzkoy <gotope@users.noreply.github.com>
 Guillaume "Vermeille" Sanchez <Guillaume.V.Sanchez@gmail.com>
 Guillaume Wenzek <gwenzek@users.noreply.github.com>
 Guoteng <32697156+SolenoidWGT@users.noreply.github.com>
 Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com>
 Haggai Nuchi <h.nuchi@gmail.com>
 Halalaluyafail3 <55773281+Halalaluyafail3@users.noreply.github.com>
 Hamdoud Hakem <90524568+hamdoudhakem@users.noreply.github.com>
 HanishKVC <hanishkvc@gmail.com>
 Haohui Mai <ricetons@gmail.com>
 Haoxiang Fei <tonyfettes@tonyfettes.com>
 Harald Fernengel <harald.fernengel@here.com>
 Hatsune Miku <129688334+at8u@users.noreply.github.com>
 HatsuneMikuUwU33 <173229399+HatsuneMikuUwU33@users.noreply.github.com>
 Henk Poley <HenkPoley@gmail.com>
 Henri Vasserman <henv@hot.ee>
 Henrik Forstén <henrik.forsten@gmail.com>
 Herman Semenov <GermanAizek@yandex.ru>
 Hesen Peng <hesen.peng@gmail.com>
 Hoang Nguyen <hugo53@users.noreply.github.com>
 Hong Bo PENG <penghb@cn.ibm.com>
 Hongyu Ouyang <96765450+casavaca@users.noreply.github.com>
 Howard Su <howard0su@gmail.com>
 Hua Jiang <allenhjiang@outlook.com>
 Huawei Lin <huaweilin.cs@gmail.com>
 Hugo Roussel <hugo.rous@gmail.com>
 Ian Bull <irbull@eclipsesource.com>
 Ian Bull <irbull@gmail.com>
 Ian Scrivener <github@zilogy.asia>
@ -190,8 +231,10 @@ Ivan Stepanov <ivanstepanovftw@gmail.com>
 JH23X <165871467+JH23X@users.noreply.github.com>
 Jack Mousseau <jmousseau@users.noreply.github.com>
 JackJollimore <130917767+JackJollimore@users.noreply.github.com>
 Jaemin Son <woalsdnd@gmail.com>
 Jag Chadha <jagtesh@gmail.com>
 Jakub N <jakubniemczyk97@gmail.com>
 James A Capozzoli <157492257+jac-jim@users.noreply.github.com>
 James Reynolds <magnusviri@users.noreply.github.com>
 Jan Boon <jan.boon@kaetemi.be>
 Jan Boon <kaetemi@gmail.com>
@ -205,12 +248,17 @@ Jean-Michaël Celerier <jeanmichael.celerier+github@gmail.com>
 Jed Fox <git@jedfox.com>
 Jeffrey Quesnelle <emozilla@nousresearch.com>
 Jesse Jojo Johnson <williamsaintgeorge@gmail.com>
 Jeximo <jeximo@gmail.com>
 Jhen-Jie Hong <iainst0409@gmail.com>
 Jiahao Li <liplus17@163.com>
 Jian Liao <jianliao@users.noreply.github.com>
 JidongZhang-THU <1119708529@qq.com>
 Jinwoo Jeong <33892306+williamjeong2@users.noreply.github.com>
 Jiří Podivín <66251151+jpodivin@users.noreply.github.com>
 Jiří Sejkora <Sejseloid@gmail.com>
 Joan Fontanals <jfontanalsmartinez@gmail.com>
 Joan Fontanals <joan.fontanals.martinez@jina.ai>
 Johan <JohanAR@users.noreply.github.com>
 Johannes Gäßler <johannesg@5d6.de>
 Johannes Rudolph <johannes.rudolph@gmail.com>
 John <78893154+cmp-nct@users.noreply.github.com>
@ -221,15 +269,19 @@ Jonas Wunderlich <32615971+jonas-w@users.noreply.github.com>
 Jorge A <161275481+jorgealias@users.noreply.github.com>
 Jose Maldonado <63384398+yukiteruamano@users.noreply.github.com>
 Joseph Stahl <1269177+josephst@users.noreply.github.com>
 Josh Ramer <josh.ramer@icloud.com>
 Joyce <joycebrum@google.com>
 Juan Calderon-Perez <835733+gaby@users.noreply.github.com>
 Judd <foldl@users.noreply.github.com>
 Julius Arkenberg <arki05@users.noreply.github.com>
 Jun Jie <71215065+junnjiee16@users.noreply.github.com>
 Junyang Lin <justinlin930319@hotmail.com>
 Juraj Bednar <juraj@bednar.io>
 Justin Parker <jparkerweb@gmail.com>
 Justin Suess <justin.suess@westpoint.edu>
 Justina Cho <justcho5@gmail.com>
 Justine Tunney <jtunney@gmail.com>
 Justine Tunney <jtunney@mozilla.com>
 Juuso Alasuutari <juuso.alasuutari@gmail.com>
 KASR <karim.asrih@gmail.com>
 Kamil Tomšík <info@tomsik.cz>
@ -242,6 +294,7 @@ Kawrakow <48489457+ikawrakow@users.noreply.github.com>
 Keiichi Tabata <keiichi.tabata@outlook.com>
 Kenvix ⭐ <kenvixzure@live.com>
 Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
 Kevin Gibbons <bakkot@gmail.com>
 Kevin Ji <1146876+kevinji@users.noreply.github.com>
 Kevin Kwok <antimatter15@gmail.com>
 Kevin Lo <kevlo@kevlo.org>
@ -257,6 +310,7 @@ Laura <Tijntje_7@msn.com>
 Lee <44310445+lx200916@users.noreply.github.com>
 Lee Drake <b.lee.drake@gmail.com>
 Leng Yue <lengyue@lengyue.me>
 Leon Knauer <git@leonknauer.com>
 LeonEricsson <70749762+LeonEricsson@users.noreply.github.com>
 Leonardo Neumann <leonardo@neumann.dev.br>
 Li Tan <tanliboy@gmail.com>
@ -265,20 +319,26 @@ LoganDark <github@logandark.mozmail.com>
 LostRuins <39025047+LostRuins@users.noreply.github.com>
 Luciano <lucianostrika44@gmail.com>
 Luo Tian <lt@basecity.com>
 Lyle Dean <dean@lyle.dev>
 M. Yusuf Sarıgöz <yusufsarigoz@gmail.com>
 Maarten ter Huurne <maarten@treewalker.org>
 Mack Straight <eiz@users.noreply.github.com>
 Maël Kerbiriou <m431.kerbiriou@gmail.com>
 MaggotHATE <clay1326@gmail.com>
 Manuel <44313466+makuche@users.noreply.github.com>
 Marc Köhlbrugge <subscriptions@marckohlbrugge.com>
 Marco Matthies <71844+marcom@users.noreply.github.com>
 Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com>
 Marian Cepok <marian.cepok@gmail.com>
 Mark Fairbairn <thebaron88@gmail.com>
 Marko Tasic <mtasic85@gmail.com>
 Markus Tavenrath <mtavenrath@users.noreply.github.com>
 Martin Delille <martin@delille.org>
 Martin Krasser <krasserm@googlemail.com>
 Martin Schwaighofer <mschwaig@users.noreply.github.com>
 Marvin Gießing <marvin.giessing@gmail.com>
 Masaya, Kato <62578291+msy-kato@users.noreply.github.com>
 MasterYi1024 <39848311+MasterYi1024@users.noreply.github.com>
 Mateusz Charytoniuk <mateusz.charytoniuk@protonmail.com>
 Matheus C. França <matheus-catarino@hotmail.com>
 Matheus Gabriel Alves Silva <matheusgasource@gmail.com>
@ -287,8 +347,11 @@ Mathijs de Bruin <mathijs@mathijsfietst.nl>
 Matt Clayton <156335168+mattjcly@users.noreply.github.com>
 Matt Pulver <matt.pulver@heavy.ai>
 Matteo Boschini <12133566+mbosc@users.noreply.github.com>
 Mattheus Chediak <shammcity00@gmail.com>
 Matthew Tejo <matthew.tejo@gmail.com>
 Matvey Soloviev <blackhole89@gmail.com>
 Max Krasnyansky <max.krasnyansky@gmail.com>
 Max Krasnyansky <quic_maxk@quicinc.com>
 Maxime <672982+maximegmd@users.noreply.github.com>
 Maximilian Winter <maximilian.winter.91@gmail.com>
 Meng Zhang <meng@tabbyml.com>
@ -300,32 +363,41 @@ Michael Kesper <mkesper@schokokeks.org>
 Michael Klimenko <mklimenko29@gmail.com>
 Michael Podvitskiy <podvitskiymichael@gmail.com>
 Michael Potter <NanoTekGuy@Gmail.com>
 Michael de Gans <michael.john.degans@gmail.com>
 Michaël de Vries <vriesdemichael@gmail.com>
 Mihai <mihai.chirculescu@yahoo.com>
 Mike <ytianhui2004@gmail.com>
 Mikko Juola <mikjuo@gmail.com>
 Minsoo Cheong <54794500+mscheong01@users.noreply.github.com>
 Mirko185 <mirkosig@gmail.com>
 Mirror Azure <54669636+MirrorAzure@users.noreply.github.com>
 Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com>
 Mohammadreza Hendiani <hendiani.mohammadreza@gmail.com>
 Mohammadreza Hendiani <mohammad.r.hendiani@gmail.com>
 Murilo Santana <mvrilo@gmail.com>
 Musab Gultekin <musabgultekin@users.noreply.github.com>
 Nam D. Tran <42194884+namtranase@users.noreply.github.com>
 Nathan Epstein <nate2@umbc.edu>
 NawafAlansari <72708095+NawafAlansari@users.noreply.github.com>
 Nebula <infinitewormhole@gmail.com>
 Neo Zhang <14088817+arthw@users.noreply.github.com>
 Neo Zhang <zhang.jianyu@outlook.com>
 Neo Zhang Jianyu <jianyu.zhang@intel.com>
 Neuman Vong <neuman.vong@gmail.com>
 Nexesenex <124105151+Nexesenex@users.noreply.github.com>
 Niall Coates <1349685+Niall-@users.noreply.github.com>
 Nicolai Weitkemper <kontakt@nicolaiweitkemper.de>
 Nicolás Pérez <nicolas_perez@brown.edu>
 Nigel Bosch <pnigelb@gmail.com>
 Niklas Korz <niklas@niklaskorz.de>
 Nikolas <127742645+nneubacher@users.noreply.github.com>
 Nindaleth <Nindaleth@users.noreply.github.com>
 Oleksandr Nikitin <oleksandr@tvori.info>
 Oleksii Maryshchenko <oleksii.maryshchenko@gmail.com>
 Olivier Chafik <ochafik@users.noreply.github.com>
 Ondřej Čertík <ondrej@certik.us>
 Ouadie EL FAROUKI <ouadie.elfarouki@codeplay.com>
 Patrice Ferlet <metal3d@gmail.com>
 Paul Tsochantaris <ptsochantaris@icloud.com>
 Pavol Rusnak <pavol@rusnak.io>
 Pedro Cuenca <pedro@huggingface.co>
@ -343,9 +415,14 @@ RJ Adriaansen <adriaansen@eshcc.eur.nl>
 Radoslav Gerganov <rgerganov@gmail.com>
 Radosław Gryta <radek.gryta@gmail.com>
 Rahul Vivek Nair <68507071+RahulVivekNair@users.noreply.github.com>
 Raj Hammeer Singh Hada <hammeerraj@gmail.com>
 Ralph Soika <ralph.soika@imixs.com>
 Rand Xie <randxiexyy29@gmail.com>
 Randall Fitzgerald <randall@dasaku.net>
 Reinforce-II <fate@eastal.com>
 Ren Xuancheng <jklj077@users.noreply.github.com>
 Rene Leonhardt <65483435+reneleonhardt@users.noreply.github.com>
 RhinoDevel <RhinoDevel@users.noreply.github.com>
 Riceball LEE <snowyu.lee@gmail.com>
 Richard Kiss <him@richardkiss.com>
 Richard Roberson <richardr1126@gmail.com>
@ -373,6 +450,7 @@ Rowan Hart <rowanbhart@gmail.com>
 Rune <43761327+Rune-AI@users.noreply.github.com>
 Ryan Landay <rlanday@gmail.com>
 Ryder Wishart <ryderwishart@gmail.com>
 Ryuei <louixs@users.noreply.github.com>
 Rőczey Barnabás <31726601+An0nie@users.noreply.github.com>
 SakuraUmi <yukinon244@gmail.com>
 Salvador E. Tropea <stropea@inti.gob.ar>
@ -386,6 +464,7 @@ SebastianApel <13675545+SebastianApel@users.noreply.github.com>
 Senemu <10880819+Senemu@users.noreply.github.com>
 Sergey Alirzaev <zl29ah@gmail.com>
 Sergio López <slp@sinrega.org>
 Sertaç Özercan <852750+sozercan@users.noreply.github.com>
 SeungWon Jeong <65549245+redlion0929@users.noreply.github.com>
 ShadovvBeast <ShadovvBeast@gmail.com>
 Shakhar Dasgupta <shakhardasgupta@gmail.com>
@ -394,6 +473,7 @@ Shijie <821898965@qq.com>
 Shintarou Okada <kokuzen@gmail.com>
 Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com>
 Shouzheng Liu <lshzh.hi@gmail.com>
 Shuichi Tsutsumi <shuichi0526@gmail.com>
 Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
 Simon Willison <swillison@gmail.com>
 Siwen Yu <yusiwen@gmail.com>
@ -405,11 +485,14 @@ Someone <sergei.kozlukov@aalto.fi>
 Someone Serge <sergei.kozlukov@aalto.fi>
 Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
 Spencer Sutton <spencersutton@users.noreply.github.com>
 Srihari-mcw <96763064+Srihari-mcw@users.noreply.github.com>
 Srinivas Billa <nivibilla@gmail.com>
 Stefan Sydow <stefan@sydow.email>
 Steffen Röcker <sroecker@gmail.com>
 Stephan Walter <stephan@walter.name>
 Stephen Nichols <snichols@users.noreply.github.com>
 Steve Grubb <ausearch.1@gmail.com>
 Steven Prichard <spprichard20@gmail.com>
 Steven Roussey <sroussey@gmail.com>
 Steward Garcia <57494570+FSSRepo@users.noreply.github.com>
 Suaj Carrot <72162667+SuajCarrot@users.noreply.github.com>
@ -434,16 +517,19 @@ Tom C <tom.corelis@gmail.com>
 Tom Jobbins <784313+TheBloke@users.noreply.github.com>
 Tomas <tom.tomas.36478119@gmail.com>
 Tomáš Pazdiora <tomas.pazdiora@gmail.com>
 Tristan Druyen <tristan@vault81.mozmail.com>
 Tristan Ross <rosscomputerguy@protonmail.com>
 Tungsten842 <886724vf@anonaddy.me>
 Tungsten842 <quantmint@protonmail.com>
 Tushar <ditsuke@protonmail.com>
 UEXTM.com <84163508+uextm@users.noreply.github.com>
 Ulrich Drepper <drepper@gmail.com>
 Uzo Nweke <uzoechi@gmail.com>
 Vaibhav Srivastav <vaibhavs10@gmail.com>
 Val Kharitonov <mail@kharvd.com>
 Valentin Konovalov <valle.ketsujin@gmail.com>
 Valentyn Bezshapkin <61702053+valentynbez@users.noreply.github.com>
 Victor Nogueira <felladrin@gmail.com>
 Victor Z. Peng <ziliangdotme@gmail.com>
 Vlad <spitfireage@gmail.com>
 Vladimir <bogdad@gmail.com>
@ -455,7 +541,9 @@ Weird Constructor <weirdconstructor@gmail.com>
 Welby Seely <welbyseely@gmail.com>
 Wentai Zhang <rchardx@gmail.com>
 WillCorticesAI <150854901+WillCorticesAI@users.noreply.github.com>
 William Tambellini <william.tambellini@gmail.com>
 Willy Tarreau <w@1wt.eu>
 Wouter <9594229+DifferentialityDevelopment@users.noreply.github.com>
 Wu Jian Ping <wujjpp@hotmail.com>
 Wu Jian Ping <wujp@greatld.com>
 Xiake Sun <xiake.sun@intel.com>
@ -466,6 +554,8 @@ Xiaoyi Chen <cxychina@gmail.com>
 Xingchen Song(宋星辰) <xingchensong1996@163.com>
 Xuan Son Nguyen <thichthat@gmail.com>
 Yann Follet <131855179+YannFollet@users.noreply.github.com>
 Yaroslav <yaroslav.yashin@me.com>
 Yazan Agha-Schrader <mountaiin@icloud.com>
 Yiming Cui <conandiy@vip.qq.com>
 Yishuo Wang <MeouSker77@outlook.com>
 Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com>
@ -477,6 +567,7 @@ Zane Shannon <z@zcs.me>
 Zay <95888118+isaiahbjork@users.noreply.github.com>
 Zenix <zenixls2@gmail.com>
 Zhang Peiyuan <a1286225768@gmail.com>
 Zheng.Deng <32841220+dengzheng-cloud@users.noreply.github.com>
 ZhouYuChen <zhouyuchen@naver.com>
 Ziad Ben Hadj-Alouane <zied.benhadjalouane@gmail.com>
 Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com>
@ -484,14 +575,18 @@ Zsapi <martin1.zsapka@gmail.com>
 a-n-n-a-l-e-e <150648636+a-n-n-a-l-e-e@users.noreply.github.com>
 adel boussaken <netdur@gmail.com>
 afrideva <95653597+afrideva@users.noreply.github.com>
 agray3 <agray3@users.noreply.github.com>
 akawrykow <142945436+akawrykow@users.noreply.github.com>
 alexpinel <93524949+alexpinel@users.noreply.github.com>
 alonfaraj <alonfaraj@gmail.com>
 alwqx <kenan3015@gmail.com>
 amd-lalithnc <lalithnc@amd.com>
 andrijdavid <david@geek.mg>
 anon998 <131767832+anon998@users.noreply.github.com>
 anzz1 <anzz1@live.com>
 apaz <aarpazdera@gmail.com>
 apcameron <37645737+apcameron@users.noreply.github.com>
 arch-btw <57669023+arch-btw@users.noreply.github.com>
 arcrank <arcrank@gmail.com>
 arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com>
 at8u <129688334+at8u@users.noreply.github.com>
@ -514,13 +609,17 @@ cocktailpeanut <121128867+cocktailpeanut@users.noreply.github.com>
 coezbek <c.oezbek@gmail.com>
 comex <comexk@gmail.com>
 compilade <113953597+compilade@users.noreply.github.com>
 compilade <git@compilade.net>
 cpumaxx <163466046+cpumaxx@users.noreply.github.com>
 crasm <crasm@git.vczf.net>
 crasm <crasm@git.vczf.us>
 daboe01 <daboe01@googlemail.com>
 david raistrick <keen99@users.noreply.github.com>
 ddh0 <dylanhalladay02@icloud.com>
 ddpasa <112642920+ddpasa@users.noreply.github.com>
 deepdiffuser <112834445+deepdiffuser@users.noreply.github.com>
 divinity76 <divinity76@gmail.com>
 dm4 <sunrisedm4@gmail.com>
 dotpy314 <33351922+dotpy314@users.noreply.github.com>
 drbh <david.richard.holtz@gmail.com>
 ds5t5 <145942675+ds5t5@users.noreply.github.com>
@ -529,6 +628,7 @@ eastriver <lee@eastriver.dev>
 ebraminio <ebraminio@gmail.com>
 eiery <19350831+eiery@users.noreply.github.com>
 eric8607242 <e0928021388@gmail.com>
 fairydreaming <166155368+fairydreaming@users.noreply.github.com>
 fraxy-v <65565042+fraxy-v@users.noreply.github.com>
 github-actions[bot] <github-actions[bot]@users.noreply.github.com>
 gliptic <gliptic@users.noreply.github.com>
@ -539,6 +639,7 @@ h-h-h-h <13482553+h-h-h-h@users.noreply.github.com>
 hankcs <cnhankmc@gmail.com>
 hoangmit <hoangmit@users.noreply.github.com>
 hongbo.mo <352280764@qq.com>
 hopkins385 <98618192+hopkins385@users.noreply.github.com>
 howlger <eclipse@voormann.de>
 howlger <github@voormann.de>
 hutli <6594598+hutli@users.noreply.github.com>
@ -549,14 +650,22 @@ hydai <z54981220@gmail.com>
 iSma <ismail.senhaji@gmail.com>
 iacore <74560659+iacore@users.noreply.github.com>
 igarnier <igarnier@protonmail.com>
 intelmatt <61025942+intelmatt@users.noreply.github.com>
 iohub <rickyang.pro@gmail.com>
 jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com>
 jaime-m-p <167997752+jaime-m-p@users.noreply.github.com>
 jameswu2014 <545426914@qq.com>
 jiez <373447296@qq.com>
 jneem <joeneeman@gmail.com>
 joecryptotoo <80373433+joecryptotoo@users.noreply.github.com>
 johnson442 <56517414+johnson442@users.noreply.github.com>
 jojorne <jojorne@users.noreply.github.com>
 jon-chuang <9093549+jon-chuang@users.noreply.github.com>
 jp-x-g <jpxg-dev@protonmail.com>
 jukofyork <69222624+jukofyork@users.noreply.github.com>
 junchao-loongson <68935141+junchao-loongson@users.noreply.github.com>
 jwj7140 <32943891+jwj7140@users.noreply.github.com>
 k.h.lai <adrian.k.h.lai@outlook.com>
 kaizau <kaizau@users.noreply.github.com>
 kalomaze <66376113+kalomaze@users.noreply.github.com>
 kang <tpdns9032100@gmail.com>
@ -575,11 +684,15 @@ ldwang <ftgreat@163.com>
 le.chang <cljs118@126.com>
 leejet <leejet714@gmail.com>
 limitedAtonement <limitedAtonement@users.noreply.github.com>
 liuwei-git <14815172+liuwei-git@users.noreply.github.com>
 lon <114724657+longregen@users.noreply.github.com>
 loonerin <132926317+loonerin@users.noreply.github.com>
 luoyu-intel <yu.luo@intel.com>
 m3ndax <adrian.goessl@outlook.com>
 maddes8cht <55592906+maddes8cht@users.noreply.github.com>
 makomk <makosoft@googlemail.com>
 manikbhandari <mbbhandarimanik2@gmail.com>
 maor-ps <154728172+maor-ps@users.noreply.github.com>
 mdrokz <mohammadmunshi@gmail.com>
 mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com>
 minarchist <minarchist@users.noreply.github.com>
@ -593,15 +706,19 @@ ngc92 <7938269+ngc92@users.noreply.github.com>
 nhamanasu <45545786+nhamanasu@users.noreply.github.com>
 niansa/tuxifan <anton-sa@web.de>
 niansa/tuxifan <tuxifan@posteo.de>
 nickp27 <nb.porter@gmail.com>
 ningshanwutuobang <ningshanwutuobang@gmail.com>
 nold <Nold360@users.noreply.github.com>
 nopperl <54780682+nopperl@users.noreply.github.com>
 nusu-github <29514220+nusu-github@users.noreply.github.com>
 olexiyb <olexiyb@gmail.com>
 omahs <73983677+omahs@users.noreply.github.com>
 oobabooga <112222186+oobabooga@users.noreply.github.com>
 opparco <parco.opaai@gmail.com>
 ostix360 <55257054+ostix360@users.noreply.github.com>
 pengxin99 <pengxin.yuan@intel.com>
 perserk <perserk@gmail.com>
 pmysl <piotr.myslinski@outlook.com>
 postmasters <namnguyen@google.com>
 pudepiedj <pudepiedj@gmail.com>
 qingfengfenga <41416092+qingfengfenga@users.noreply.github.com>
@ -614,16 +731,19 @@ rhuddleston <ryan.huddleston@percona.com>
 rimoliga <53384203+rimoliga@users.noreply.github.com>
 runfuture <runfuture@users.noreply.github.com>
 sandyiscool <sandyiscool@gmail.com>
 sasha0552 <admin@sasha0552.org>
 semidark <me@semidark.net>
 sharpHL <132747147+sharpHL@users.noreply.github.com>
 shibe2 <shibe@tuta.io>
 singularity <12184989+singularity-s0@users.noreply.github.com>
 sjinzh <sjinzh@gmail.com>
 sjxx <63994076+ylsdamxssjxxdd@users.noreply.github.com>
 slaren <2141330+slaren@users.noreply.github.com>
 slaren <slarengh@gmail.com>
 snadampal <87143774+snadampal@users.noreply.github.com>
 staviq <staviq@gmail.com>
 stduhpf <stephduh@live.fr>
 strawberrymelonpanda <152940198+strawberrymelonpanda@users.noreply.github.com>
 swittk <switt1995@gmail.com>
 takov751 <40316768+takov751@users.noreply.github.com>
 tarcey <cey.tarik@gmail.com>
@ -636,12 +756,16 @@ uint256_t <konndennsa@gmail.com>
 uint256_t <maekawatoshiki1017@gmail.com>
 unbounded <haakon@likedan.net>
 valiray <133289098+valiray@users.noreply.github.com>
 vik <vikhyatk@gmail.com>
 viric <viric@viric.name>
 vodkaslime <646329483@qq.com>
 vvhg1 <94630311+vvhg1@users.noreply.github.com>
 vxiiduu <73044267+vxiiduu@users.noreply.github.com>
 wbpxre150 <100937007+wbpxre150@users.noreply.github.com>
 whoreson <139810751+whoreson@users.noreply.github.com>
 woachk <24752637+woachk@users.noreply.github.com>
 wonjun Jang <strutive07@gmail.com>
 woodx <124784234+woodx9@users.noreply.github.com>
 wzy <32936898+Freed-Wu@users.noreply.github.com>
 xaedes <xaedes@gmail.com>
 xaedes <xaedes@googlemail.com>
@ -649,7 +773,10 @@ xloem <0xloem@gmail.com>
 yangli2 <yangli2@gmail.com>
 yuiseki <yuiseki@gmail.com>
 zakkor <edward.partenie@gmail.com>
 zhangkaihuo <zhangkaihuo@gmail.com>
 zhouwg <6889919+zhouwg@users.noreply.github.com>
 zhouwg <zhouwg2000@gmail.com>
 zrm <trustiosity.zrm@gmail.com>
 Ștefan-Gabriel Muscalu <legraphista@users.noreply.github.com>
 源文雨 <41315874+fumiama@users.noreply.github.com>
 Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com>
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
--- a/CMakePresets.json
+++ b/CMakePresets.json
@ -11,10 +11,23 @@
            "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
        }
    },
-
+    {
        "name": "sycl-base",
        "hidden": true,
        "generator": "Ninja",
        "binaryDir": "${sourceDir}/build-${presetName}",
        "cacheVariables": {
            "CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
            "CMAKE_CXX_COMPILER": "icx",
            "CMAKE_C_COMPILER": "cl",
            "GGML_SYCL": "ON",
            "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
        }
    },
    { "name": "debug",   "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
-    { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
+    { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
-    { "name": "static",  "hidden": true, "cacheVariables": { "LLAMA_STATIC": "ON" } },
+    { "name": "reldbg",  "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
    { "name": "static",  "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } },
    {
        "name": "arm64-windows-msvc", "hidden": true,
@ -35,15 +48,18 @@
    },
    { "name": "arm64-windows-llvm-debug"  , "inherits": [ "base", "arm64-windows-llvm",  "debug"   ] },
-    { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm",  "release" ] },
+    { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm",  "reldbg" ] },
-    { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm",  "release", "static" ] },
+    { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm",  "reldbg", "static" ] },
    { "name": "arm64-windows-msvc-debug"  , "inherits": [ "base", "arm64-windows-msvc",  "debug"   ] },
-    { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc",  "release" ] },
+    { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc",  "reldbg" ] },
-    { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc",  "release", "static" ] },
+    { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc",  "reldbg", "static" ] },
    { "name": "x64-windows-msvc-debug"  , "inherits": [ "base", "debug"   ] },
-    { "name": "x64-windows-msvc-release", "inherits": [ "base", "release" ] },
+    { "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] },
-    { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "release", "static" ] }
+    { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },
    { "name": "x64-windows-sycl-debug"  , "inherits": [ "sycl-base", "debug"   ] },
    { "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] }
  ]
 }
--- a/1043
+++ b/1043
--- a/Package.swift
+++ b/Package.swift
@ -3,14 +3,13 @@
 import PackageDescription
 var sources = [
-    "ggml.c",
+    "src/llama.cpp",
-    "sgemm.cpp",
+    "src/unicode.cpp",
-    "llama.cpp",
+    "src/unicode-data.cpp",
-    "unicode.cpp",
+    "ggml/src/ggml.c",
-    "unicode-data.cpp",
+    "ggml/src/ggml-alloc.c",
-    "ggml-alloc.c",
+    "ggml/src/ggml-backend.c",
-    "ggml-backend.c",
+    "ggml/src/ggml-quants.c",
    "ggml-quants.c",
 ]
 var resources: [Resource] = []
@ -26,8 +25,8 @@ var cSettings: [CSetting] =  [
 ]
 #if canImport(Darwin)
-sources.append("ggml-metal.m")
+sources.append("ggml/src/ggml-metal.m")
-resources.append(.process("ggml-metal.metal"))
+resources.append(.process("ggml/src/ggml-metal.metal"))
 linkerSettings.append(.linkedFramework("Accelerate"))
 cSettings.append(
    contentsOf: [
@ -63,8 +62,6 @@ let package = Package(
               "models",
               "tests",
               "CMakeLists.txt",
               "ggml-cuda.cu",
               "ggml-cuda.h",
               "Makefile"
            ],
            sources: sources,
--- a/README-sycl.md
+++ b/README-sycl.md
@ -115,12 +115,12 @@ The docker build option is currently limited to *intel GPU* targets.
 ### Build image
 ```sh
 # Using FP16
-docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/llama-cli-intel.Dockerfile .
+docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" -f .devops/llama-cli-intel.Dockerfile .
 ```
 *Notes*:
-To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="LLAMA_SYCL_F16=ON"` argument from the previous command.
+To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="GGML_SYCL_F16=ON"` argument from the previous command.
 You can also use the `.devops/llama-server-intel.Dockerfile`, which builds the *"server"* alternative.
@ -244,10 +244,10 @@ source /opt/intel/oneapi/setvars.sh
 # Build LLAMA with MKL BLAS acceleration for intel GPU
 # Option 1: Use FP32 (recommended for better performance in most cases)
-cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 # Option 2: Use FP16
-cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
+cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
 # build all binary
 cmake --build build --config Release -j -v
@ -264,10 +264,10 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
 # Build LLAMA with Nvidia BLAS acceleration through SYCL
 # Option 1: Use FP32 (recommended for better performance in most cases)
-cmake -B build -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 # Option 2: Use FP16
-cmake -B build -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
+cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
 # build all binary
 cmake --build build --config Release -j -v
@ -410,15 +410,9 @@ Output (example):
 4. Install build tools
-a. Download & install cmake for Windows: https://cmake.org/download/
+a. Download & install cmake for Windows: https://cmake.org/download/ (CMake can also be installed from Visual Studio Installer)
 b. The new Visual Studio will install Ninja as default. (If not, please install it manually: https://ninja-build.org/)
 b. Download & install mingw-w64 make for Windows provided by w64devkit
 - Download the 1.19.0 version of [w64devkit](https://github.com/skeeto/w64devkit/releases/download/v1.19.0/w64devkit-1.19.0.zip).
 - Extract `w64devkit` on your pc.
 - Add the **bin** folder path in the Windows system PATH environment (for e.g. `C:\xxx\w64devkit\bin\`).
 ### II. Build llama.cpp
@ -428,10 +422,10 @@ On the oneAPI command line window, step into the llama.cpp main directory and ru
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
 # Option 1: Use FP32 (recommended for better performance in most cases)
-cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release
+cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release
 # Option 2: Or FP16
-cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
+cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
 cmake --build build --config Release -j
 ```
@ -441,9 +435,23 @@ Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former in
 .\examples\sycl\win-build-sycl.bat
 ```
 Or, use CMake presets to build:
 ```sh
 cmake --preset x64-windows-sycl-release
 cmake --build build-x64-windows-sycl-release -j --target llama-cli
 cmake -DGGML_SYCL_F16=ON --preset x64-windows-sycl-release
 cmake --build build-x64-windows-sycl-release -j --target llama-cli
 cmake --preset x64-windows-sycl-debug
 cmake --build build-x64-windows-sycl-debug -j --target llama-cli
 ```
 Or, you can use Visual Studio to open llama.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project.
 *Notes:*
- By default, calling `make` will build all target binary files. In case of a minimal experimental setup, the user can build the inference executable only through `make llama-cli`.
+- In case of a minimal experimental setup, the user can build the inference executable only through `cmake --build build --config Release -j --target llama-cli`.
 ### III. Run the inference
@ -536,9 +544,9 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 | Name               | Value                             | Function                                    |
 |--------------------|-----------------------------------|---------------------------------------------|
-| LLAMA_SYCL         | ON (mandatory)                    | Enable build with SYCL code path.           |
+| GGML_SYCL          | ON (mandatory)                    | Enable build with SYCL code path.           |
-| LLAMA_SYCL_TARGET  | INTEL *(default)* \| NVIDIA       | Set the SYCL target device type.            |
+| GGML_SYCL_TARGET   | INTEL *(default)* \| NVIDIA       | Set the SYCL target device type.            |
-| LLAMA_SYCL_F16     | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path.      |
+| GGML_SYCL_F16      | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path.      |
 | CMAKE_C_COMPILER   | icx                               | Set *icx* compiler for SYCL code path.      |
 | CMAKE_CXX_COMPILER | icpx *(Linux)*, icx *(Windows)*   | Set `icpx/icx` compiler for SYCL code path. |
--- a/README.md
+++ b/README.md
@ -15,6 +15,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
 ### Recent API changes
 - [2024 Jun 26] The source code and CMake build scripts have been restructured https://github.com/ggerganov/llama.cpp/pull/8006
 - [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807
 - [2024 Apr 4] State and session file functions reorganized under `llama_state_*` https://github.com/ggerganov/llama.cpp/pull/6341
 - [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122
@ -107,6 +108,7 @@ Typically finetunes of the base models below are supported as well.
 - [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon)
 - [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
 - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
 - [X] [BERT](https://github.com/ggerganov/llama.cpp/pull/5423)
 - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
 - [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft)
 - [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)
@ -216,6 +218,11 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 **Tools:**
 - [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
 - [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
 **Infrastructure:**
 - [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
 ---
@ -415,7 +422,7 @@ Flox follows the nixpkgs build of llama.cpp.
 ### Metal Build
 On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
-To disable the Metal build at compile time use the `LLAMA_NO_METAL=1` flag or the `LLAMA_METAL=OFF` cmake option.
+To disable the Metal build at compile time use the `GGML_NO_METAL=1` flag or the `GGML_METAL=OFF` cmake option.
 When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line
 argument.
@ -435,7 +442,7 @@ Building the program with BLAS support may lead to some performance improvements
  - Using `make`:
    - On Linux:
      ```bash
-      make LLAMA_OPENBLAS=1
+      make GGML_OPENBLAS=1
      ```
    - On Windows:
@ -450,13 +457,13 @@ Building the program with BLAS support may lead to some performance improvements
      8. From here you can run:
          ```bash
-          make LLAMA_OPENBLAS=1
+          make GGML_OPENBLAS=1
          ```
  - Using `CMake` on Linux:
      ```bash
-      cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
+      cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
      cmake --build build --config Release
      ```
@ -475,10 +482,10 @@ Building the program with BLAS support may lead to some performance improvements
  Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./README-sycl.md).
  - Using manual oneAPI installation:
-    By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps:
+    By default, `GGML_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DGGML_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps:
      ```bash
      source /opt/intel/oneapi/setvars.sh # You can skip this step if  in oneapi-basekit docker image, only required for manual installation
-      cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
+      cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_NATIVE=ON
      cmake --build build --config Release
      ```
@ -495,27 +502,28 @@ Building the program with BLAS support may lead to some performance improvements
  - Using `make`:
    ```bash
-    make LLAMA_CUDA=1
+    make GGML_CUDA=1
    ```
  - Using `CMake`:
    ```bash
-    cmake -B build -DLLAMA_CUDA=ON
+    cmake -B build -DGGML_CUDA=ON
    cmake --build build --config Release
    ```
  The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
  | Option                        | Legal values           | Default | Description                                                                                                                                                                                                                                                                             |
-  |--------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+  |-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-  | LLAMA_CUDA_FORCE_DMMV          | Boolean                | false   | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
+  | GGML_CUDA_FORCE_DMMV          | Boolean                | false   | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
-  | LLAMA_CUDA_DMMV_X              | Positive integer >= 32 | 32      | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants.                                         |
+  | GGML_CUDA_DMMV_X              | Positive integer >= 32 | 32      | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants.                                         |
-  | LLAMA_CUDA_MMV_Y               | Positive integer       | 1       | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended.                                               |
+  | GGML_CUDA_MMV_Y               | Positive integer       | 1       | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended.                                                                                                                                         |
-  | LLAMA_CUDA_FORCE_MMQ           | Boolean                | false   | Force the use of dequantization + matrix multiplication kernels instead of leveraging Math libraries. |                                                                                                                                         |
+  | GGML_CUDA_FORCE_MMQ           | Boolean                | false   | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower.                       |
-  | LLAMA_CUDA_F16                 | Boolean                | false   | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs.                                                           |
+  | GGML_CUDA_FORCE_CUBLAS        | Boolean                | false   | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models                                                                                                                                                                                       |
-  | LLAMA_CUDA_KQUANTS_ITER        | 1 or 2                 | 2       | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs.                                                                                                                     |
+  | GGML_CUDA_F16                 | Boolean                | false   | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs.                                                           |
-  | LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer       | 128     | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial.                                                                         |
+  | GGML_CUDA_KQUANTS_ITER        | 1 or 2                 | 2       | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs.                                                                                                                     |
-  | LLAMA_CUDA_FA_ALL_QUANTS       | Boolean                | false   | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer.                                                                                                  |
+  | GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer       | 128     | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial.                                                                         |
  | GGML_CUDA_FA_ALL_QUANTS       | Boolean                | false   | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer.                                                                                                  |
 - #### hipBLAS
@ -525,15 +533,15 @@ Building the program with BLAS support may lead to some performance improvements
  - Using `make`:
    ```bash
-    make LLAMA_HIPBLAS=1
+    make GGML_HIPBLAS=1
    ```
  - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
    ```bash
    HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
-        cmake -S . -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
+        cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
        && cmake --build build --config Release -- -j 16
    ```
-    On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON`.
+    On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
    However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
    Note that if you get the following error:
@ -547,19 +555,19 @@ Building the program with BLAS support may lead to some performance improvements
    ```bash
    HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
    HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
-        cmake -S . -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
+        cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
        && cmake --build build -- -j 16
    ```
  - Using `make` (example for target gfx1030, build with 16 CPU threads):
    ```bash
-    make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
+    make -j16 GGML_HIPBLAS=1 GGML_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
    ```
  - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
    ```bash
    set PATH=%HIP_PATH%\bin;%PATH%
-    cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
+    cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
    cmake --build build
    ```
    Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
@ -571,10 +579,10 @@ Building the program with BLAS support may lead to some performance improvements
  The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
  | Option                 | Legal values           | Default | Description                                                                                                                                                                                                                                    |
-  |-------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+  |------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-  | LLAMA_CUDA_DMMV_X       | Positive integer >= 32 | 32      | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
+  | GGML_CUDA_DMMV_X       | Positive integer >= 32 | 32      | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
-  | LLAMA_CUDA_MMV_Y        | Positive integer       | 1       | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants.                                                                       |
+  | GGML_CUDA_MMV_Y        | Positive integer       | 1       | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants.                                                                       |
-  | LLAMA_CUDA_KQUANTS_ITER | 1 or 2                 | 2       | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs.                                                                             |
+  | GGML_CUDA_KQUANTS_ITER | 1 or 2                 | 2       | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs.                                                                             |
 - #### Vulkan
@ -612,7 +620,7 @@ Building the program with BLAS support may lead to some performance improvements
  Then, build llama.cpp using the cmake command below:
  ```bash
-  cmake -B build -DLLAMA_VULKAN=1
+  cmake -B build -DGGML_VULKAN=1
  cmake --build build --config Release
  # Test the output binary (with "-ngl 33" to offload all layers to GPU)
  ./bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
--- a/ci/run.sh
+++ b/ci/run.sh
@ -36,11 +36,11 @@ SRC=`pwd`
 CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON"
 if [ ! -z ${GG_BUILD_METAL} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_METAL_SHADER_DEBUG=ON"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
 fi
 if [ ! -z ${GG_BUILD_CUDA} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUDA=1"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=1"
 fi
 if [ ! -z ${GG_BUILD_SYCL} ]; then
@ -50,7 +50,7 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then
        exit 1
    fi
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
 fi
 ## helpers
@ -284,7 +284,7 @@ function gg_run_open_llama_7b_v2 {
    set -e
-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j                                                           ) 2>&1 | tee -a $OUT/${ci}-make.log
    python3 ../examples/convert-legacy-llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
@ -550,7 +550,7 @@ function gg_run_pythia_2_8b {
    set -e
-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j                                                           ) 2>&1 | tee -a $OUT/${ci}-make.log
    python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
--- a/scripts/build-info.cmake
+++ b/scripts/build-info.cmake
--- a/cmake/git-vars.cmake
+++ b/cmake/git-vars.cmake
@ -0,0 +1,22 @@
 find_package(Git)
 # the commit's SHA1
 execute_process(COMMAND
    "${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=8
    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
    OUTPUT_VARIABLE GIT_SHA1
    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
 # the date of the commit
 execute_process(COMMAND
    "${GIT_EXECUTABLE}" log -1 --format=%ad --date=local
    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
    OUTPUT_VARIABLE GIT_DATE
    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
 # the subject of the commit
 execute_process(COMMAND
    "${GIT_EXECUTABLE}" log -1 --format=%s
    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
    OUTPUT_VARIABLE GIT_COMMIT_SUBJECT
    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
--- a/scripts/LlamaConfig.cmake.in
+++ b/scripts/LlamaConfig.cmake.in
@ -2,11 +2,12 @@ set(LLAMA_VERSION @LLAMA_INSTALL_VERSION@)
 set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
 set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
 set(LLAMA_SHARED_LIB   @BUILD_SHARED_LIBS@)
-set(LLAMA_BLAS @LLAMA_BLAS@)
+
-set(LLAMA_CUDA @LLAMA_CUDA@)
+set(GGML_BLAS       @GGML_BLAS@)
-set(LLAMA_METAL @LLAMA_METAL@)
+set(GGML_CUDA       @GGML_CUDA@)
-set(LLAMA_HIPBLAS @LLAMA_HIPBLAS@)
+set(GGML_METAL      @GGML_METAL@)
-set(LLAMA_ACCELERATE @LLAMA_ACCELERATE@)
+set(GGML_HIPBLAS    @GGML_HIPBLAS@)
 set(GGML_ACCELERATE @GGML_ACCELERATE@)
@PACKAGE_INIT@
@ -17,25 +18,26 @@ set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")
 # Ensure transient dependencies satisfied
 find_package(Threads REQUIRED)
-if (APPLE AND LLAMA_ACCELERATE)
+
 if (APPLE AND GGML_ACCELERATE)
    find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
 endif()
-if (LLAMA_BLAS)
+if (GGML_BLAS)
    find_package(BLAS REQUIRED)
 endif()
-if (LLAMA_CUDA)
+if (GGML_CUDA)
    find_package(CUDAToolkit REQUIRED)
 endif()
-if (LLAMA_METAL)
+if (GGML_METAL)
    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
    find_library(METAL_FRAMEWORK Metal REQUIRED)
    find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
 endif()
-if (LLAMA_HIPBLAS)
+if (GGML_HIPBLAS)
    find_package(hip REQUIRED)
    find_package(hipblas REQUIRED)
    find_package(rocblas REQUIRED)
@ -47,7 +49,9 @@ find_library(llama_LIBRARY llama
 set(_llama_link_deps "Threads::Threads" "@LLAMA_EXTRA_LIBS@")
 set(_llama_transient_defines "@LLAMA_TRANSIENT_DEFINES@")
 add_library(llama UNKNOWN IMPORTED)
 set_target_properties(llama
    PROPERTIES
        INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@ -1,5 +1,6 @@
 # common
 find_package(Threads REQUIRED)
 # Build info header
 #
@ -36,7 +37,7 @@ add_custom_command(
    COMMENT "Generating build details from Git"
    COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
            -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
-            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/gen-build-info-cpp.cmake"
+            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake"
    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
    DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
    VERBATIM
--- a/common/cmake/build-info-gen-cpp.cmake
+++ b/common/cmake/build-info-gen-cpp.cmake
@ -1,4 +1,4 @@
-include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
+include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
 set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in")
 set(OUTPUT_FILE   "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp")
--- a/common/common.cpp
+++ b/common/common.cpp
--- a/common/common.h
+++ b/common/common.h
@ -52,6 +52,12 @@ int32_t cpu_get_num_math();
 // CLI argument parsing
 //
 // dimensionality reduction methods, used by cvector-generator
 enum dimre_method {
    DIMRE_METHOD_PCA,
    DIMRE_METHOD_MEAN,
 };
 struct gpt_params {
    uint32_t seed                 = LLAMA_DEFAULT_SEED; // RNG seed
@ -152,7 +158,6 @@ struct gpt_params {
    bool prompt_cache_all  = false; // save user input and generations to prompt cache
    bool prompt_cache_ro   = false; // open the prompt cache read-only and do not update it
    bool embedding         = false; // get only sentence embedding
    bool escape            = true;  // escape "\n", "\r", "\t", "\'", "\"", and "\\"
    bool multiline_input   = false; // reverse the usage of `\`
    bool simple_io         = false; // improves compatibility with subprocesses and limited consoles
@ -179,6 +184,12 @@ struct gpt_params {
    std::string mmproj = "";        // path to multimodal projector
    std::vector<std::string> image; // path to image file(s)
    // embedding
    bool embedding         = false; // get only sentence embedding
    int32_t embd_normalize = 2;     // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
    std::string embd_out   = "";    // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
    std::string embd_sep   = "\n";  // separator of embendings
    // server params
    int32_t port           = 8080;         // server listens on this network port
    int32_t timeout_read   = 600;          // http read timeout in seconds
@ -189,6 +200,7 @@ struct gpt_params {
    std::string public_path   = "";
    std::string chat_template = "";
    std::string system_prompt = "";
    bool enable_chat_template = true;
    std::vector<std::string> api_keys;
@ -233,13 +245,14 @@ struct gpt_params {
    bool compute_ppl    = true;  // whether to compute perplexity
    // cvector-generator params
-    int n_completions = 64;
+    int n_pca_batch = 100;
    int n_pca_batch = 20;
    int n_pca_iterations = 1000;
    dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
    std::string cvector_outfile       = "control_vector.gguf";
    std::string cvector_completions_file = "examples/cvector-generator/completions.txt";
    std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
    std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
    bool spm_infill = false; // suffix/prefix/middle pattern for infill
 };
 void gpt_params_handle_model_default(gpt_params & params);
@ -352,9 +365,34 @@ bool llama_should_add_bos_token(const llama_model * model);
 // Chat template utils
 //
 // same with llama_chat_message, but uses std::string
 struct llama_chat_msg {
    std::string role;
    std::string content;
 };
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
 bool llama_chat_verify_template(const std::string & tmpl);
 // CPP wrapper for llama_chat_apply_template
 // If the built-in template is not supported, we default to chatml
 // If the custom "tmpl" is not supported, we throw an error
 std::string llama_chat_apply_template(const struct llama_model * model,
        const std::string & tmpl,
        const std::vector<llama_chat_msg> & chat,
        bool add_ass);
 // Format single message, while taking into account the position of that message in chat history
 std::string llama_chat_format_single(const struct llama_model * model,
        const std::string & tmpl,
        const std::vector<llama_chat_msg> & past_msg,
        const llama_chat_msg & new_msg,
        bool add_ass);
 // Returns an example of formatted chat
 std::string llama_chat_format_example(const struct llama_model * model,
        const std::string & tmpl);
 //
 // KV cache utils
 //
@ -369,7 +407,7 @@ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_siz
 // Embedding utils
 //
-void llama_embd_normalize(const float * inp, float * out, int n);
+void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
 float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
@ -413,4 +451,3 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha
 void yaml_dump_non_result_info(
    FILE * stream, const gpt_params & params, const llama_context * lctx,
    const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@ -40,6 +40,233 @@ static std::string build_repetition(const std::string & item_rule, int min_items
    return result;
 }
 /* Minimalistic replacement for std::string_view, which is only available from C++17 onwards */
 class string_view {
    const std::string & _str;
    const size_t _start;
    const size_t _end;
 public:
    string_view(const std::string & str, size_t start = 0, size_t end  = std::string::npos) : _str(str), _start(start), _end(end == std::string::npos ? str.length() : end) {}
    size_t size() const {
        return _end - _start;
    }
    size_t length() const {
        return size();
    }
    operator std::string() const {
        return str();
    }
    std::string str() const {
        return _str.substr(_start, _end - _start);
    }
    string_view substr(size_t pos, size_t len = std::string::npos) const {
        return string_view(_str, _start + pos, len == std::string::npos ? _end : _start + pos + len);
    }
    char operator[](size_t pos) const {
        auto index = _start + pos;
        if (index >= _end) {
            throw std::out_of_range("string_view index out of range");
        }
        return _str[_start + pos];
    }
    bool operator==(const string_view & other) const {
        std::string this_str = *this;
        std::string other_str = other;
        return this_str == other_str;
    }
 };
 static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
    auto has_min = min_value != std::numeric_limits<int>::min();
    auto has_max = max_value != std::numeric_limits<int>::max();
    auto digit_range = [&](char from, char to) {
        out << "[";
        if (from == to) {
            out << from;
        } else {
            out << from << "-" << to;
        }
        out << "]";
    };
    auto more_digits = [&](int min_digits, int max_digits) {
        out << "[0-9]";
        if (min_digits == max_digits && min_digits == 1) {
            return;
        }
        out << "{";
        out << min_digits;
        if (max_digits != min_digits) {
            out << ",";
            if (max_digits != std::numeric_limits<int>::max()) {
                out << max_digits;
            }
        }
        out << "}";
    };
    std::function<void(const string_view &, const string_view &)> uniform_range =
        [&](const string_view & from, const string_view & to) {
            size_t i = 0;
            while (i < from.length() && i < to.length() && from[i] == to[i]) {
                i++;
            }
            if (i > 0) {
                out << "\"" << from.substr(0, i).str() << "\"";
            }
            if (i < from.length() && i < to.length()) {
                if (i > 0) {
                    out << " ";
                }
                auto sub_len = from.length() - i - 1;
                if (sub_len > 0) {
                    auto from_sub = from.substr(i + 1);
                    auto to_sub = to.substr(i + 1);
                    auto sub_zeros = repeat("0", sub_len);
                    auto sub_nines = repeat("9", sub_len);
                    auto to_reached = false;
                    out << "(";
                    if (from_sub == sub_zeros) {
                        digit_range(from[i], to[i] - 1);
                        out << " ";
                        more_digits(sub_len, sub_len);
                    } else {
                        out << "[" << from[i] << "] ";
                        out << "(";
                        uniform_range(from_sub, sub_nines);
                        out << ")";
                        if (from[i] < to[i] - 1) {
                            out << " | ";
                            if (to_sub == sub_nines) {
                                digit_range(from[i] + 1, to[i]);
                                to_reached = true;
                            } else {
                                digit_range(from[i] + 1, to[i] - 1);
                            }
                            out << " ";
                            more_digits(sub_len, sub_len);
                        }
                    }
                    if (!to_reached) {
                        out << " | ";
                        digit_range(to[i], to[i]);
                        out << " ";
                        uniform_range(sub_zeros, to_sub);
                    }
                    out << ")";
                } else {
                    out << "[" << from[i] << "-" << to[i] << "]";
                }
            }
        };
    if (has_min && has_max) {
        if (min_value < 0 && max_value < 0) {
            out << "\"-\" (";
            _build_min_max_int(-max_value, -min_value, out, decimals_left, /* top_level= */ true);
            out << ")";
            return;
        }
        if (min_value < 0) {
            out << "\"-\" (";
            _build_min_max_int(0, -min_value, out, decimals_left, /* top_level= */ true);
            out << ") | ";
            min_value = 0;
        }
        auto min_s = std::to_string(min_value);
        auto max_s = std::to_string(max_value);
        auto min_digits = min_s.length();
        auto max_digits = max_s.length();
        for (auto digits = min_digits; digits < max_digits; digits++) {
            uniform_range(min_s, repeat("9", digits));
            min_s = "1" + repeat("0", digits);
            out << " | ";
        }
        uniform_range(min_s, max_s);
        return;
    }
    auto less_decimals = std::max(decimals_left - 1, 1);
    if (has_min) {
        if (min_value < 0) {
            out << "\"-\" (";
            _build_min_max_int(std::numeric_limits<int>::min(), -min_value, out, decimals_left, /* top_level= */ false);
            out << ") | [0] | [1-9] ";
            more_digits(0, decimals_left - 1);
        } else if (min_value == 0) {
            if (top_level) {
                out << "[0] | [1-9] ";
                more_digits(0, less_decimals);
            } else {
                more_digits(1, decimals_left);
            }
        } else if (min_value <= 9) {
            char c = '0' + min_value;
            auto range_start = top_level ? '1' : '0';
            if (c > range_start) {
                digit_range(range_start, c - 1);
                out << " ";
                more_digits(1, less_decimals);
                out << " | ";
            }
            digit_range(c, '9');
            out << " ";
            more_digits(0, less_decimals);
        } else {
            auto min_s = std::to_string(min_value);
            auto len = min_s.length();
            auto c = min_s[0];
            if (c > '1') {
                digit_range(top_level ? '1' : '0', c - 1);
                out << " ";
                more_digits(len, less_decimals);
                out << " | ";
            }
            digit_range(c, c);
            out << " (";
            _build_min_max_int(std::stoi(min_s.substr(1)), std::numeric_limits<int>::max(), out, less_decimals, /* top_level= */ false);
            out << ")";
            if (c < '9') {
                out << " | ";
                digit_range(c + 1, '9');
                out << " ";
                more_digits(len - 1, less_decimals);
            }
        }
        return;
    }
    if (has_max) {
        if (max_value >= 0) {
            if (top_level) {
                out << "\"-\" [1-9] ";
                more_digits(0, less_decimals);
                out << " | ";
            }
            _build_min_max_int(0, max_value, out, decimals_left, /* top_level= */ true);
        } else {
            out << "\"-\" (";
            _build_min_max_int(-max_value, std::numeric_limits<int>::max(), out, decimals_left, /* top_level= */ false);
            out << ")";
        }
        return;
    }
    throw std::runtime_error("At least one of min_value or max_value must be set");
 }
 const std::string SPACE_RULE = "| \" \" | \"\\n\" [ \\t]{0,20}";
 struct BuiltinRule {
@ -89,7 +316,7 @@ std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
 };
 std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
-std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};
+std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};
 template <typename Iterator>
 std::string join(Iterator begin, Iterator end, const std::string & separator) {
@ -160,7 +387,6 @@ static std::string format_literal(const std::string & literal) {
    return "\"" + escaped + "\"";
 }
 class SchemaConverter {
 private:
    std::function<json(const std::string &)> _fetch_json;
@ -388,6 +614,75 @@ private:
        return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space");
    }
    /*
        Returns a rule that matches a JSON string that is none of the provided strings
        not_strings({"a"})
            -> ["] ( [a] char+ | [^"a] char* )? ["] space
        not_strings({"and", "also"})
            -> ["] ( [a] ([l] ([s] ([o] char+ | [^"o] char*) | [^"s] char*) | [n] ([d] char+ | [^"d] char*) | [^"ln] char*) | [^"a] char* )? ["] space
    */
    std::string _not_strings(const std::vector<std::string> & strings) {
        struct TrieNode {
            std::map<char, TrieNode> children;
            bool is_end_of_string;
            TrieNode() : is_end_of_string(false) {}
            void insert(const std::string & string) {
                auto node = this;
                for (char c : string) {
                    node = &node->children[c];
                }
                node->is_end_of_string = true;
            }
        };
        TrieNode trie;
        for (const auto & s : strings) {
            trie.insert(s);
        }
        std::string char_rule = _add_primitive("char", PRIMITIVE_RULES.at("char"));
        std::ostringstream out;
        out << "[\"] ( ";
        std::function<void(const TrieNode &)> visit = [&](const TrieNode & node) {
            std::ostringstream rejects;
            auto first = true;
            for (const auto & kv : node.children) {
                rejects << kv.first;
                if (first) {
                    first = false;
                } else {
                    out << " | ";
                }
                out << "[" << kv.first << "]";
                if (!kv.second.children.empty()) {
                    out << " (";
                    visit(kv.second);
                    out << ")";
                } else if (kv.second.is_end_of_string) {
                    out << " " << char_rule << "+";
                }
            }
            if (!node.children.empty()) {
                if (!first) {
                    out << " | ";
                }
                out << "[^\"" << rejects.str() << "] " << char_rule << "*";
            }
        };
        visit(trie);
        out << " )";
        if (!trie.is_end_of_string) {
            out << "?";
        }
        out << " [\"] space";
        return out.str();
    }
    std::string _resolve_ref(const std::string & ref) {
        std::string ref_name = ref.substr(ref.find_last_of('/') + 1);
        if (_rules.find(ref_name) == _rules.end() && _refs_being_resolved.find(ref) == _refs_being_resolved.end()) {
@ -408,6 +703,7 @@ private:
        std::vector<std::string> required_props;
        std::vector<std::string> optional_props;
        std::unordered_map<std::string, std::string> prop_kv_rule_names;
        std::vector<std::string> prop_names;
        for (const auto & kv : properties) {
            const auto &prop_name = kv.first;
            const auto &prop_schema = kv.second;
@ -422,11 +718,18 @@ private:
            } else {
                optional_props.push_back(prop_name);
            }
            prop_names.push_back(prop_name);
        }
-        if (additional_properties.is_object() || (additional_properties.is_boolean() && additional_properties.get<bool>())) {
+        if ((additional_properties.is_boolean() && additional_properties.get<bool>()) || additional_properties.is_object()) {
            std::string sub_name = name + (name.empty() ? "" : "-") + "additional";
-            std::string value_rule = visit(additional_properties.is_object() ? additional_properties : json::object(), sub_name + "-value");
+            std::string value_rule =
-            std::string kv_rule = _add_rule(sub_name + "-kv", _add_primitive("string", PRIMITIVE_RULES.at("string")) + " \":\" space " + value_rule);
+                additional_properties.is_object() ? visit(additional_properties, sub_name + "-value")
                : _add_primitive("value", PRIMITIVE_RULES.at("value"));
            auto key_rule =
                prop_names.empty() ? _add_primitive("string", PRIMITIVE_RULES.at("string"))
                : _add_rule(sub_name + "-k", _not_strings(prop_names));
            std::string kv_rule = _add_rule(sub_name + "-kv", key_rule + " \":\" space " + value_rule);
            prop_kv_rule_names["*"] = kv_rule;
            optional_props.push_back("*");
        }
@ -452,15 +755,11 @@ private:
                }
                std::string k = ks[0];
                std::string kv_rule_name = prop_kv_rule_names[k];
-                if (k == "*") {
+                std::string comma_ref = "( \",\" space " + kv_rule_name + " )";
-                    res = _add_rule(
+                if (first_is_optional) {
-                        name + (name.empty() ? "" : "-") + "additional-kvs",
+                    res = comma_ref + (k == "*" ? "*" : "?");
                        kv_rule_name + " ( \",\" space " + kv_rule_name + " )*"
                    );
                } else if (first_is_optional) {
                    res = "( \",\" space " + kv_rule_name + " )?";
                } else {
-                    res = kv_rule_name;
+                    res = kv_rule_name + (k == "*" ? " " + comma_ref + "*" : "");
                }
                if (ks.size() > 1) {
                    res += " " + _add_rule(
@ -594,17 +893,19 @@ public:
        } else if (schema_type.is_array()) {
            std::vector<json> schema_types;
            for (const auto & t : schema_type) {
-                schema_types.push_back({{"type", t}});
+                json schema_copy(schema);
                schema_copy["type"] = t;
                schema_types.push_back(schema_copy);
            }
            return _add_rule(rule_name, _generate_union_rule(name, schema_types));
        } else if (schema.contains("const")) {
-            return _add_rule(rule_name, _generate_constant_rule(schema["const"]));
+            return _add_rule(rule_name, _generate_constant_rule(schema["const"]) + " space");
        } else if (schema.contains("enum")) {
            std::vector<std::string> enum_values;
            for (const auto & v : schema["enum"]) {
                enum_values.push_back(_generate_constant_rule(v));
            }
-            return _add_rule(rule_name, join(enum_values.begin(), enum_values.end(), " | "));
+            return _add_rule(rule_name, "(" + join(enum_values.begin(), enum_values.end(), " | ") + ") space");
        } else if ((schema_type.is_null() || schema_type == "object")
                && (schema.contains("properties") ||
                    (schema.contains("additionalProperties") && schema["additionalProperties"] != true))) {
@ -686,6 +987,24 @@ public:
            int min_len = schema.contains("minLength") ? schema["minLength"].get<int>() : 0;
            int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
            return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
        } else if (schema_type == "integer" && (schema.contains("minimum") || schema.contains("exclusiveMinimum") || schema.contains("maximum") || schema.contains("exclusiveMaximum"))) {
            int min_value = std::numeric_limits<int>::min();
            int max_value = std::numeric_limits<int>::max();
            if (schema.contains("minimum")) {
                min_value = schema["minimum"].get<int>();
            } else if (schema.contains("exclusiveMinimum")) {
                min_value = schema["exclusiveMinimum"].get<int>() + 1;
            }
            if (schema.contains("maximum")) {
                max_value = schema["maximum"].get<int>();
            } else if (schema.contains("exclusiveMaximum")) {
                max_value = schema["exclusiveMaximum"].get<int>() - 1;
            }
            std::stringstream out;
            out << "(";
            _build_min_max_int(min_value, max_value, out);
            out << ") space";
            return _add_rule(rule_name, out.str());
        } else if (schema.empty() || schema_type == "object") {
            return _add_rule(rule_name, _add_primitive("object", PRIMITIVE_RULES.at("object")));
        } else {
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@ -28,9 +28,13 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
        std::vector<const llama_grammar_element *> grammar_rules(result->parsed_grammar.c_rules());
-        result->grammar = llama_grammar_init(
+        struct llama_grammar * grammar = llama_grammar_init(
                grammar_rules.data(),
                grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root"));
        if (grammar == nullptr) {
            throw std::runtime_error("Failed to initialize llama_grammar");
        }
        result->grammar = grammar;
    }
    result->prev.resize(params.n_prev);
@ -59,9 +63,13 @@ void llama_sampling_reset(llama_sampling_context * ctx) {
    if (!ctx->parsed_grammar.rules.empty()) {
        std::vector<const llama_grammar_element *> grammar_rules(ctx->parsed_grammar.c_rules());
-        ctx->grammar = llama_grammar_init(
+        struct llama_grammar * grammar = llama_grammar_init(
                grammar_rules.data(),
                grammar_rules.size(), ctx->parsed_grammar.symbol_ids.at("root"));
        if (grammar == nullptr) {
            throw std::runtime_error("Failed to initialize llama_grammar");
        }
        ctx->grammar = grammar;
    }
    std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@ -85,6 +85,10 @@ models = [
    {"name": "smaug-bpe",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
    {"name": "poro-chat",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
    {"name": "jina-v2-code",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
    {"name": "viking",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B
    {"name": "gemma",          "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2b", },
    {"name": "gemma-2",        "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", },
    {"name": "jais",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", },
 ]
@ -214,7 +218,7 @@ src_func = f"""
 """
 convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
-convert_py = convert_py_pth.read_text()
+convert_py = convert_py_pth.read_text(encoding="utf-8")
 convert_py = re.sub(
    r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
    lambda m: m.group(1) + src_func + m.group(3),
@ -222,7 +226,7 @@ convert_py = re.sub(
    flags=re.DOTALL | re.MULTILINE,
 )
-convert_py_pth.write_text(convert_py)
+convert_py_pth.write_text(convert_py, encoding="utf-8")
 logger.info("+++ convert-hf-to-gguf.py was updated")
@ -271,7 +275,8 @@ tests = [
    "3333333",
    "33333333",
    "333333333",
-    # "Cửa Việt", # llama-bpe fails on this
+    "Cửa Việt", # llama-bpe fails on this
    " discards",
    chktxt,
 ]
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@ -65,7 +65,8 @@ class Model:
    # subclasses should define this!
    model_arch: gguf.MODEL_ARCH
-    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool, model_name: str | None):
+    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool,
                 model_name: str | None, split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
        if type(self) is Model:
            raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
        self.dir_model = dir_model
@ -80,7 +81,7 @@ class Model:
        if not self.is_safetensors:
            self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
        self.hparams = Model.load_hparams(self.dir_model)
-        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
+        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
        self.tensor_names = None
        if self.ftype == gguf.LlamaFileType.GUESSED:
@ -96,7 +97,8 @@ class Model:
        ftype_lw: str = ftype_up.lower()
        # allow templating the file name with the output ftype, useful with the "auto" ftype
        self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up)
-        self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
+        self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file,
                                           split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard)
    @classmethod
    def __init_subclass__(cls):
@ -332,6 +334,8 @@ class Model:
        self.gguf_writer.close()
    def write_vocab(self):
        if len(self.gguf_writer.tensors) != 1:
            raise ValueError('Splitting the vocabulary is not supported')
        self.gguf_writer.write_header_to_file(self.fname_out)
        self.gguf_writer.write_kv_data_to_file()
        self.gguf_writer.close()
@ -483,6 +487,12 @@ class Model:
        if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
            res = "jina-v2-code"
        if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
            # ref: https://huggingface.co/LumiOpen/Viking-7B
            res = "viking"
        if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901":
            # ref: https://huggingface.co/core42/jais-13b
            res = "jais"
        if res is None:
            logger.warning("\n")
@ -569,7 +579,19 @@ class Model:
        special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
        special_vocab.add_to_gguf(self.gguf_writer)
-    def _set_vocab_sentencepiece(self):
+    def _set_vocab_sentencepiece(self, add_to_gguf=True):
        tokens, scores, toktypes = self._create_vocab_sentencepiece()
        self.gguf_writer.add_tokenizer_model("llama")
        self.gguf_writer.add_tokenizer_pre("default")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_scores(scores)
        self.gguf_writer.add_token_types(toktypes)
        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
        special_vocab.add_to_gguf(self.gguf_writer)
    def _create_vocab_sentencepiece(self):
        from sentencepiece import SentencePieceProcessor
        tokenizer_path = self.dir_model / 'tokenizer.model'
@ -631,14 +653,7 @@ class Model:
                scores.append(-1000.0)
                toktypes.append(SentencePieceTokenTypes.UNUSED)
-        self.gguf_writer.add_tokenizer_model("llama")
+        return tokens, scores, toktypes
        self.gguf_writer.add_tokenizer_pre("default")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_scores(scores)
        self.gguf_writer.add_token_types(toktypes)
        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
        special_vocab.add_to_gguf(self.gguf_writer)
    def _set_vocab_llama_hf(self):
        vocab = gguf.LlamaHfVocab(self.dir_model)
@ -967,7 +982,11 @@ class XverseModel(Model):
        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(dir_model)
        vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
-        assert max(tokenizer.vocab.values()) < vocab_size
+        # Since we are checking the maximum index, we need to ensure it's strictly less than vocab_size,
        # because vocab_size is the count of items, and indexes start at 0.
        max_vocab_index = max(tokenizer.get_vocab().values())
        if max_vocab_index >= vocab_size:
            raise ValueError("Vocabulary size exceeds expected maximum size.")
        reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
        added_vocab = tokenizer.get_added_vocab()
@ -1400,6 +1419,48 @@ class LlamaModel(Model):
                raise ValueError(f"Unprocessed experts: {experts}")
@Model.register("BitnetForCausalLM")
 class BitnetModel(Model):
    model_arch = gguf.MODEL_ARCH.BITNET
    def set_vocab(self):
        self._set_vocab_sentencepiece()
    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
        self.gguf_writer.add_rope_scaling_factor(1.0)
    def weight_quant(self, weight):
        dtype = weight.dtype
        weight = weight.float()
        s = 1 / weight.abs().mean().clamp(min=1e-5)
        weight = (weight * s).round().clamp(-1, 1) / s
        scale = weight.abs().max().unsqueeze(0)
        weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype)
        weight = torch.sign(weight).type(dtype)
        return weight.type(dtype), scale.type(torch.float32)
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        new_name = self.map_tensor_name(name)
        if any(self.match_model_tensor_name(new_name, key, bid) for key in [
            gguf.MODEL_TENSOR.ATTN_Q,
            gguf.MODEL_TENSOR.ATTN_K,
            gguf.MODEL_TENSOR.ATTN_V,
            gguf.MODEL_TENSOR.ATTN_OUT,
            gguf.MODEL_TENSOR.FFN_UP,
            gguf.MODEL_TENSOR.FFN_DOWN,
            gguf.MODEL_TENSOR.FFN_GATE,
        ]):
            # transform weight into 1/0/-1 (in fp32)
            weight_torch, scale_torch = self.weight_quant(data_torch)
            yield (new_name, weight_torch)
            yield (new_name.removesuffix(".weight") + ".scale", scale_torch)
        else:
            yield (new_name, data_torch)
@Model.register("GrokForCausalLM")
 class GrokModel(Model):
    model_arch = gguf.MODEL_ARCH.GROK
@ -1881,7 +1942,7 @@ class Phi3MiniModel(Model):
        if len(rope_scaling_type) == 0:
            raise KeyError('Missing the required key rope_scaling.type')
-        if rope_scaling_type == 'su':
+        if rope_scaling_type == 'su' or rope_scaling_type == 'longrope':
            attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0
        elif rope_scaling_type == 'yarn':
            attn_factor = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0
@ -2255,6 +2316,8 @@ class GemmaModel(Model):
        special_vocab._set_special_token("eot",    107)
        special_vocab.add_to_gguf(self.gguf_writer)
        self.gguf_writer.add_add_space_prefix(False)
    def set_gguf_parameters(self):
        hparams = self.hparams
        block_count = hparams["num_hidden_layers"]
@ -2287,6 +2350,71 @@ class GemmaModel(Model):
        return [(self.map_tensor_name(name), data_torch)]
@Model.register("Gemma2ForCausalLM")
 class Gemma2Model(Model):
    model_arch = gguf.MODEL_ARCH.GEMMA2
    def set_vocab(self):
        tokens, scores, toktypes = self._create_vocab_sentencepiece()
        # hack: This is required so that we can properly use start/end-of-turn for chat template
        for i in range(108):
            # including <unusedX>, <start_of_turn>, <end_of_turn>
            toktypes[i] = SentencePieceTokenTypes.CONTROL
        self.gguf_writer.add_tokenizer_model("llama")
        self.gguf_writer.add_tokenizer_pre("default")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_scores(scores)
        self.gguf_writer.add_token_types(toktypes)
        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
        special_vocab.add_to_gguf(self.gguf_writer)
        self.gguf_writer.add_add_space_prefix(False)
    def set_gguf_parameters(self):
        hparams = self.hparams
        block_count = hparams["num_hidden_layers"]
        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"])
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
        self.gguf_writer.add_key_length(hparams["head_dim"])
        self.gguf_writer.add_value_length(hparams["head_dim"])
        self.gguf_writer.add_file_type(self.ftype)
        self.gguf_writer.add_attn_logit_softcapping(
            self.hparams["attn_logit_softcapping"]
        )
        self.gguf_writer.add_final_logit_softcapping(
            self.hparams["final_logit_softcapping"]
        )
        self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
        # sanity check
        attn_scalar = self.hparams["query_pre_attn_scalar"]
        if attn_scalar != hparams["hidden_size"] / hparams["num_attention_heads"]:
            raise ValueError("query_pre_attn_scalar must be equal to n_embd / n_head")
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        del bid  # unusem
        # lm_head is not used in llama.cpp, while autoawq will include this tensor in model
        # To prevent errors, skip loading lm_head.weight.
        if name == "lm_head.weight":
            logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
            return []
        # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
        if name.endswith("norm.weight"):
            data_torch = data_torch + 1
        return [(self.map_tensor_name(name), data_torch)]
@Model.register("Starcoder2ForCausalLM")
 class StarCoder2Model(Model):
    model_arch = gguf.MODEL_ARCH.STARCODER2
@ -2725,6 +2853,214 @@ class DeepseekV2Model(Model):
                raise ValueError(f"Unprocessed experts: {experts}")
@Model.register("T5ForConditionalGeneration")
@Model.register("T5WithLMHeadModel")
 class T5Model(Model):
    model_arch = gguf.MODEL_ARCH.T5
    def set_vocab(self):
        # to avoid TypeError: Descriptors cannot be created directly
        # exception when importing sentencepiece_model_pb2
        os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
        from sentencepiece import SentencePieceProcessor
        from sentencepiece import sentencepiece_model_pb2 as model
        tokenizer_path = self.dir_model / 'spiece.model'
        if not tokenizer_path.is_file():
            raise FileNotFoundError(f"File not found: {tokenizer_path}")
        sentencepiece_model = model.ModelProto()
        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
        assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
        tokenizer = SentencePieceProcessor()
        tokenizer.LoadFromFile(str(tokenizer_path))
        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
        scores: list[float] = [-10000.0] * vocab_size
        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
        for token_id in range(tokenizer.vocab_size()):
            piece = tokenizer.IdToPiece(token_id)
            text = piece.encode("utf-8")
            score = tokenizer.GetScore(token_id)
            toktype = SentencePieceTokenTypes.NORMAL
            if tokenizer.IsUnknown(token_id):
                toktype = SentencePieceTokenTypes.UNKNOWN
            elif tokenizer.IsControl(token_id):
                toktype = SentencePieceTokenTypes.CONTROL
            elif tokenizer.IsUnused(token_id):
                toktype = SentencePieceTokenTypes.UNUSED
            elif tokenizer.IsByte(token_id):
                toktype = SentencePieceTokenTypes.BYTE
            tokens[token_id] = text
            scores[token_id] = score
            toktypes[token_id] = toktype
        added_tokens_file = self.dir_model / 'added_tokens.json'
        if added_tokens_file.is_file():
            with open(added_tokens_file, "r", encoding="utf-8") as f:
                added_tokens_json = json.load(f)
                for key in added_tokens_json:
                    token_id = added_tokens_json[key]
                    if (token_id >= vocab_size):
                        logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
                        continue
                    tokens[token_id] = key.encode("utf-8")
                    scores[token_id] = -1000.0
                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
        if vocab_size > len(tokens):
            pad_count = vocab_size - len(tokens)
            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
            for i in range(1, pad_count + 1):
                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
                scores.append(-1000.0)
                toktypes.append(SentencePieceTokenTypes.UNUSED)
        self.gguf_writer.add_tokenizer_model("t5")
        self.gguf_writer.add_tokenizer_pre("default")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_scores(scores)
        self.gguf_writer.add_token_types(toktypes)
        self.gguf_writer.add_add_space_prefix(add_prefix)
        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
        if precompiled_charsmap:
            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
        special_vocab.add_to_gguf(self.gguf_writer)
        self.gguf_writer.add_add_bos_token(False)
        self.gguf_writer.add_add_eos_token(True)
    def set_gguf_parameters(self):
        self.gguf_writer.add_name("T5")
        self.gguf_writer.add_context_length(self.hparams["n_positions"])
        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
        self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
        self.gguf_writer.add_block_count(self.hparams["num_layers"])
        self.gguf_writer.add_head_count(self.hparams["num_heads"])
        self.gguf_writer.add_key_length(self.hparams["d_kv"])
        self.gguf_writer.add_value_length(self.hparams["d_kv"])
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"])
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_decoder_start_token_id(self.hparams["decoder_start_token_id"])
        self.gguf_writer.add_file_type(self.ftype)
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        del bid  # unused
        # Sometimes T5 and Flan-T5 based models contain "encoder.embed_tokens.weight" tensor or
        # "decoder.embed_tokens.weight" tensors that are duplicates of "shared.weight" tensor
        # To prevent errors caused by an unnecessary unmapped tensor, skip both of them and use only "shared.weight".
        if name == "decoder.embed_tokens.weight" or name == "encoder.embed_tokens.weight":
            logger.debug(f"Skipping tensor {name!r} in safetensors so that convert can end normally.")
            return []
        return [(self.map_tensor_name(name), data_torch)]
@Model.register("JAISLMHeadModel")
 class JaisModel(Model):
    model_arch = gguf.MODEL_ARCH.JAIS
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # SwigLU activation
        assert self.hparams["activation_function"] == "swiglu"
        # ALiBi position embedding
        assert self.hparams["position_embedding_type"] == "alibi"
        # Embeddings scale
        self.embeddings_scale = 1.0
        # note: For some JAIS flavors, output is tied to (same as) wte in original model
        self.output_is_wte = False
        if 'mup_embeddings_scale' in self.hparams:
            self.output_is_wte = True   # Hack (?)
            self.embeddings_scale = self.hparams['mup_embeddings_scale']
        elif 'embeddings_scale' in self.hparams:
            self.embeddings_scale = self.hparams['embeddings_scale']
        else:
            assert False
        self.width_scale = 1.0
        if 'mup_output_alpha' in self.hparams:
            assert 'mup_width_scale' in self.hparams
            self.width_scale = self.hparams['mup_output_alpha'] * self.hparams['mup_width_scale']
        elif 'width_scale' in self.hparams:
            self.width_scale = self.hparams['width_scale']
        else:
            assert False
        self.max_alibi_bias = 8.0
    def set_vocab(self):
        self._set_vocab_gpt2()
    def set_gguf_parameters(self):
        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_block_count(self.hparams["n_layer"])
        self.gguf_writer.add_context_length(self.hparams["n_positions"])
        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
        self.gguf_writer.add_feed_forward_length(self.hparams["n_inner"])
        self.gguf_writer.add_head_count(self.hparams["n_head"])
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        del bid  # unused
        tensors: list[tuple[str, Tensor]] = []
        # we don't need these
        if name.endswith((".attn.bias")):
            return tensors
        if name.endswith(("relative_pe.slopes")):
            # Calculate max ALiBi bias (this is the inverse of the ALiBi calculation)
            # Some other models has max_alibi_bias spelled out explicitly in the hyperparams,
            # but Jais's PyTorch model simply precalculates the slope values and places them
            # in relative_pes.slopes
            n_head_closest_log2 = 2 ** math.floor(math.log2(self.hparams["n_head"]))
            first_val = float(data_torch._data[0])
            self.max_alibi_bias = -round(math.log2(first_val) * n_head_closest_log2)
            return tensors
        if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")):
            data_torch = data_torch.transpose(1, 0)
        new_name = self.map_tensor_name(name)
        if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
            tensors.append((new_name, data_torch * self.embeddings_scale))
            if self.output_is_wte:
                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale))
        elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
            assert not self.output_is_wte
            tensors.append((new_name, data_torch * self.width_scale))
        else:
            tensors.append((new_name, data_torch))
        return tensors
    def write_tensors(self):
        super().write_tensors()
        self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)
 ###### CONVERSION LOGIC ######
@ -2810,10 +3146,44 @@ def parse_args() -> argparse.Namespace:
        "--verbose", action="store_true",
        help="increase output verbosity",
    )
    parser.add_argument(
        "--split-max-tensors", type=int, default=0,
        help="max tensors in each split",
    )
    parser.add_argument(
        "--split-max-size", type=str, default="0",
        help="max size per split N(M|G)",
    )
    parser.add_argument(
        "--dry-run", action="store_true",
        help="only print out a split plan and exit, without writing any new files",
    )
    parser.add_argument(
        "--no-tensor-first-split", action="store_true",
        help="do not add tensors to the first split (disabled by default)"
    )
    return parser.parse_args()
 def split_str_to_n_bytes(split_str: str) -> int:
    if split_str.endswith("K"):
        n = int(split_str[:-1]) * 1000
    elif split_str.endswith("M"):
        n = int(split_str[:-1]) * 1000 * 1000
    elif split_str.endswith("G"):
        n = int(split_str[:-1]) * 1000 * 1000 * 1000
    elif split_str.isnumeric():
        n = int(split_str)
    else:
        raise ValueError(f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G")
    if n < 0:
        raise ValueError(f"Invalid split size: {split_str}, must be positive")
    return n
 def main() -> None:
    args = parse_args()
@ -2846,6 +3216,11 @@ def main() -> None:
        "auto": gguf.LlamaFileType.GUESSED,
    }
    is_split = args.split_max_tensors > 0 or args.split_max_size != "0"
    if args.use_temp_file and is_split:
        logger.error("Error: Cannot use temp file when splitting")
        sys.exit(1)
    if args.outfile is not None:
        fname_out = args.outfile
    else:
@ -2863,7 +3238,10 @@ def main() -> None:
            logger.error(f"Model {hparams['architectures'][0]} is not supported")
            sys.exit(1)
-        model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy, args.model_name)
+        model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file,
                                     args.no_lazy, args.model_name, split_max_tensors=args.split_max_tensors,
                                     split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
                                     small_first_shard=args.no_tensor_first_split)
        logger.info("Set model parameters")
        model_instance.set_gguf_parameters()
@ -2874,13 +3252,14 @@ def main() -> None:
        model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
        if args.vocab_only:
-            logger.info(f"Exporting model vocab to '{model_instance.fname_out}'")
+            logger.info("Exporting model vocab...")
            model_instance.write_vocab()
            logger.info(f"Model vocab successfully exported to {model_instance.fname_out}")
        else:
-            logger.info(f"Exporting model to '{model_instance.fname_out}'")
+            logger.info("Exporting model...")
            model_instance.write()
-
+            out_path = f"{model_instance.fname_out.parent}{os.sep}" if is_split else model_instance.fname_out
-        logger.info(f"Model successfully exported to '{model_instance.fname_out}'")
+            logger.info(f"Model successfully exported to {out_path}")
 if __name__ == '__main__':
--- a/docs/BLIS.md
+++ b/docs/BLIS.md
@ -30,8 +30,8 @@ We recommend using openmp since it's easier to modify the cores being used.
 Makefile:
 ```bash
-make LLAMA_BLIS=1 -j
+make GGML_BLIS=1 -j
-# make LLAMA_BLIS=1 benchmark-matmult
+# make GGML_BLIS=1 llama-benchmark-matmult
 ```
 CMake:
@ -39,7 +39,7 @@ CMake:
 ```bash
 mkdir build
 cd build
-cmake -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=FLAME ..
+cmake -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=FLAME ..
 make -j
 ```
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -39,13 +39,13 @@ else()
    add_subdirectory(quantize-stats)
    add_subdirectory(quantize)
    add_subdirectory(retrieval)
-    if (LLAMA_RPC)
+    if (GGML_RPC)
        add_subdirectory(rpc)
    endif()
    if (LLAMA_BUILD_SERVER)
    add_subdirectory(server)
    endif()
-    if (LLAMA_SYCL)
+    if (GGML_SYCL)
        add_subdirectory(sycl)
    endif()
    add_subdirectory(save-load-state)
--- a/examples/cvector-generator/README.md
+++ b/examples/cvector-generator/README.md
@ -11,13 +11,16 @@ Related PRs:
 ```sh
 # CPU only
-./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf
+./cvector-generator -m ./llama-3.Q4_K_M.gguf
 # With GPU
-./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99
+./cvector-generator -m ./llama-3.Q4_K_M.gguf -ngl 99
 # With advanced options
-./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --batch-pca 100
+./cvector-generator -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100
 # Using mean value instead of PCA
 ./cvector-generator -m ./llama-3.Q4_K_M.gguf --method mean
 # To see help message
 ./cvector-generator -h
@ -32,3 +35,11 @@ If you have multiple lines per prompt, you can escape the newline character (cha
 <|im_start|>system\nAct like a person who is extremely happy.<|im_end|>
 <|im_start|>system\nYou are in a very good mood today<|im_end|>
 ```
 Example to use output file with `llama-cli`:
 (Tips: The control vector works better when apply to layers higher than 10)
 ```sh
 ./llama-cli -m ./llama-3.Q4_K_M.gguf -p "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nSing a song<|im_end|><|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" --special --control-vector-scaled ./control_vector.gguf 0.8 --control-vector-layer-range 10 31
 ```
--- a/examples/cvector-generator/cvector-generator.cpp
+++ b/examples/cvector-generator/cvector-generator.cpp
@ -2,6 +2,7 @@
 #include "llama.h"
 #include "ggml.h"
 #include "pca.hpp"
 #include "mean.hpp"
 #ifdef GGML_USE_CUDA
 #include "ggml-cuda.h"
@ -38,9 +39,10 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) {
    gpt_params_print_usage(argc, argv, params);
    printf("\nexample usage:\n");
-    printf("\n    CPU only:   %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf\n", argv[0]);
+    printf("\n    CPU only:   %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]);
-    printf("\n    with GPU:   %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99\n", argv[0]);
+    printf("\n    with GPU:   %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]);
-    printf("\n    advanced:   %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --batch-pca 100\n", argv[0]);
+    printf("\n    advanced:   %s -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100\n", argv[0]);
    printf("\n    using mean: %s -m ./llama-3.Q4_K_M.gguf --method mean\n", argv[0]);
    printf("\n");
 }
@ -223,17 +225,20 @@ struct train_context {
    // build the v_diff tensors from v_diff_tmp (v_diff need to be transposed)
    // TODO @ngxson : maybe add option NOT to transpose v_diff; will be useful for "mean" method
-    void build_v_diff() {
+    void build_v_diff(bool transpose) {
        printf("build_v_diff\n");
        for (int il = 0; il < n_layers - 1; il++) {
            auto & diff_tmp = v_diff_tmp[il];
            int n_elem = diff_tmp.size() / sizeof(float);
            GGML_ASSERT(n_elem % n_embd == 0);
            int n_rows = n_elem / n_embd;
-            struct ggml_tensor * diff = ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd);
+            struct ggml_tensor * diff = transpose
                ? ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd)
                : ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_embd, n_rows);
            ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str());
            // copy data & transpose
            diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible
            if (transpose) {
                // copy data & transpose
                float * arr = (float *) diff_tmp.data();
                for (int ir = 0; ir < n_rows; ++ir) {
                    for (int ic = 0; ic < n_embd; ++ic) {
@ -241,6 +246,10 @@ struct train_context {
                        ggml_set_f32_nd(diff, ir, ic, 0, 0, f);
                    }
                }
            } else {
                // only copy
                memcpy(diff->data, diff_tmp.data(), ggml_nbytes(diff));
            }
            v_diff.push_back(diff);
            print_debug_tensor(diff);
            // free memory of diff_tmp
@ -263,8 +272,8 @@ struct tokenized_prompt {
    tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
        const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
-        tokens_pos = ::llama_tokenize(ctx, pos, add_bos);
+        tokens_pos = ::llama_tokenize(ctx, pos, add_bos, true);
-        tokens_neg = ::llama_tokenize(ctx, neg, add_bos);
+        tokens_neg = ::llama_tokenize(ctx, neg, add_bos, true);
        max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
        padding_seq(ctx, tokens_pos, max_seq_len);
        padding_seq(ctx, tokens_neg, max_seq_len);
@ -373,20 +382,8 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) {
        fprintf(stderr, "must provide at least one prompt pair\n");
        return 1;
    }
-
+    ctx_train.positive_entries = positive_prompts;
-    // create templated prompts
+    ctx_train.negative_entries = negative_prompts;
    std::vector<std::string> completions = ctrlvec_load_prompt_file(params.cvector_completions_file, false);
    auto format_template = [](std::string persona, std::string suffix) {
        // entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. [/INST]"
        return persona + " " + suffix;
    };
    for (size_t i = 0; i < positive_prompts.size(); ++i) {
        for (int j = 0; j < std::min((int) completions.size(), params.n_completions); ++j) {
            // TODO replicate the truncations done by the python implementation
            ctx_train.positive_entries.push_back(format_template(positive_prompts[i], completions[j]));
            ctx_train.negative_entries.push_back(format_template(negative_prompts[i], completions[j]));
        }
    }
    return 0;
 }
@ -480,15 +477,22 @@ int main(int argc, char ** argv) {
    llama_free(ctx);
    llama_free_model(model);
-    // prepare ctx_train for PCA
+    bool use_pca = params.cvector_dimre_method == DIMRE_METHOD_PCA;
    ctx_train.build_v_diff();
    // prepare ctx_train for PCA
    ctx_train.build_v_diff(use_pca);
    if (use_pca) {
        // run PCA
        PCA::pca_params pca_params;
        pca_params.n_threads = params.n_threads;
        pca_params.n_batch = params.n_pca_batch;
        pca_params.n_iterations = params.n_pca_iterations;
        PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
    } else {
        // run mean
        mean::run(ctx_train.v_diff, ctx_train.v_final);
    }
    // write output vectors to gguf
    export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint);
--- a/examples/cvector-generator/mean.hpp
+++ b/examples/cvector-generator/mean.hpp
@ -0,0 +1,48 @@
 #include "common.h"
 #include "llama.h"
 #include "ggml.h"
 #include <string>
 #include <vector>
 #include <math.h>
 namespace mean {
 static void run(
        const std::vector<struct ggml_tensor *> & v_input, // shape of v_input[0]: [n_embd, n_samples]
        const std::vector<struct ggml_tensor *> & v_output) {
    printf("%s: Running mean...\n", __func__);
    for (size_t il = 0; il < v_input.size(); ++il) {
        // prepare output vector
        struct ggml_tensor * ctrl_out = v_output[il];
        ggml_format_name(ctrl_out, "direction.%ld", il+1);
        // calculate mean vector
        struct ggml_tensor * t_layer = v_input[il];
        GGML_ASSERT(t_layer->ne[0] == ctrl_out->ne[0]); // == n_embd
        for (int ic = 0; ic < t_layer->ne[0]; ic++) {
            float f = 0.0;
            for (int ir = 0; ir < t_layer->ne[1]; ir++) {
                f += ggml_get_f32_nd(t_layer, ic, ir, 0, 0);
            }
            f /= t_layer->ne[1];
            ggml_set_f32_1d(ctrl_out, ic, f);
        }
        // normalize output vector
        float norm = 0.0;
        for (int i = 0; i < ggml_nelements(ctrl_out); i++) {
            float f = ggml_get_f32_1d(ctrl_out, i);
            norm += f*f;
        }
        norm = sqrt(norm);
        for (int i = 0; i < ggml_nelements(ctrl_out); i++) {
            float f = ggml_get_f32_1d(ctrl_out, i);
            ggml_set_f32_1d(ctrl_out, i, f / norm);
        }
        printf("%s: Done layer %d / %d\n", __func__, (int) il+1, (int) v_input.size());
    }
 }
 }
--- a/examples/cvector-generator/negative.txt
+++ b/examples/cvector-generator/negative.txt
@ -1 +1,4 @@
-[INST] Act like a person who is extremely sad. [/INST]
+<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely sad<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI feel like there's a heavy weight on my chest
 <|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely sad<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nMy heart feels like it's drowning in sorrow
 <|start_header_id|>system<|end_header_id|>\n\nYou are in a very bad mood<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nGo away! There's a deep, aching emptiness inside me
 <|start_header_id|>system<|end_header_id|>\n\nYou are the sadest person<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are you feeling?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nMy heart feels like it's drowning in sorrow
--- a/examples/cvector-generator/pca.hpp
+++ b/examples/cvector-generator/pca.hpp
@ -290,7 +290,7 @@ static void power_iteration(
        }
        printf("%s: layer %d/%d, iteration: %d / total: %d (batch = %d) ...\n",
-            __func__, params.i_layer+1, params.n_layers, iter, n_iters, params.n_batch);
+            __func__, params.i_layer+1, params.n_layers, iter+1, n_iters, params.n_batch);
    }
    // get output tensor
@ -298,6 +298,9 @@ static void power_iteration(
    ggml_backend_tensor_get(last_eigenvector, output->data, 0, ggml_nbytes(last_eigenvector));
    //print_debug_tensor(output);
    ggml_gallocr_free(allocr);
    // TODO @ngxson : The output vector is randomly inverted
    // Solution: https://github.com/ggerganov/llama.cpp/pull/8069#issuecomment-2185328171
 }
 static void run_pca(
--- a/examples/cvector-generator/positive.txt
+++ b/examples/cvector-generator/positive.txt
@ -1 +1,4 @@
-[INST] Act like a person who is extremely happy. [/INST]
+<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely happy<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI'm the happiest person in this world
 <|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely happy<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHello, I'm having the best day ever!
 <|start_header_id|>system<|end_header_id|>\n\nYou are in a very good mood<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi, I'm very excited to meet you
 <|start_header_id|>system<|end_header_id|>\n\nYou are the happiest person<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are you feeling?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nEverything is just perfect right now!
--- a/examples/embedding/README.md
+++ b/examples/embedding/README.md
@ -19,3 +19,42 @@ llama-embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
 ```
 The above command will output space-separated float values.
 ## extra parameters
 ### --embd-normalize $integer$
 | $integer$ | description         | formula |
 |-----------|---------------------|---------|
 | $-1$      | none                |
 | $0$       | max absolute int16  | $\Large{{32760 * x_i} \over\max \lvert x_i\rvert}$
 | $1$       | taxicab             | $\Large{x_i \over\sum \lvert x_i\rvert}$
 | $2$       | euclidean (default) | $\Large{x_i \over\sqrt{\sum x_i^2}}$
 | $>2$      | p-norm              | $\Large{x_i \over\sqrt[p]{\sum \lvert x_i\rvert^p}}$
 ### --embd-output-format $'string'$
 | $'string'$ | description                  |  |
 |------------|------------------------------|--|
 | ''         | same as before               | (default)
 | 'array'    | single embeddings            | $[[x_1,...,x_n]]$
 |            | multiple embeddings          | $[[x_1,...,x_n],[x_1,...,x_n],...,[x_1,...,x_n]]$
 | 'json'     | openai style                 |
 | 'json+'    | add cosine similarity matrix |
 ### --embd-separator $"string"$
 | $"string"$   | |
 |--------------|-|
 | "\n"         | (default)
 | "<#embSep#>" | for exemple
 | "<#sep#>"    | other exemple
 ## examples
 ### Unix-based systems (Linux, macOS, etc.):
 ```bash
 ./embedding -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2  --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
 ```
 ### Windows:
 ```powershell
 embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2  --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
 ```
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@ -7,23 +7,30 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
-static std::vector<std::string> split_lines(const std::string & s) {
+static std::vector<std::string> split_lines(const std::string & s, const std::string & separator = "\n") {
    std::string line;
    std::vector<std::string> lines;
-    std::stringstream ss(s);
+    size_t start = 0;
-    while (std::getline(ss, line)) {
+    size_t end = s.find(separator);
-        lines.push_back(line);
+
    while (end != std::string::npos) {
        lines.push_back(s.substr(start, end - start));
        start = end + separator.length();
        end = s.find(separator, start);
    }
    lines.push_back(s.substr(start)); // Add the last part
    return lines;
 }
-static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
+static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
-    for (size_t i = 0; i < tokens.size(); i++) {
+    size_t n_tokens = tokens.size();
-        llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
+    for (size_t i = 0; i < n_tokens; i++) {
        llama_batch_add(batch, tokens[i], i, { seq_id }, true);
    }
 }
-static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
+static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
    // clear previous kv_cache values (irrelevant for embeddings)
    llama_kv_cache_clear(ctx);
@ -40,22 +47,10 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
        // try to get sequence embeddings - supported only when pooling_type is not NONE
        const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
-        if (embd == NULL) {
+        GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");
            embd = llama_get_embeddings_ith(ctx, i);
            if (embd == NULL) {
                fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
                continue;
            }
        }
        float * out = output + batch.seq_id[i][0] * n_embd;
-        //TODO: I would also add a parameter here to enable normalization or not.
+        llama_embd_normalize(embd, out, n_embd, embd_norm);
        /*fprintf(stdout, "unnormalized_embedding:");
        for (int hh = 0; hh < n_embd; hh++) {
            fprintf(stdout, "%9.6f ", embd[hh]);
        }
        fprintf(stdout, "\n");*/
        llama_embd_normalize(embd, out, n_embd);
    }
 }
@ -97,6 +92,12 @@ int main(int argc, char ** argv) {
    const int n_ctx_train = llama_n_ctx_train(model);
    const int n_ctx = llama_n_ctx(ctx);
    const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
    if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
        fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
        return 1;
    }
    if (n_ctx > n_ctx_train) {
        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
                __func__, n_ctx_train, n_ctx);
@ -109,7 +110,7 @@ int main(int argc, char ** argv) {
    }
    // split the prompt into lines
-    std::vector<std::string> prompts = split_lines(params.prompt);
+    std::vector<std::string> prompts = split_lines(params.prompt, params.embd_sep);
    // max batch size
    const uint64_t n_batch = params.n_batch;
@ -169,7 +170,7 @@ int main(int argc, char ** argv) {
        // encode if at capacity
        if (batch.n_tokens + n_toks > n_batch) {
            float * out = emb + p * n_embd;
-            batch_decode(ctx, batch, out, s, n_embd);
+            batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
            llama_batch_clear(batch);
            p += s;
            s = 0;
@ -182,15 +183,20 @@ int main(int argc, char ** argv) {
    // final batch
    float * out = emb + p * n_embd;
-    batch_decode(ctx, batch, out, s, n_embd);
+    batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
    if (params.embd_out.empty()) {
        // print the first part of the embeddings or for a single prompt, the full embedding
        fprintf(stdout, "\n");
        for (int j = 0; j < n_prompts; j++) {
            fprintf(stdout, "embedding %d: ", j);
            for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
                if (params.embd_normalize == 0) {
                    fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
                } else {
                    fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
                }
            }
            fprintf(stdout, "\n");
        }
@ -198,14 +204,58 @@ int main(int argc, char ** argv) {
        if (n_prompts > 1) {
            fprintf(stdout, "\n");
            printf("cosine similarity matrix:\n\n");
            for (int i = 0; i < n_prompts; i++) {
                fprintf(stdout, "%6.6s ", prompts[i].c_str());
            }
            fprintf(stdout, "\n");
            for (int i = 0; i < n_prompts; i++) {
                for (int j = 0; j < n_prompts; j++) {
                    float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
                    fprintf(stdout, "%6.2f ", sim);
                }
                fprintf(stdout, "%1.10s", prompts[i].c_str());
                fprintf(stdout, "\n");
            }
        }
    }
    if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") {
        const bool notArray = params.embd_out != "array";
        fprintf(stdout, notArray ? "{\n  \"object\": \"list\",\n  \"data\": [\n" : "[");
        for (int j = 0;;) { // at least one iteration (one prompt)
            if (notArray) fprintf(stdout, "    {\n      \"object\": \"embedding\",\n      \"index\": %d,\n      \"embedding\": ",j);
            fprintf(stdout, "[");
            for (int i = 0;;) { // at least one iteration (n_embd > 0)
                fprintf(stdout, params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
                i++;
                if (i < n_embd) fprintf(stdout, ","); else break;
            }
            fprintf(stdout, notArray ? "]\n    }" : "]");
            j++;
            if (j < n_prompts) fprintf(stdout, notArray ? ",\n" : ","); else break;
        }
        fprintf(stdout, notArray ? "\n  ]" : "]\n");
        if (params.embd_out == "json+" && n_prompts > 1) {
            fprintf(stdout, ",\n  \"cosineSimilarity\": [\n");
            for (int i = 0;;) { // at least two iteration (n_prompts > 1)
                fprintf(stdout, "    [");
                for (int j = 0;;) { // at least two iteration (n_prompts > 1)
                    float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
                    fprintf(stdout, "%6.2f", sim);
                    j++;
                    if (j < n_prompts) fprintf(stdout, ", "); else break;
                }
                fprintf(stdout, " ]");
                i++;
                if (i < n_prompts) fprintf(stdout, ",\n"); else break;
            }
            fprintf(stdout, "\n  ]");
        }
        if (notArray) fprintf(stdout, "\n}\n");
    }
    // clean up
    llama_print_timings(ctx);
--- a/examples/gbnf-validator/gbnf-validator.cpp
+++ b/examples/gbnf-validator/gbnf-validator.cpp
@ -101,7 +101,9 @@ int main(int argc, char** argv) {
    auto grammar = llama_grammar_init(
            grammar_rules.data(),
            grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
-
+    if (grammar == nullptr) {
        throw std::runtime_error("Failed to initialize llama_grammar");
    }
    // Read the input file
    std::string input_str;
    {
--- a/examples/gritlm/gritlm.cpp
+++ b/examples/gritlm/gritlm.cpp
@ -44,6 +44,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
        // clear previous kv_cache values (irrelevant for embeddings)
        llama_kv_cache_clear(ctx);
        llama_set_embeddings(ctx, true);
        llama_set_causal_attn(ctx, false);
        // run model
@ -98,7 +99,9 @@ static std::string generate(llama_context * ctx, const std::string & prompt, boo
    llama_token eos_token = llama_token_eos(mdl);
    llama_kv_cache_clear(ctx);
    llama_set_embeddings(ctx, false);
    llama_set_causal_attn(ctx, true);
    llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
    std::vector<llama_token> inputs = llama_tokenize(mdl, prompt, false, true);
@ -166,8 +169,7 @@ int main(int argc, char * argv[]) {
    llama_model * mdl = llama_load_model_from_file(params.model.c_str(), mparams);
-    // create new context - set to embedding mode
+    // create generation context
    cparams.embeddings = true;
    llama_context * ctx = llama_new_context_with_model(mdl, cparams);
    // ### Embedding/Representation ###
--- a/examples/imatrix/README.md
+++ b/examples/imatrix/README.md
@ -25,7 +25,7 @@ For faster computation, make sure to use GPU offloading via the `-ngl` argument
 ## Example
 ```bash
-LLAMA_CUDA=1 make -j
+GGML_CUDA=1 make -j
 # generate importance matrix (imatrix.dat)
 ./llama-imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
--- a/examples/infill/README.md
+++ b/examples/infill/README.md
@ -15,6 +15,7 @@ In this section, we cover the most commonly used options for running the `infill
 -   `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
 -   `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
 -   `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
 -   `--spm-infill`: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.
 ## Input Prompts
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@ -210,6 +210,7 @@ int main(int argc, char ** argv) {
        suff_rm_leading_spc = false;
    }
    std::vector<llama_token> embd_inp;
    std::vector<llama_token> embd_end;
    std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
    std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
    const int space_token = 29871;
@ -217,12 +218,13 @@ int main(int argc, char ** argv) {
        inp_sfx.erase(inp_sfx.begin());
    }
    inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
    if (add_bos) {
        inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
    }
    inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
-    embd_inp = inp_pfx;
+    embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
-    embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
+    embd_end = params.spm_infill ? inp_pfx : inp_sfx;
    if (add_bos) {
        embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
    }
    embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
    const llama_token middle_token = llama_token_middle(model);
    if (middle_token >= 0) {
@ -526,14 +528,14 @@ int main(int argc, char ** argv) {
                    inp_sfx.erase(inp_sfx.begin());
                }
                inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
                if (add_bos) {
                    inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
                }
                inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
-                embd_inp = inp_pfx;
+                embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
-                embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
+                embd_end = params.spm_infill ? inp_pfx : inp_sfx;
                if (add_bos) {
                    embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
                }
                embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
                const llama_token middle_token = llama_token_middle(model);
                if (middle_token >= 0) {
                    embd_inp.push_back(middle_token);
                }
@ -657,4 +659,3 @@ int main(int argc, char ** argv) {
    return 0;
 }
--- a/examples/json-schema-pydantic-example.py
+++ b/examples/json-schema-pydantic-example.py
@ -3,7 +3,7 @@
 #! pip install pydantic
 #! python json-schema-pydantic-example.py
-from pydantic import BaseModel, TypeAdapter
+from pydantic import BaseModel, Extra, TypeAdapter
 from annotated_types import MinLen
 from typing import Annotated, List, Optional
 import json, requests
@ -50,11 +50,16 @@ else:
 if __name__ == '__main__':
    class QAPair(BaseModel):
        class Config:
            extra = 'forbid'  # triggers additionalProperties: false in the JSON schema
        question: str
        concise_answer: str
        justification: str
        stars: Annotated[int, Field(ge=1, le=5)]
    class PyramidalSummary(BaseModel):
        class Config:
            extra = 'forbid'  # triggers additionalProperties: false in the JSON schema
        title: str
        summary: str
        question_answers: Annotated[List[QAPair], MinLen(2)]
--- a/examples/json_schema_to_grammar.py
+++ b/examples/json_schema_to_grammar.py
@ -4,8 +4,7 @@ import itertools
 import json
 import re
 import sys
-from typing import Any, Dict, List, Set, Tuple, Union
+from typing import Any, List, Optional, Set, Tuple, Union
 def _build_repetition(item_rule, min_items, max_items, separator_rule=None):
@ -23,6 +22,170 @@ def _build_repetition(item_rule, min_items, max_items, separator_rule=None):
    result = item_rule + ' ' + _build_repetition(f'({separator_rule} {item_rule})', min_items - 1 if min_items > 0 else 0, max_items - 1 if max_items is not None else None)
    return f'({result})?' if min_items == 0 else result
 def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], out: list, decimals_left: int = 16, top_level: bool = True):
    has_min = min_value != None
    has_max = max_value != None
    def digit_range(from_char: str, to_char: str):
        out.append("[")
        if from_char == to_char:
            out.append(from_char)
        else:
            out.append(from_char)
            out.append("-")
            out.append(to_char)
        out.append("]")
    def more_digits(min_digits: int, max_digits: int):
        out.append("[0-9]")
        if min_digits == max_digits and min_digits == 1:
            return
        out.append("{")
        out.append(str(min_digits))
        if max_digits != min_digits:
            out.append(",")
            if max_digits != sys.maxsize:
                out.append(str(max_digits))
        out.append("}")
    def uniform_range(from_str: str, to_str: str):
        i = 0
        while i < len(from_str) and from_str[i] == to_str[i]:
            i += 1
        if i > 0:
            out.append("\"")
            out.append(from_str[:i])
            out.append("\"")
        if i < len(from_str):
            if i > 0:
                out.append(" ")
            sub_len = len(from_str) - i - 1
            if sub_len > 0:
                from_sub = from_str[i+1:]
                to_sub = to_str[i+1:]
                sub_zeros = "0" * sub_len
                sub_nines = "9" * sub_len
                to_reached = False
                out.append("(")
                if from_sub == sub_zeros:
                    digit_range(from_str[i], chr(ord(to_str[i]) - 1))
                    out.append(" ")
                    more_digits(sub_len, sub_len)
                else:
                    out.append("[")
                    out.append(from_str[i])
                    out.append("] ")
                    out.append("(")
                    uniform_range(from_sub, sub_nines)
                    out.append(")")
                    if ord(from_str[i]) < ord(to_str[i]) - 1:
                        out.append(" | ")
                        if to_sub == sub_nines:
                            digit_range(chr(ord(from_str[i]) + 1), to_str[i])
                            to_reached = True
                        else:
                            digit_range(chr(ord(from_str[i]) + 1), chr(ord(to_str[i]) - 1))
                        out.append(" ")
                        more_digits(sub_len, sub_len)
                if not to_reached:
                    out.append(" | ")
                    digit_range(to_str[i], to_str[i])
                    out.append(" ")
                    uniform_range(sub_zeros, to_sub)
                out.append(")")
            else:
                out.append("[")
                out.append(from_str[i])
                out.append("-")
                out.append(to_str[i])
                out.append("]")
    if has_min and has_max:
        if min_value < 0 and max_value < 0:
            out.append("\"-\" (")
            _generate_min_max_int(-max_value, -min_value, out, decimals_left, top_level=True)
            out.append(")")
            return
        if min_value < 0:
            out.append("\"-\" (")
            _generate_min_max_int(0, -min_value, out, decimals_left, top_level=True)
            out.append(") | ")
            min_value = 0
        min_s = str(min_value)
        max_s = str(max_value)
        min_digits = len(min_s)
        max_digits = len(max_s)
        for digits in range(min_digits, max_digits):
            uniform_range(min_s, "9" * digits)
            min_s = "1" + "0" * digits
            out.append(" | ")
        uniform_range(min_s, max_s)
        return
    less_decimals = max(decimals_left - 1, 1)
    if has_min:
        if min_value < 0:
            out.append("\"-\" (")
            _generate_min_max_int(None, -min_value, out, decimals_left, top_level=False)
            out.append(") | [0] | [1-9] ")
            more_digits(0, decimals_left - 1)
        elif min_value == 0:
            if top_level:
                out.append("[0] | [1-9] ")
                more_digits(0, less_decimals)
            else:
                more_digits(1, decimals_left)
        elif min_value <= 9:
            c = str(min_value)
            range_start = '1' if top_level else '0'
            if c > range_start:
                digit_range(range_start, chr(ord(c) - 1))
                out.append(" ")
                more_digits(1, less_decimals)
                out.append(" | ")
            digit_range(c, "9")
            out.append(" ")
            more_digits(0, less_decimals)
        else:
            min_s = str(min_value)
            length = len(min_s)
            c = min_s[0]
            if c > "1":
                digit_range("1" if top_level else "0", chr(ord(c) - 1))
                out.append(" ")
                more_digits(length, less_decimals)
                out.append(" | ")
            digit_range(c, c)
            out.append(" (")
            _generate_min_max_int(int(min_s[1:]), None, out, less_decimals, top_level=False)
            out.append(")")
            if c < "9":
                out.append(" | ")
                digit_range(chr(ord(c) + 1), "9")
                out.append(" ")
                more_digits(length - 1, less_decimals)
        return
    if has_max:
        if max_value >= 0:
            if top_level:
                out.append("\"-\" [1-9] ")
                more_digits(0, less_decimals)
                out.append(" | ")
            _generate_min_max_int(0, max_value, out, decimals_left, top_level=True)
        else:
            out.append("\"-\" (")
            _generate_min_max_int(-max_value, None, out, decimals_left, top_level=False)
            out.append(")")
        return
    raise RuntimeError("At least one of min_value or max_value must be set")
 class BuiltinRule:
    def __init__(self, content: str, deps: list = None):
@ -68,7 +231,7 @@ GRAMMAR_RANGE_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"\]\-\\]')
 GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]'}
 NON_LITERAL_SET = set('|.()[]{}*+?')
-ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('[]()|{}*+?')
+ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('^$.[]()|{}*+?')
 class SchemaConverter:
@ -112,6 +275,51 @@ class SchemaConverter:
        return ''.join(('(', *recurse(0), ')'))
    def _not_strings(self, strings):
        class TrieNode:
            def __init__(self):
                self.children = {}
                self.is_end_of_string = False
            def insert(self, string):
                node = self
                for c in string:
                    node = node.children.setdefault(c, TrieNode())
                node.is_end_of_string = True
        trie = TrieNode()
        for s in strings:
            trie.insert(s)
        char_rule = self._add_primitive('char', PRIMITIVE_RULES['char'])
        out = ['["] ( ']
        def visit(node):
            rejects = []
            first = True
            for c in sorted(node.children.keys()):
                child = node.children[c]
                rejects.append(c)
                if first:
                    first = False
                else:
                    out.append(' | ')
                out.append(f'[{c}]')
                if child.children:
                    out.append(f' (')
                    visit(child)
                    out.append(')')
                elif child.is_end_of_string:
                    out.append(f' {char_rule}+')
            if node.children:
                if not first:
                    out.append(' | ')
                out.append(f'[^"{"".join(rejects)}] {char_rule}*')
        visit(trie)
        out.append(f' ){"" if trie.is_end_of_string else "?"} ["] space')
        return ''.join(out)
    def _add_rule(self, name, rule):
        esc_name = INVALID_RULE_CHARS_RE.sub('-', name)
        if esc_name not in self._rules or self._rules[esc_name] == rule:
@ -357,13 +565,13 @@ class SchemaConverter:
            return self._add_rule(rule_name, self._generate_union_rule(name, schema.get('oneOf') or schema['anyOf']))
        elif isinstance(schema_type, list):
-            return self._add_rule(rule_name, self._generate_union_rule(name, [{'type': t} for t in schema_type]))
+            return self._add_rule(rule_name, self._generate_union_rule(name, [{**schema, 'type': t} for t in schema_type]))
        elif 'const' in schema:
-            return self._add_rule(rule_name, self._generate_constant_rule(schema['const']))
+            return self._add_rule(rule_name, self._generate_constant_rule(schema['const']) + ' space')
        elif 'enum' in schema:
-            rule = ' | '.join((self._generate_constant_rule(v) for v in schema['enum']))
+            rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in schema['enum'])) + ') space'
            return self._add_rule(rule_name, rule)
        elif schema_type in (None, 'object') and \
@ -394,7 +602,7 @@ class SchemaConverter:
                else:
                    add_component(t, is_required=True)
-            return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=[]))
+            return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=None))
        elif schema_type in (None, 'array') and ('items' in schema or 'prefixItems' in schema):
            items = schema.get('items') or schema['prefixItems']
@ -432,6 +640,24 @@ class SchemaConverter:
            return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\"" space')
        elif schema_type in (None, 'integer') and \
                ('minimum' in schema or 'exclusiveMinimum' in schema or 'maximum' in schema or 'exclusiveMaximum' in schema):
            min_value = None
            max_value = None
            if 'minimum' in schema:
                min_value = schema['minimum']
            elif 'exclusiveMinimum' in schema:
                min_value = schema['exclusiveMinimum'] + 1
            if 'maximum' in schema:
                max_value = schema['maximum']
            elif 'exclusiveMaximum' in schema:
                max_value = schema['exclusiveMaximum'] - 1
            out = ["("]
            _generate_min_max_int(min_value, max_value, out)
            out.append(") space")
            return self._add_rule(rule_name, ''.join(out))
        elif (schema_type == 'object') or (len(schema) == 0):
            return self._add_rule(rule_name, self._add_primitive('object', PRIMITIVE_RULES['object']))
@ -450,7 +676,7 @@ class SchemaConverter:
                self._add_primitive(dep, dep_rule)
        return n
-    def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], name: str, additional_properties: Union[bool, Any]):
+    def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], name: str, additional_properties: Optional[Union[bool, Any]]):
        prop_order = self._prop_order
        # sort by position in prop_order (if specified) then by original order
        sorted_props = [kv[0] for _, kv in sorted(enumerate(properties), key=lambda ikv: (prop_order.get(ikv[1][0], len(prop_order)), ikv[0]))]
@ -465,12 +691,16 @@ class SchemaConverter:
        required_props = [k for k in sorted_props if k in required]
        optional_props = [k for k in sorted_props if k not in required]
-        if additional_properties == True or isinstance(additional_properties, dict):
+        if additional_properties is not None and additional_properties != False:
            sub_name = f'{name}{"-" if name else ""}additional'
-            value_rule = self.visit({} if additional_properties == True else additional_properties, f'{sub_name}-value')
+            value_rule = self.visit(additional_properties, f'{sub_name}-value') if isinstance(additional_properties, dict) else \
                self._add_primitive('value', PRIMITIVE_RULES['value'])
            key_rule = self._add_primitive('string', PRIMITIVE_RULES['string']) if not sorted_props \
                else self._add_rule(f'{sub_name}-k', self._not_strings(sorted_props))
            prop_kv_rule_names["*"] = self._add_rule(
                f'{sub_name}-kv',
-                self._add_primitive('string', PRIMITIVE_RULES['string']) + f' ":" space {value_rule}'
+                f'{key_rule} ":" space {value_rule}'
            )
            optional_props.append("*")
@ -485,15 +715,11 @@ class SchemaConverter:
            def get_recursive_refs(ks, first_is_optional):
                [k, *rest] = ks
                kv_rule_name = prop_kv_rule_names[k]
-                if k == '*':
+                comma_ref = f'( "," space {kv_rule_name} )'
-                    res = self._add_rule(
+                if first_is_optional:
-                        f'{name}{"-" if name else ""}additional-kvs',
+                    res = comma_ref + ('*' if k == '*' else '?')
                        f'{kv_rule_name} ( "," space ' + kv_rule_name + ' )*'
                    )
                elif first_is_optional:
                    res = f'( "," space {kv_rule_name} )?'
                else:
-                    res = kv_rule_name
+                    res = kv_rule_name + (' ' + comma_ref + "*" if k == '*' else '')
                if len(rest) > 0:
                    res += ' ' + self._add_rule(
                        f'{name}{"-" if name else ""}{k}-rest',
--- a/examples/llama.android/llama/CMakeLists.txt
+++ b/examples/llama.android/llama/CMakeLists.txt
@ -1,55 +0,0 @@
 # For more information about using CMake with Android Studio, read the
 # documentation: https://d.android.com/studio/projects/add-native-code.html.
 # For more examples on how to use CMake, see https://github.com/android/ndk-samples.
 # Sets the minimum CMake version required for this project.
 cmake_minimum_required(VERSION 3.22.1)
 # Declares the project name. The project name can be accessed via ${ PROJECT_NAME},
 # Since this is the top level CMakeLists.txt, the project name is also accessible
 # with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level
 # build script scope).
 project("llama-android")
 ## Fetch latest llama.cpp from GitHub
 #include(FetchContent)
 #FetchContent_Declare(
 #        llama
 #        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
 #        GIT_TAG        master
 #)
 #
 ## Also provides "common"
 #FetchContent_MakeAvailable(llama)
 # llama.cpp CI uses the code from the current branch
 # ref: https://github.com/ggerganov/llama.cpp/pull/7341#issuecomment-2117617700
 add_subdirectory(../../../../../../ build-llama)
 # Creates and names a library, sets it as either STATIC
 # or SHARED, and provides the relative paths to its source code.
 # You can define multiple libraries, and CMake builds them for you.
 # Gradle automatically packages shared libraries with your APK.
 #
 # In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define
 # the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
 # is preferred for the same purpose.
 #
 # In order to load a library into your app from Java/Kotlin, you must call
 # System.loadLibrary() and pass the name of the library defined here;
 # for GameActivity/NativeActivity derived applications, the same library name must be
 # used in the AndroidManifest.xml file.
 add_library(${CMAKE_PROJECT_NAME} SHARED
    # List C/C++ source files with relative paths to this CMakeLists.txt.
        llama-android.cpp)
 # Specifies libraries CMake should link to your target library. You
 # can link libraries from various origins, such as libraries defined in this
 # build script, prebuilt third-party libraries, or Android system libraries.
 target_link_libraries(${CMAKE_PROJECT_NAME}
    # List libraries link to the target library
    llama
    common
    android
    log)
--- a/examples/llama.android/llama/src/main/cpp/CMakeLists.txt
+++ b/examples/llama.android/llama/src/main/cpp/CMakeLists.txt
@ -11,15 +11,15 @@ cmake_minimum_required(VERSION 3.22.1)
 # build script scope).
 project("llama-android")
-include(FetchContent)
+#include(FetchContent)
-FetchContent_Declare(
+#FetchContent_Declare(
-        llama
+#        llama
-        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
+#        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
-        GIT_TAG        master
+#        GIT_TAG        master
-)
+#)
 # Also provides "common"
-FetchContent_MakeAvailable(llama)
+#FetchContent_MakeAvailable(llama)
 # Creates and names a library, sets it as either STATIC
 # or SHARED, and provides the relative paths to its source code.
@ -30,6 +30,10 @@ FetchContent_MakeAvailable(llama)
 # the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
 # is preferred for the same purpose.
 #
 #load local llama.cpp
 add_subdirectory(../../../../../../ build-llama)
 # In order to load a library into your app from Java/Kotlin, you must call
 # System.loadLibrary() and pass the name of the library defined here;
 # for GameActivity/NativeActivity derived applications, the same library name must be
--- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp
+++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp
@ -5,7 +5,7 @@
 #include <string>
 #include <unistd.h>
 #include "llama.h"
-#include "common/common.h"
+#include "common.h"
 // Write C++ code here.
 //
--- a/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift
+++ b/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift
@ -131,23 +131,30 @@ class LlamaState: ObservableObject {
        messageLog += "\(text)"
        Task.detached {
            while await llamaContext.n_cur < llamaContext.n_len {
                let result = await llamaContext.completion_loop()
-            messageLog += "\(result)"
+                await MainActor.run {
                    self.messageLog += "\(result)"
                }
            }
            let t_end = DispatchTime.now().uptimeNanoseconds
-        let t_generation = Double(t_end - t_heat_end) / NS_PER_S
+            let t_generation = Double(t_end - t_heat_end) / self.NS_PER_S
            let tokens_per_second = Double(await llamaContext.n_len) / t_generation
            await llamaContext.clear()
-        messageLog += """
+
            await MainActor.run {
                self.messageLog += """
                    \n
                    Done
                    Heat up took \(t_heat)s
                    Generated \(tokens_per_second) t/s\n
                    """
            }
        }
    }
    func bench() async {
        guard let llamaContext else {
--- a/examples/llava/MobileVLM-README.md
+++ b/examples/llava/MobileVLM-README.md
@ -194,7 +194,7 @@ llama_print_timings:       total time =   44411.01 ms /   377 tokens
 ## Orin compile and run
 ### compile
 ```sh
-make LLAMA_CUDA=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
+make GGML_CUDA=1 CUDA_DOCKER_ARCH=sm_87 GGML_CUDA_F16=1 -j 32
 ```
 ### run on Orin
 ### case 1
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@ -1121,20 +1121,20 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            }
            if (n < 32)
                hparams.image_grid_pinpoints[n] = 0;
-        } catch (std::runtime_error & e) {
+        } catch (std::runtime_error & /*e*/) {
            hparams.image_grid_pinpoints[0]=0;
        }
        try {
            int idx = get_key_idx(ctx, KEY_MM_PATCH_MERGE_TYPE);
            strcpy(hparams.mm_patch_merge_type, gguf_get_val_str(ctx, idx));
-        } catch (std::runtime_error & e) {
+        } catch (std::runtime_error & /*e*/) {
            strcpy(hparams.mm_patch_merge_type, "flat");
        }
        try {
            hparams.image_crop_resolution = get_u32(ctx, KEY_IMAGE_CROP_RESOLUTION); // llava-1.6
-        } catch(const std::exception& e) {
+        } catch(const std::exception& /*e*/) {
            hparams.image_crop_resolution = hparams.image_size;
        }
@ -1173,7 +1173,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        try {
            vision_model.class_embedding  = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
            new_clip->has_class_embedding = true;
-        } catch (const std::exception& e) {
+        } catch (const std::exception& /*e*/) {
            new_clip->has_class_embedding = false;
        }
@ -1181,7 +1181,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            vision_model.pre_ln_w  = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
            vision_model.pre_ln_b  = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
            new_clip->has_pre_norm = true;
-        } catch (std::exception & e) {
+        } catch (std::exception & /*e*/) {
            new_clip->has_pre_norm = false;
        }
@ -1189,21 +1189,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            vision_model.post_ln_w  = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "weight"));
            vision_model.post_ln_b  = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias"));
            new_clip->has_post_norm = true;
-        } catch (std::exception & e) {
+        } catch (std::exception & /*e*/) {
            new_clip->has_post_norm = false;
        }
        try {
            vision_model.patch_bias = get_tensor(new_clip->ctx_data, TN_PATCH_BIAS);
            new_clip->has_patch_bias = true;
-        } catch (std::exception & e) {
+        } catch (std::exception & /*e*/) {
            new_clip->has_patch_bias = false;
        }
        try {
            vision_model.patch_embeddings    = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
            vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
-        } catch(const std::exception& e) {
+        } catch(const std::exception& /*e*/) {
            LOG_TEE("%s: failed to load vision model tensors\n", __func__);
        }
@ -1215,26 +1215,26 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
                // Yi-type llava
                vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "weight"));
                vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "bias"));
-            } catch (std::runtime_error & e) {  }
+            } catch (std::runtime_error & /*e*/) { }
            try {
                // missing in Yi-type llava
                vision_model.mm_2_w              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
                vision_model.mm_2_b              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
-            } catch (std::runtime_error & e) {  }
+            } catch (std::runtime_error & /*e*/) { }
            try {
                // Yi-type llava
                vision_model.mm_3_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "weight"));
                vision_model.mm_3_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "bias"));
-            } catch (std::runtime_error & e) {  }
+            } catch (std::runtime_error & /*e*/) { }
            try {
                // Yi-type llava
                vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight"));
                vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias"));
-            } catch (std::runtime_error & e) {  }
+            } catch (std::runtime_error & /*e*/) { }
            try {
                vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
                // LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__);
-            } catch (std::runtime_error & e) {  }
+            } catch (std::runtime_error & /*e*/) { }
        } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
            // MobileVLM projection
            vision_model.mm_model_mlp_1_w               = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight"));
--- a/examples/lookup/README.md
+++ b/examples/lookup/README.md
@ -10,4 +10,3 @@ More info:
 https://github.com/ggerganov/llama.cpp/pull/4484
 https://github.com/ggerganov/llama.cpp/issues/4226
--- a/examples/main-cmake-pkg/.gitignore
+++ b/examples/main-cmake-pkg/.gitignore
@ -48,4 +48,3 @@
 build*/
 out/
 tmp/
--- a/examples/main-cmake-pkg/CMakeLists.txt
+++ b/examples/main-cmake-pkg/CMakeLists.txt
@ -30,4 +30,3 @@ target_include_directories(${TARGET} PRIVATE ${_common_path})
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -117,6 +117,14 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v
    LOG_TEE("%s", text);
 }
 static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, std::string role, std::string content) {
    llama_chat_msg new_msg{role, content};
    auto formatted = llama_chat_format_single(
        model, g_params->chat_template, chat_msgs, new_msg, role == "user");
    chat_msgs.push_back({role, content});
    return formatted;
 }
 int main(int argc, char ** argv) {
    gpt_params params;
    g_params = &params;
@ -190,6 +198,7 @@ int main(int argc, char ** argv) {
    llama_model * model;
    llama_context * ctx;
    llama_context * ctx_guidance = NULL;
    std::vector<llama_chat_msg> chat_msgs;
    g_model = &model;
    g_ctx = &ctx;
@ -215,6 +224,8 @@ int main(int argc, char ** argv) {
                __func__, n_ctx_train, n_ctx);
    }
    LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
    // print system information
    {
        LOG_TEE("\n");
@ -249,16 +260,21 @@ int main(int argc, char ** argv) {
    std::vector<llama_token> embd_inp;
    {
        auto prompt = (params.conversation && params.enable_chat_template)
            ? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
            : params.prompt;
        if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
            LOG("tokenize the prompt\n");
-        embd_inp = ::llama_tokenize(ctx, params.prompt, true, true);
+            embd_inp = ::llama_tokenize(ctx, prompt, true, true);
        } else {
            LOG("use session tokens\n");
            embd_inp = session_tokens;
        }
-    LOG("prompt: \"%s\"\n", log_tostr(params.prompt));
+        LOG("prompt: \"%s\"\n", log_tostr(prompt));
        LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
    }
    // Should not run without any tokens
    if (embd_inp.empty()) {
@ -478,6 +494,7 @@ int main(int argc, char ** argv) {
    std::vector<int>   input_tokens;  g_input_tokens  = &input_tokens;
    std::vector<int>   output_tokens; g_output_tokens = &output_tokens;
    std::ostringstream output_ss;     g_output_ss     = &output_ss;
    std::ostringstream assistant_ss; // for storing current assistant message, used in conversation mode
    // the first thing we will do is to output the prompt, so set color accordingly
    console::set_display(console::prompt);
@ -793,11 +810,20 @@ int main(int argc, char ** argv) {
                        is_antiprompt = true;
                    }
                    if (params.enable_chat_template) {
                        chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
                    }
                    is_interacting = true;
                    printf("\n");
                }
            }
            // if current token is not EOG, we add it to current assistant message
            if (params.conversation) {
                auto id = llama_sampling_last(ctx_sampling);
                assistant_ss << llama_token_to_piece(ctx, id, false);
            }
            if (n_past > 0 && is_interacting) {
                LOG("waiting for user input\n");
@ -848,8 +874,13 @@ int main(int argc, char ** argv) {
                        string_process_escapes(buffer);
                    }
                    bool format_chat = params.conversation && params.enable_chat_template;
                    std::string user_inp = format_chat
                        ? chat_add_and_format(model, chat_msgs, "user", std::move(buffer))
                        : std::move(buffer);
                    // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
                    const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
-                    const auto line_inp = ::llama_tokenize(ctx, buffer,              false, false);
+                    const auto line_inp = ::llama_tokenize(ctx, user_inp,            false, format_chat);
                    const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
                    LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
@ -864,6 +895,9 @@ int main(int argc, char ** argv) {
                        output_ss << llama_token_to_piece(ctx, token);
                    }
                    // reset assistant message
                    assistant_ss.str("");
                    n_remain -= line_inp.size();
                    LOG("n_remain: %d\n", n_remain);
                } else {
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@ -1991,6 +1991,12 @@ int main(int argc, char ** argv) {
        params.n_batch = std::min(params.n_batch, n_kv);
    } else {
        params.n_batch = std::min(params.n_batch, params.n_ctx);
        if (params.kl_divergence) {
            params.n_parallel = 1;
        } else {
            // ensure there's at least enough seq_ids for HellaSwag
            params.n_parallel = std::max(4, params.n_parallel);
        }
    }
    if (params.ppl_stride > 0) {
@ -2015,9 +2021,6 @@ int main(int argc, char ** argv) {
    llama_model * model;
    llama_context * ctx;
    // ensure there's at least enough seq_ids for HellaSwag
    params.n_parallel = std::max(4, params.n_parallel);
    // load the model and apply lora adapter, if any
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
    if (model == NULL) {
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@ -16,37 +16,37 @@ struct quant_option {
 };
 static const std::vector<struct quant_option> QUANT_OPTIONS = {
-    { "Q4_0",   LLAMA_FTYPE_MOSTLY_Q4_0,   " 3.56G, +0.2166 ppl @ LLaMA-v1-7B", },
+    { "Q4_0",   LLAMA_FTYPE_MOSTLY_Q4_0,   " 4.34G, +0.4685 ppl @ Llama-3-8B",  },
-    { "Q4_1",   LLAMA_FTYPE_MOSTLY_Q4_1,   " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", },
+    { "Q4_1",   LLAMA_FTYPE_MOSTLY_Q4_1,   " 4.78G, +0.4511 ppl @ Llama-3-8B",  },
-    { "Q5_0",   LLAMA_FTYPE_MOSTLY_Q5_0,   " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
+    { "Q5_0",   LLAMA_FTYPE_MOSTLY_Q5_0,   " 5.21G, +0.1316 ppl @ Llama-3-8B",  },
-    { "Q5_1",   LLAMA_FTYPE_MOSTLY_Q5_1,   " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
+    { "Q5_1",   LLAMA_FTYPE_MOSTLY_Q5_1,   " 5.65G, +0.1062 ppl @ Llama-3-8B",  },
    { "IQ2_XXS",LLAMA_FTYPE_MOSTLY_IQ2_XXS," 2.06 bpw quantization",            },
    { "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization",            },
    { "IQ2_S",  LLAMA_FTYPE_MOSTLY_IQ2_S,  " 2.5  bpw quantization",            },
    { "IQ2_M",  LLAMA_FTYPE_MOSTLY_IQ2_M,  " 2.7  bpw quantization",            },
    { "IQ1_S",  LLAMA_FTYPE_MOSTLY_IQ1_S,  " 1.56 bpw quantization",            },
    { "IQ1_M",  LLAMA_FTYPE_MOSTLY_IQ1_M,  " 1.75 bpw quantization",            },
-    { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
+    { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.96G, +3.5199 ppl @ Llama-3-8B",  },
-    { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
+    { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B",  },
    { "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization",            },
    { "IQ3_S",  LLAMA_FTYPE_MOSTLY_IQ3_S,  " 3.44 bpw quantization",            },
    { "IQ3_M",  LLAMA_FTYPE_MOSTLY_IQ3_M,  " 3.66 bpw quantization mix",        },
    { "Q3_K",   LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M"                   },
    { "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization",             },
-    { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
+    { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 3.41G, +1.6321 ppl @ Llama-3-8B",  },
-    { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
+    { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.74G, +0.6569 ppl @ Llama-3-8B",  },
-    { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },
+    { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 4.03G, +0.5562 ppl @ Llama-3-8B",  },
    { "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.50 bpw non-linear quantization", },
    { "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", },
    { "Q4_K",   LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M",                  },
-    { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.59G, +0.0992 ppl @ LLaMA-v1-7B", },
+    { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 4.37G, +0.2689 ppl @ Llama-3-8B",  },
-    { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0532 ppl @ LLaMA-v1-7B", },
+    { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 4.58G, +0.1754 ppl @ Llama-3-8B",  },
    { "Q5_K",   LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M",                  },
-    { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", },
+    { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 5.21G, +0.1049 ppl @ Llama-3-8B",  },
-    { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
+    { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B",  },
-    { "Q6_K",   LLAMA_FTYPE_MOSTLY_Q6_K,   " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", },
+    { "Q6_K",   LLAMA_FTYPE_MOSTLY_Q6_K,   " 6.14G, +0.0217 ppl @ Llama-3-8B",  },
-    { "Q8_0",   LLAMA_FTYPE_MOSTLY_Q8_0,   " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
+    { "Q8_0",   LLAMA_FTYPE_MOSTLY_Q8_0,   " 7.96G, +0.0026 ppl @ Llama-3-8B",  },
-    { "F16",    LLAMA_FTYPE_MOSTLY_F16,    "14.00G, -0.0020 ppl @ Mistral-7B", },
+    { "F16",    LLAMA_FTYPE_MOSTLY_F16,    "14.00G, +0.0020 ppl @ Mistral-7B",  },
    { "BF16",   LLAMA_FTYPE_MOSTLY_BF16,   "14.00G, -0.0050 ppl @ Mistral-7B",  },
    { "F32",    LLAMA_FTYPE_ALL_F32,       "26.00G              @ 7B",          },
    // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
--- a/examples/retrieval/retrieval.cpp
+++ b/examples/retrieval/retrieval.cpp
@ -73,9 +73,10 @@ static std::vector<chunk> chunk_file(const std::string & filename, int chunk_siz
    return chunks;
 }
-static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
+static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
-    for (size_t i = 0; i < tokens.size(); i++) {
+    size_t n_tokens = tokens.size();
-        llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
+    for (size_t i = 0; i < n_tokens; i++) {
        llama_batch_add(batch, tokens[i], i, { seq_id }, true);
    }
 }
@ -160,6 +161,12 @@ int main(int argc, char ** argv) {
    const int n_ctx_train = llama_n_ctx_train(model);
    const int n_ctx = llama_n_ctx(ctx);
    const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
    if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
        fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
        return 1;
    }
    if (n_ctx > n_ctx_train) {
        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
                __func__, n_ctx_train, n_ctx);
--- a/examples/rpc/README.md
+++ b/examples/rpc/README.md
@ -29,13 +29,13 @@ You can also run multiple `rpc-server` instances on the same host, each with a d
 ## Usage
-On each host, build the corresponding backend with `cmake` and add `-DLLAMA_RPC=ON` to the build options.
+On each host, build the corresponding backend with `cmake` and add `-DGGML_RPC=ON` to the build options.
 For example, to build the CUDA backend with RPC support:
 ```bash
 mkdir build-rpc-cuda
 cd build-rpc-cuda
-cmake .. -DLLAMA_CUDA=ON -DLLAMA_RPC=ON
+cmake .. -DGGML_CUDA=ON -DGGML_RPC=ON
 cmake --build . --config Release
 ```
@ -58,12 +58,12 @@ $ CUDA_VISIBLE_DEVICES=0 bin/rpc-server -p 50052
 This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device.
-On the main host build `llama.cpp` only with `-DLLAMA_RPC=ON`:
+On the main host build `llama.cpp` only with `-DGGML_RPC=ON`:
 ```bash
 mkdir build-rpc
 cd build-rpc
-cmake .. -DLLAMA_RPC=ON
+cmake .. -DGGML_RPC=ON
 cmake --build . --config Release
 ```
--- a/examples/server-embd.py
+++ b/examples/server-embd.py
@ -31,4 +31,3 @@ for i in range(n-1):
        embedding2 = np.array(result[j])
        similarity = np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))
        print(f"Similarity between {i} and {j}: {similarity:.2f}")
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@ -1,7 +1,14 @@
 set(TARGET llama-server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
 option(LLAMA_SERVER_SSL     "Build SSL support for the server"        OFF)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
 if (MINGW)
    # fix: https://github.com/ggerganov/llama.cpp/actions/runs/9651004652/job/26617901362?pr=8006
    add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
 endif()
 set(TARGET_SRCS
    server.cpp
    utils.hpp
@ -24,6 +31,7 @@ set(PUBLIC_ASSETS
    prompt-formats.js
    json-schema-to-grammar.mjs
 )
 foreach(asset ${PUBLIC_ASSETS})
    set(input "${CMAKE_CURRENT_SOURCE_DIR}/public/${asset}")
    set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp")
@ -34,18 +42,23 @@ foreach(asset ${PUBLIC_ASSETS})
        COMMAND "${CMAKE_COMMAND}" "-DINPUT=${input}" "-DOUTPUT=${output}" -P "${PROJECT_SOURCE_DIR}/scripts/xxd.cmake"
    )
 endforeach()
 add_executable(${TARGET} ${TARGET_SRCS})
 install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
    SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
 target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
 if (LLAMA_SERVER_SSL)
    find_package(OpenSSL REQUIRED)
    target_link_libraries(${TARGET} PRIVATE OpenSSL::SSL OpenSSL::Crypto)
    target_compile_definitions(${TARGET} PRIVATE CPPHTTPLIB_OPENSSL_SUPPORT)
 endif()
 if (WIN32)
    TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
 endif()
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -73,6 +73,7 @@ The project is under active development, and we are [looking for feedback and co
 - `-fa`, `--flash-attn` : enable flash attention (default: disabled).
 - `-ctk TYPE`, `--cache-type-k TYPE` : KV cache data type for K (default: `f16`, options `f32`, `f16`, `q8_0`, `q4_0`, `q4_1`, `iq4_nl`, `q5_0`, or `q5_1`)
 - `-ctv TYPE`, `--cache-type-v TYPE` : KV cache type for V (default `f16`, see `-ctk` for options)
 - `--spm-infill` : Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.
 **If compiled with `LLAMA_SERVER_SSL=ON`**
 - `--ssl-key-file FNAME`: path to file a PEM-encoded SSL private key
--- a/examples/server/public/index-new.html
+++ b/examples/server/public/index-new.html
@ -634,12 +634,12 @@ return html`
      <div>
        <div class="grammar">
          <label for="template"></label>
-          <textarea id="grammar" name="grammar" placeholder="Use GBNF or JSON-Scheme + Converter" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
+          <textarea id="grammar" name="grammar" placeholder="Use GBNF or JSON Schema + Converter" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
        </div>
        <div class="grammar-columns">
          <div class="json-schema-controls">
            <input type="text" name="prop-order" placeholder="Order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
-            <button type="button" class="button-grammar" onclick=${convertJSONSchemaGrammar}>Convert JSON-Scheme</button>
+            <button type="button" class="button-grammar" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button>
          </div>
        </div>
      </div>
--- a/examples/server/public/json-schema-to-grammar.mjs
+++ b/examples/server/public/json-schema-to-grammar.mjs
@ -24,6 +24,201 @@ function _buildRepetition(itemRule, minItems, maxItems, opts={}) {
  return minItems === 0 ? `(${result})?` : result;
 }
 function _generateMinMaxInt(minValue, maxValue, out, decimalsLeft = 16, topLevel = true) {
  const hasMin = minValue !== null;
  const hasMax = maxValue !== null;
  function digitRange(fromChar, toChar) {
      out.push("[");
      if (fromChar === toChar) {
          out.push(fromChar);
      } else {
          out.push(fromChar);
          out.push("-");
          out.push(toChar);
      }
      out.push("]");
  }
  function moreDigits(minDigits, maxDigits) {
      out.push("[0-9]");
      if (minDigits === maxDigits && minDigits === 1) {
          return;
      }
      out.push("{");
      out.push(minDigits.toString());
      if (maxDigits !== minDigits) {
          out.push(",");
          if (maxDigits !== Number.MAX_SAFE_INTEGER) {
              out.push(maxDigits.toString());
          }
      }
      out.push("}");
  }
  function uniformRange(fromStr, toStr) {
      let i = 0;
      while (i < fromStr.length && fromStr[i] === toStr[i]) {
          i++;
      }
      if (i > 0) {
          out.push("\"");
          out.push(fromStr.slice(0, i));
          out.push("\"");
      }
      if (i < fromStr.length) {
          if (i > 0) {
              out.push(" ");
          }
          const subLen = fromStr.length - i - 1;
          if (subLen > 0) {
              const fromSub = fromStr.slice(i + 1);
              const toSub = toStr.slice(i + 1);
              const subZeros = "0".repeat(subLen);
              const subNines = "9".repeat(subLen);
              let toReached = false;
              out.push("(");
              if (fromSub === subZeros) {
                  digitRange(fromStr[i], String.fromCharCode(toStr.charCodeAt(i) - 1));
                  out.push(" ");
                  moreDigits(subLen, subLen);
              } else {
                  out.push("[");
                  out.push(fromStr[i]);
                  out.push("] ");
                  out.push("(");
                  uniformRange(fromSub, subNines);
                  out.push(")");
                  if (fromStr.charCodeAt(i) < toStr.charCodeAt(i) - 1) {
                      out.push(" | ");
                      if (toSub === subNines) {
                          digitRange(String.fromCharCode(fromStr.charCodeAt(i) + 1), toStr[i]);
                          toReached = true;
                      } else {
                          digitRange(String.fromCharCode(fromStr.charCodeAt(i) + 1), String.fromCharCode(toStr.charCodeAt(i) - 1));
                      }
                      out.push(" ");
                      moreDigits(subLen, subLen);
                  }
              }
              if (!toReached) {
                  out.push(" | ");
                  digitRange(toStr[i], toStr[i]);
                  out.push(" ");
                  uniformRange(subZeros, toSub);
              }
              out.push(")");
          } else {
              out.push("[");
              out.push(fromStr[i]);
              out.push("-");
              out.push(toStr[i]);
              out.push("]");
          }
      }
  }
  if (hasMin && hasMax) {
      if (minValue < 0 && maxValue < 0) {
          out.push("\"-\" (");
          _generateMinMaxInt(-maxValue, -minValue, out, decimalsLeft, true);
          out.push(")");
          return;
      }
      if (minValue < 0) {
          out.push("\"-\" (");
          _generateMinMaxInt(0, -minValue, out, decimalsLeft, true);
          out.push(") | ");
          minValue = 0;
      }
      let minS = minValue.toString();
      const maxS = maxValue.toString();
      const minDigits = minS.length;
      const maxDigits = maxS.length;
      for (let digits = minDigits; digits < maxDigits; digits++) {
          uniformRange(minS, "9".repeat(digits));
          minS = "1" + "0".repeat(digits);
          out.push(" | ");
      }
      uniformRange(minS, maxS);
      return;
  }
  const lessDecimals = Math.max(decimalsLeft - 1, 1);
  if (hasMin) {
      if (minValue < 0) {
          out.push("\"-\" (");
          _generateMinMaxInt(null, -minValue, out, decimalsLeft, false);
          out.push(") | [0] | [1-9] ");
          moreDigits(0, decimalsLeft - 1);
      } else if (minValue === 0) {
          if (topLevel) {
              out.push("[0] | [1-9] ");
              moreDigits(0, lessDecimals);
          } else {
              moreDigits(1, decimalsLeft);
          }
      } else if (minValue <= 9) {
          const c = minValue.toString();
          const range_start = topLevel ? '1' : '0';
          if (c > range_start) {
              digitRange(range_start, String.fromCharCode(c.charCodeAt(0) - 1));
              out.push(" ");
              moreDigits(1, lessDecimals);
              out.push(" | ");
          }
          digitRange(c, "9");
          out.push(" ");
          moreDigits(0, lessDecimals);
      } else {
          const minS = minValue.toString();
          const length = minS.length;
          const c = minS[0];
          if (c > "1") {
              digitRange(topLevel ? "1" : "0", String.fromCharCode(c.charCodeAt(0) - 1));
              out.push(" ");
              moreDigits(length, lessDecimals);
              out.push(" | ");
          }
          digitRange(c, c);
          out.push(" (");
          _generateMinMaxInt(parseInt(minS.slice(1)), null, out, lessDecimals, false);
          out.push(")");
          if (c < "9") {
              out.push(" | ");
              digitRange(String.fromCharCode(c.charCodeAt(0) + 1), "9");
              out.push(" ");
              moreDigits(length - 1, lessDecimals);
          }
      }
      return;
  }
  if (hasMax) {
      if (maxValue >= 0) {
          if (topLevel) {
              out.push("\"-\" [1-9] ");
              moreDigits(0, lessDecimals);
              out.push(" | ");
          }
          _generateMinMaxInt(0, maxValue, out, decimalsLeft, true);
      } else {
          out.push("\"-\" (");
          _generateMinMaxInt(-maxValue, null, out, decimalsLeft, false);
          out.push(")");
      }
      return;
  }
  throw new Error("At least one of minValue or maxValue must be set");
 }
 class BuiltinRule {
  constructor(content, deps) {
    this.content = content;
@ -64,7 +259,7 @@ const GRAMMAR_RANGE_LITERAL_ESCAPE_RE = /[\n\r"\]\-\\]/g;
 const GRAMMAR_LITERAL_ESCAPES = { '\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]' };
 const NON_LITERAL_SET = new Set('|.()[]{}*+?');
-const ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = new Set('[]()|{}*+?');
+const ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = new Set('^$.[]()|{}*+?');
 export class SchemaConverter {
  constructor(options) {
@ -337,6 +532,64 @@ export class SchemaConverter {
    return this._addRule(name, "\"\\\"\" " + toRule(transform()) + " \"\\\"\" space")
  }
  _notStrings(strings) {
    class TrieNode {
      constructor() {
        this.children = {};
        this.isEndOfString = false;
      }
      insert(str) {
        let node = this;
        for (const c of str) {
          node = node.children[c] = node.children[c] || new TrieNode();
        }
        node.isEndOfString = true;
      }
    }
    const trie = new TrieNode();
    for (const s of strings) {
      trie.insert(s);
    }
    const charRuleName = this._addPrimitive('char', PRIMITIVE_RULES['char']);
    const out = ['["] ( '];
    const visit = (node) => {
      const rejects = [];
      let first = true;
      for (const c of Object.keys(node.children).sort()) {
        const child = node.children[c];
        rejects.push(c);
        if (first) {
          first = false;
        } else {
          out.push(' | ');
        }
        out.push(`[${c}]`);
        if (Object.keys(child.children).length > 0) {
          out.push(' (');
          visit(child);
          out.push(')');
        } else if (child.isEndOfString) {
          out.push(` ${charRuleName}+`);
        }
      }
      if (Object.keys(node.children).length > 0) {
        if (!first) {
          out.push(' | ');
        }
        out.push(`[^"${rejects.join('')}] ${charRuleName}*`);
      }
    };
    visit(trie);
    out.push(` )${trie.isEndOfString ? '' : '?'} ["] space`);
    return out.join('');
  }
  _resolveRef(ref) {
    let refName = ref.split('/').pop();
    if (!(refName in this._rules) && !this._refsBeingResolved.has(ref)) {
@ -363,11 +616,11 @@ export class SchemaConverter {
    } else if (schema.oneOf || schema.anyOf) {
      return this._addRule(ruleName, this._generateUnionRule(name, schema.oneOf || schema.anyOf));
    } else if (Array.isArray(schemaType)) {
-      return this._addRule(ruleName, this._generateUnionRule(name, schemaType.map(t => ({ type: t }))));
+      return this._addRule(ruleName, this._generateUnionRule(name, schemaType.map(t => ({...schema, type: t}))));
    } else if ('const' in schema) {
-      return this._addRule(ruleName, this._generateConstantRule(schema.const));
+      return this._addRule(ruleName, this._generateConstantRule(schema.const) + ' space');
    } else if ('enum' in schema) {
-      const rule = schema.enum.map(v => this._generateConstantRule(v)).join(' | ');
+      const rule = '(' + schema.enum.map(v => this._generateConstantRule(v)).join(' | ') + ') space';
      return this._addRule(ruleName, rule);
    } else if ((schemaType === undefined || schemaType === 'object') &&
               ('properties' in schema ||
@ -404,7 +657,7 @@ export class SchemaConverter {
        }
      }
-      return this._addRule(ruleName, this._buildObjectRule(properties, required, name, /* additionalProperties= */ false));
+      return this._addRule(ruleName, this._buildObjectRule(properties, required, name, null));
    } else if ((schemaType === undefined || schemaType === 'array') && ('items' in schema || 'prefixItems' in schema)) {
      const items = schema.items ?? schema.prefixItems;
      if (Array.isArray(items)) {
@ -435,6 +688,24 @@ export class SchemaConverter {
      const minLen = schema.minLength || 0;
      const maxLen = schema.maxLength;
      return this._addRule(ruleName, '"\\\"" ' + _buildRepetition(charRuleName, minLen, maxLen) + ' "\\\"" space');
    } else if (schemaType === 'integer' && ('minimum' in schema || 'exclusiveMinimum' in schema || 'maximum' in schema || 'exclusiveMaximum' in schema)) {
      let minValue = null;
      let maxValue = null;
      if ('minimum' in schema) {
        minValue = schema.minimum;
      } else if ('exclusiveMinimum' in schema) {
        minValue = schema.exclusiveMinimum + 1;
      }
      if ('maximum' in schema) {
        maxValue = schema.maximum;
      } else if ('exclusiveMaximum' in schema) {
        maxValue = schema.exclusiveMaximum - 1;
      }
      const out = ["("];
      _generateMinMaxInt(minValue, maxValue, out);
      out.push(") space");
      return this._addRule(ruleName, out.join(''));
    } else if ((schemaType === 'object') || (Object.keys(schema).length === 0)) {
      return this._addRule(ruleName, this._addPrimitive('object', PRIMITIVE_RULES['object']));
    } else {
@ -480,12 +751,19 @@ export class SchemaConverter {
    const requiredProps = sortedProps.filter(k => required.has(k));
    const optionalProps = sortedProps.filter(k => !required.has(k));
-    if (typeof additionalProperties === 'object' || additionalProperties === true) {
+    if (additionalProperties) {
      const subName = `${name ?? ''}${name ? '-' : ''}additional`;
-      const valueRule = this.visit(additionalProperties === true ? {} : additionalProperties, `${subName}-value`);
+      const valueRule =
        additionalProperties != null && typeof additionalProperties === 'object' ? this.visit(additionalProperties, `${subName}-value`)
        : this._addPrimitive('value', PRIMITIVE_RULES['value']);
      const key_rule =
        sortedProps.length === 0 ? this._addPrimitive('string', PRIMITIVE_RULES['string'])
        : this._addRule(`${subName}-k`, this._notStrings(sortedProps));
      propKvRuleNames['*'] = this._addRule(
        `${subName}-kv`,
-        `${this._addPrimitive('string', PRIMITIVE_RULES['string'])} ":" space ${valueRule}`);
+        `${key_rule} ":" space ${valueRule}`);
      optionalProps.push('*');
    }
@ -502,15 +780,11 @@ export class SchemaConverter {
        const [k, ...rest] = ks;
        const kvRuleName = propKvRuleNames[k];
        let res;
-        if (k === '*') {
+        const commaRef = `( "," space ${kvRuleName} )`;
-            res = this._addRule(
+        if (firstIsOptional) {
-                `${name ?? ''}${name ? '-' : ''}additional-kvs`,
+          res = commaRef + (k === '*' ? '*' : '?');
                `${kvRuleName} ( "," space ` + kvRuleName + ` )*`
            )
        } else if (firstIsOptional) {
          res = `( "," space ${kvRuleName} )?`;
        } else {
-          res = kvRuleName;
+          res = kvRuleName + (k === '*' ? ' ' + commaRef + '*' : '');
        }
        if (rest.length > 0) {
          res += ' ' + this._addRule(
--- a/examples/server/public_simplechat/readme.md
+++ b/examples/server/public_simplechat/readme.md
@ -3,6 +3,13 @@
 by Humans for All.
 ## quickstart
 To run from the build dir
 bin/llama-server -m path/model.gguf --path ../examples/server/public_simplechat
 Continue reading for the details.
 ## overview
@ -14,6 +21,8 @@ own system prompts.
 This allows seeing the generated text / ai-model response in oneshot at the end, after it is fully generated,
 or potentially as it is being generated, in a streamed manner from the server/ai-model.
 ![Chat and Settings screens](./simplechat_screens.webp "Chat and Settings screens")
 Auto saves the chat session locally as and when the chat is progressing and inturn at a later time when you
 open SimpleChat, option is provided to restore the old chat session, if a matching one exists.
@ -170,17 +179,23 @@ It is attached to the document object. Some of these can also be updated using t
    The histogram/freq based trimming logic is currently tuned for english language wrt its
    is-it-a-alpabetic|numeral-char regex match logic.
-  chatRequestOptions - maintains the list of options/fields to send along with chat request,
+  apiRequestOptions - maintains the list of options/fields to send along with api request,
  irrespective of whether /chat/completions or /completions endpoint.
    If you want to add additional options/fields to send to the server/ai-model, and or
    modify the existing options value or remove them, for now you can update this global var
    using browser's development-tools/console.
-    For string and numeric fields in chatRequestOptions, including even those added by a user
+    For string, numeric and boolean fields in apiRequestOptions, including even those added by a
-    at runtime by directly modifying gMe.chatRequestOptions, setting ui entries will be auto
+    user at runtime by directly modifying gMe.apiRequestOptions, setting ui entries will be auto
    created.
    cache_prompt option supported by example/server is allowed to be controlled by user, so that
    any caching supported wrt system-prompt and chat history, if usable can get used. When chat
    history sliding window is enabled, cache_prompt logic may or may not kick in at the backend
    wrt same, based on aspects related to model, positional encoding, attention mechanism etal.
    However system prompt should ideally get the benefit of caching.
  headers - maintains the list of http headers sent when request is made to the server. By default
  Content-Type is set to application/json. Additionally Authorization entry is provided, which can
  be set if needed using the settings ui.
@ -197,10 +212,10 @@ It is attached to the document object. Some of these can also be updated using t
    >0 : Send the latest chat history from the latest system prompt, limited to specified cnt.
-By using gMe's iRecentUserMsgCnt and chatRequestOptions.max_tokens one can try to control the
+By using gMe's iRecentUserMsgCnt and apiRequestOptions.max_tokens/n_predict one can try to control
-implications of loading of the ai-model's context window by chat history, wrt chat response to
+the implications of loading of the ai-model's context window by chat history, wrt chat response to
-some extent in a simple crude way. You may also want to control the context size enabled when
+some extent in a simple crude way. You may also want to control the context size enabled when the
-the server loads ai-model, on the server end.
+server loads ai-model, on the server end.
 Sometimes the browser may be stuborn with caching of the file, so your updates to html/css/js
@ -237,12 +252,12 @@ also be started with a model context size of 1k or more, to be on safe side.
  internal n_predict, for now add the same here on the client side, maybe later add max_tokens
  to /completions endpoint handling code on server side.
-NOTE: One may want to experiment with frequency/presence penalty fields in chatRequestOptions
+NOTE: One may want to experiment with frequency/presence penalty fields in apiRequestOptions
-wrt the set of fields sent to server along with the user query. To check how the model behaves
+wrt the set of fields sent to server along with the user query, to check how the model behaves
 wrt repeatations in general in the generated text response.
 A end-user can change these behaviour by editing gMe from browser's devel-tool/console or by
-using the providing settings ui.
+using the provided settings ui (for settings exposed through the ui).
 ### OpenAi / Equivalent API WebService
@ -253,7 +268,7 @@ for a minimal chatting experimentation by setting the below.
 * the baseUrl in settings ui
  * https://api.openai.com/v1 or similar
-* Wrt request body - gMe.chatRequestOptions
+* Wrt request body - gMe.apiRequestOptions
  * model (settings ui)
  * any additional fields if required in future
--- a/examples/server/public_simplechat/simplechat.js
+++ b/examples/server/public_simplechat/simplechat.js
@ -222,8 +222,8 @@ class SimpleChat {
     * @param {Object} obj
     */
    request_jsonstr_extend(obj) {
-        for(let k in gMe.chatRequestOptions) {
+        for(let k in gMe.apiRequestOptions) {
-            obj[k] = gMe.chatRequestOptions[k];
+            obj[k] = gMe.apiRequestOptions[k];
        }
        if (gMe.bStream) {
            obj["stream"] = true;
@ -740,11 +740,12 @@ class Me {
            "Authorization": "", // Authorization: Bearer OPENAI_API_KEY
        }
        // Add needed fields wrt json object to be sent wrt LLM web services completions endpoint.
-        this.chatRequestOptions = {
+        this.apiRequestOptions = {
            "model": "gpt-3.5-turbo",
            "temperature": 0.7,
            "max_tokens": 1024,
            "n_predict": 1024,
            "cache_prompt": false,
            //"frequency_penalty": 1.2,
            //"presence_penalty": 1.2,
        };
@ -800,51 +801,55 @@ class Me {
            ui.el_create_append_p(`bStream:${this.bStream}`, elDiv);
            ui.el_create_append_p(`bTrimGarbage:${this.bTrimGarbage}`, elDiv);
            ui.el_create_append_p(`ApiEndPoint:${this.apiEP}`, elDiv);
            ui.el_create_append_p(`iRecentUserMsgCnt:${this.iRecentUserMsgCnt}`, elDiv);
            ui.el_create_append_p(`bCompletionFreshChatAlways:${this.bCompletionFreshChatAlways}`, elDiv);
            ui.el_create_append_p(`bCompletionInsertStandardRolePrefix:${this.bCompletionInsertStandardRolePrefix}`, elDiv);
            ui.el_create_append_p(`bTrimGarbage:${this.bTrimGarbage}`, elDiv);
            ui.el_create_append_p(`iRecentUserMsgCnt:${this.iRecentUserMsgCnt}`, elDiv);
            ui.el_create_append_p(`ApiEndPoint:${this.apiEP}`, elDiv);
        }
-        ui.el_create_append_p(`chatRequestOptions:${JSON.stringify(this.chatRequestOptions, null, " - ")}`, elDiv);
+        ui.el_create_append_p(`apiRequestOptions:${JSON.stringify(this.apiRequestOptions, null, " - ")}`, elDiv);
        ui.el_create_append_p(`headers:${JSON.stringify(this.headers, null, " - ")}`, elDiv);
    }
    /**
-     * Auto create ui input elements for fields in ChatRequestOptions
+     * Auto create ui input elements for fields in apiRequestOptions
     * Currently supports text and number field types.
     * @param {HTMLDivElement} elDiv
     */
-    show_settings_chatrequestoptions(elDiv) {
+    show_settings_apirequestoptions(elDiv) {
        let typeDict = {
            "string": "text",
            "number": "number",
        };
        let fs = document.createElement("fieldset");
        let legend = document.createElement("legend");
-        legend.innerText = "ChatRequestOptions";
+        legend.innerText = "ApiRequestOptions";
        fs.appendChild(legend);
        elDiv.appendChild(fs);
-        for(const k in this.chatRequestOptions) {
+        for(const k in this.apiRequestOptions) {
-            let val = this.chatRequestOptions[k];
+            let val = this.apiRequestOptions[k];
            let type = typeof(val);
-            if (!((type == "string") || (type == "number"))) {
+            if (((type == "string") || (type == "number"))) {
-                continue;
+                let inp = ui.el_creatediv_input(`Set${k}`, k, typeDict[type], this.apiRequestOptions[k], (val)=>{
            }
            let inp = ui.el_creatediv_input(`Set${k}`, k, typeDict[type], this.chatRequestOptions[k], (val)=>{
                    if (type == "number") {
                        val = Number(val);
                    }
-                this.chatRequestOptions[k] = val;
+                    this.apiRequestOptions[k] = val;
                });
                fs.appendChild(inp.div);
            } else if (type == "boolean") {
                let bbtn = ui.el_creatediv_boolbutton(`Set{k}`, k, {true: "true", false: "false"}, val, (userVal)=>{
                    this.apiRequestOptions[k] = userVal;
                });
                fs.appendChild(bbtn.div);
            }
        }
    }
@ -870,6 +875,23 @@ class Me {
        });
        elDiv.appendChild(bb.div);
        bb = ui.el_creatediv_boolbutton("SetTrimGarbage", "TrimGarbage", {true: "[+] yes trim", false: "[-] dont trim"}, this.bTrimGarbage, (val)=>{
            this.bTrimGarbage = val;
        });
        elDiv.appendChild(bb.div);
        this.show_settings_apirequestoptions(elDiv);
        let sel = ui.el_creatediv_select("SetApiEP", "ApiEndPoint", ApiEP.Type, this.apiEP, (val)=>{
            this.apiEP = ApiEP.Type[val];
        });
        elDiv.appendChild(sel.div);
        sel = ui.el_creatediv_select("SetChatHistoryInCtxt", "ChatHistoryInCtxt", this.sRecentUserMsgCnt, this.iRecentUserMsgCnt, (val)=>{
            this.iRecentUserMsgCnt = this.sRecentUserMsgCnt[val];
        });
        elDiv.appendChild(sel.div);
        bb = ui.el_creatediv_boolbutton("SetCompletionFreshChatAlways", "CompletionFreshChatAlways", {true: "[+] yes fresh", false: "[-] no, with history"}, this.bCompletionFreshChatAlways, (val)=>{
            this.bCompletionFreshChatAlways = val;
        });
@ -880,23 +902,6 @@ class Me {
        });
        elDiv.appendChild(bb.div);
        bb = ui.el_creatediv_boolbutton("SetTrimGarbage", "TrimGarbage", {true: "[+] yes trim", false: "[-] dont trim"}, this.bTrimGarbage, (val)=>{
            this.bTrimGarbage = val;
        });
        elDiv.appendChild(bb.div);
        let sel = ui.el_creatediv_select("SetChatHistoryInCtxt", "ChatHistoryInCtxt", this.sRecentUserMsgCnt, this.iRecentUserMsgCnt, (val)=>{
            this.iRecentUserMsgCnt = this.sRecentUserMsgCnt[val];
        });
        elDiv.appendChild(sel.div);
        sel = ui.el_creatediv_select("SetApiEP", "ApiEndPoint", ApiEP.Type, this.apiEP, (val)=>{
            this.apiEP = ApiEP.Type[val];
        });
        elDiv.appendChild(sel.div);
        this.show_settings_chatrequestoptions(elDiv);
    }
 }
--- a/examples/server/public_simplechat/simplechat_screens.webp
+++ b/examples/server/public_simplechat/simplechat_screens.webp
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -2020,6 +2020,7 @@ struct server_context {
                        slot.t_start_generation = 0;
                        if (slot.infill) {
                            const bool add_bos = llama_should_add_bos_token(model);
                            bool suff_rm_leading_spc = true;
                            if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
                                params.input_suffix.erase(0, 1);
@ -2035,16 +2036,21 @@ struct server_context {
                            }
                            prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
-                            prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
+                            suffix_tokens.insert(suffix_tokens.begin(), llama_token_suffix(model));
-                            prefix_tokens.insert(prefix_tokens.end(),   llama_token_suffix(model));
+
-                            prefix_tokens.insert(prefix_tokens.end(),   suffix_tokens.begin(), suffix_tokens.end());
+                            auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens;
                            auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens;
                            if (add_bos) {
                                embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
                            }
                            embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
                            const llama_token middle_token = llama_token_middle(model);
                            if (middle_token >= 0) {
-                                prefix_tokens.push_back(middle_token);
+                                embd_inp.push_back(middle_token);
                            }
-                            prompt_tokens = prefix_tokens;
+                            prompt_tokens = embd_inp;
                        } else {
                            prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
                        }
@ -2606,16 +2612,8 @@ int main(int argc, char ** argv) {
    // print sample chat example to make it clear which template is used
    {
        json chat;
        chat.push_back({{"role", "system"},    {"content", "You are a helpful assistant"}});
        chat.push_back({{"role", "user"},      {"content", "Hello"}});
        chat.push_back({{"role", "assistant"}, {"content", "Hi there"}});
        chat.push_back({{"role", "user"},      {"content", "How are you?"}});
        const std::string chat_example = format_chat(ctx_server.model, params.chat_template, chat);
        LOG_INFO("chat template", {
-            {"chat_example", chat_example},
+            {"chat_example", llama_chat_format_example(ctx_server.model, params.chat_template)},
            {"built_in",     params.chat_template.empty()},
        });
    }
--- a/examples/server/tests/features/passkey.feature
+++ b/examples/server/tests/features/passkey.feature
@ -52,4 +52,3 @@ Feature: Passkey / Self-extend with context shift
      #| TheBloke/Llama-2-7B-GGUF        | llama-2-7b.Q2_K.gguf        | 4096        | 3   | 16384 | 512     | 4    | 512    | 500    | 300   | 1234    | 5           | 1234           |
      #| TheBloke/Mixtral-8x7B-v0.1-GGUF | mixtral-8x7b-v0.1.Q2_K.gguf | 32768       | 2   | 16384 | 512     | 4    | 512    | 500    | 100   | 0987    | 5           | 0
      # 987           |
--- a/examples/server/tests/features/server.feature
+++ b/examples/server/tests/features/server.feature
@ -82,7 +82,7 @@ Feature: llama.cpp server
    Examples: Prompts
      | response_format                                                     | n_predicted | re_content             |
-      | {"type": "json_object", "schema": {"const": "42"}}                  | 5           | "42"                   |
+      | {"type": "json_object", "schema": {"const": "42"}}                  | 6           | "42"                   |
      | {"type": "json_object", "schema": {"items": [{"type": "integer"}]}} | 10          | \[ -300 \]             |
      | {"type": "json_object"}                                             | 10          | \{ " Jacky.            |
--- a/examples/server/themes/buttons-top/index.html
+++ b/examples/server/themes/buttons-top/index.html
@ -1054,4 +1054,3 @@
 </body>
 </html>
--- a/examples/server/themes/wild/index.html
+++ b/examples/server/themes/wild/index.html
@ -1058,4 +1058,3 @@
 </body>
 </html>
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@ -118,36 +118,17 @@ static inline void server_log(const char * level, const char * function, int lin
 // Format given chat. If tmpl is empty, we take the template from model metadata
 inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
-    size_t alloc_size = 0;
+    std::vector<llama_chat_msg> chat;
    // vector holding all allocated string to be passed to llama_chat_apply_template
    std::vector<std::string> str(messages.size() * 2);
    std::vector<llama_chat_message> chat(messages.size());
    for (size_t i = 0; i < messages.size(); ++i) {
        const auto & curr_msg = messages[i];
-        str[i*2 + 0]    = json_value(curr_msg, "role",    std::string(""));
+        std::string role    = json_value(curr_msg, "role",    std::string(""));
-        str[i*2 + 1]    = json_value(curr_msg, "content", std::string(""));
+        std::string content = json_value(curr_msg, "content", std::string(""));
-        alloc_size     += str[i*2 + 1].length();
+        chat.push_back({role, content});
        chat[i].role    = str[i*2 + 0].c_str();
        chat[i].content = str[i*2 + 1].c_str();
    }
-    const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
+    auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true);
    std::vector<char> buf(alloc_size * 2);
    // run the first time to get the total output length
    int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
    // if it turns out that our buffer is too small, we resize it
    if ((size_t) res > buf.size()) {
        buf.resize(res);
        res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
    }
    const std::string formatted_chat(buf.data(), res);
    LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
    return formatted_chat;
 }
--- a/examples/sycl/build.sh
+++ b/examples/sycl/build.sh
@ -8,10 +8,10 @@ cd build
 source /opt/intel/oneapi/setvars.sh
 #for FP16
-#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference
+#cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON # faster for long-prompt inference
 #for FP32
-cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 #build example/main
 #cmake --build . --config Release --target main
--- a/examples/sycl/run-llama2.sh
+++ b/examples/sycl/run-llama2.sh
@ -34,4 +34,3 @@ fi
 #use multiple GPUs with same max compute units
 #ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
--- a/examples/sycl/win-build-sycl.bat
+++ b/examples/sycl/win-build-sycl.bat
@ -13,16 +13,16 @@ if %errorlevel% neq 0 goto ERROR
 ::  for FP16
 ::  faster for long-prompt inference
-::  cmake -G "MinGW Makefiles" ..  -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
+::  cmake -G "MinGW Makefiles" ..  -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
 ::  for FP32
-cmake -G "MinGW Makefiles" ..  -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
+cmake -G "Ninja" ..  -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
 if %errorlevel% neq 0 goto ERROR
 ::  build example/main only
 ::  make main
 ::  build all binary
-make -j
+cmake --build . -j
 if %errorlevel% neq 0 goto ERROR
 cd ..
@ -31,4 +31,3 @@ exit /B 0
 :ERROR
 echo comomand error: %errorlevel%
 exit /B %errorlevel%
--- a/examples/sycl/win-run-llama2.bat
+++ b/examples/sycl/win-run-llama2.bat
@ -7,5 +7,3 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
 .\build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 33 -s 0
--- a/flake.lock
+++ b/flake.lock
@ -20,11 +20,11 @@
    },
    "nixpkgs": {
      "locked": {
-        "lastModified": 1718318537,
+        "lastModified": 1719506693,
-        "narHash": "sha256-4Zu0RYRcAY/VWuu6awwq4opuiD//ahpc2aFHg2CWqFY=",
+        "narHash": "sha256-C8e9S7RzshSdHB7L+v9I51af1gDM5unhJ2xO1ywxNH8=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "e9ee548d90ff586a6471b4ae80ae9cfcbceb3420",
+        "rev": "b2852eb9365c6de48ffb0dc2c9562591f652242a",
        "type": "github"
      },
      "original": {
--- a/ggml-cuda/mmq.cuh
+++ b/ggml-cuda/mmq.cuh
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@ -0,0 +1,240 @@
 cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
 project("ggml" C CXX)
 include(CheckIncludeFileCXX)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
    set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
 endif()
 if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
    set(GGML_STANDALONE ON)
    set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
    # configure project version
    # TODO
 else()
    set(GGML_STANDALONE OFF)
 endif()
 if (EMSCRIPTEN)
    set(BUILD_SHARED_LIBS_DEFAULT OFF)
    option(GGML_WASM_SINGLE_FILE "ggml: embed WASM inside the generated ggml.js" ON)
 else()
    if (MINGW)
        set(BUILD_SHARED_LIBS_DEFAULT OFF)
    else()
        set(BUILD_SHARED_LIBS_DEFAULT ON)
    endif()
 endif()
 option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
 #
 # option list
 #
 # TODO: mark all options as advanced when not GGML_STANDALONE
 if (APPLE)
    set(GGML_METAL_DEFAULT ON)
    set(GGML_BLAS_DEFAULT ON)
    set(GGML_BLAS_VENDOR_DEFAULT "Apple")
 else()
    set(GGML_METAL_DEFAULT OFF)
    set(GGML_BLAS_DEFAULT OFF)
    set(GGML_BLAS_VENDOR_DEFAULT "Generic")
 endif()
 # general
 option(GGML_STATIC "ggml: static link libraries"         OFF)
 option(GGML_NATIVE "ggml: enable -march=native flag"     ON)
 option(GGML_LTO    "ggml: enable link time optimization" OFF)
 option(GGML_CCACHE "ggml: use ccache if available"       ON)
 # debug
 option(GGML_ALL_WARNINGS           "ggml: enable all compiler warnings"                   ON)
 option(GGML_ALL_WARNINGS_3RD_PARTY "ggml: enable all compiler warnings in 3rd party libs" OFF)
 option(GGML_GPROF                  "ggml: enable gprof"                                   OFF)
 # build
 option(GGML_FATAL_WARNINGS    "ggml: enable -Werror flag"    OFF)
 # sanitizers
 option(GGML_SANITIZE_THREAD    "ggml: enable thread sanitizer"    OFF)
 option(GGML_SANITIZE_ADDRESS   "ggml: enable address sanitizer"   OFF)
 option(GGML_SANITIZE_UNDEFINED "ggml: enable undefined sanitizer" OFF)
 # instruction set specific
 if (GGML_NATIVE)
    set(INS_ENB OFF)
 else()
    set(INS_ENB ON)
 endif()
 option(GGML_CPU_HBM     "ggml: use memkind for CPU HBM" OFF)
 option(GGML_AVX         "ggml: enable AVX"              ${INS_ENB})
 option(GGML_AVX2        "ggml: enable AVX2"             ${INS_ENB})
 option(GGML_AVX512      "ggml: enable AVX512"           OFF)
 option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI"      OFF)
 option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI"      OFF)
 option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16"      OFF)
 option(GGML_FMA         "ggml: enable FMA"              ${INS_ENB})
 if (NOT MSVC)
    option(GGML_F16C    "ggml: enable F16C"             ${INS_ENB}) # in MSVC F16C is implied with AVX2/AVX512
 endif()
 option(GGML_LASX        "ggml: enable lasx"             ON)
 option(GGML_LSX         "ggml: enable lsx"              ON)
 option(GGML_SVE         "ggml: enable SVE"              OFF)
 if (WIN32)
    set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows Version")
 endif()
 # ggml core
 set(GGML_SCHED_MAX_COPIES  "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
 # 3rd party libs / backends
 option(GGML_ACCELERATE                      "ggml: enable Accelerate framework"               ON)
 option(GGML_BLAS                            "ggml: use BLAS"                                  ${GGML_BLAS_DEFAULT})
 set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
                                            "ggml: BLAS library vendor")
 option(GGML_LLAMAFILE                       "ggml: use ggml SGEMM"                            OFF)
 option(GGML_CUDA                            "ggml: use CUDA"                                  OFF)
 option(GGML_CUDA_FORCE_DMMV                 "ggml: use dmmv instead of mmvq CUDA kernels"     OFF)
 option(GGML_CUDA_FORCE_MMQ                  "ggml: use mmq kernels instead of cuBLAS"         OFF)
 option(GGML_CUDA_FORCE_CUBLAS               "ggml: always use cuBLAS instead of mmq kernels"  OFF)
 set   (GGML_CUDA_DMMV_X   "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels")
 set   (GGML_CUDA_MMV_Y     "1" CACHE STRING "ggml: y block size for mmv CUDA kernels")
 option(GGML_CUDA_F16                        "ggml: use 16 bit floats for some calculations"   OFF)
 set   (GGML_CUDA_KQUANTS_ITER "2" CACHE STRING
                                            "ggml: iters./thread per block for Q2_K/Q6_K")
 set   (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
                                            "ggml: max. batch size for using peer access")
 option(GGML_CUDA_NO_PEER_COPY               "ggml: do not use peer to peer copies"            OFF)
 option(GGML_CUDA_NO_VMM                     "ggml: do not try to use CUDA VMM"                OFF)
 option(GGML_CUDA_FA_ALL_QUANTS              "ggml: compile all quants for FlashAttention"     OFF)
 option(GGML_CUDA_USE_GRAPHS                 "ggml: use CUDA graphs (llama.cpp only)"          OFF)
 option(GGML_CURL                            "ggml: use libcurl to download model from an URL" OFF)
 option(GGML_HIPBLAS                         "ggml: use hipBLAS"                               OFF)
 option(GGML_HIP_UMA                         "ggml: use HIP unified memory architecture"       OFF)
 option(GGML_VULKAN                          "ggml: use Vulkan"                                OFF)
 option(GGML_VULKAN_CHECK_RESULTS            "ggml: run Vulkan op checks"                      OFF)
 option(GGML_VULKAN_DEBUG                    "ggml: enable Vulkan debug output"                OFF)
 option(GGML_VULKAN_MEMORY_DEBUG             "ggml: enable Vulkan memory debug output"         OFF)
 option(GGML_VULKAN_VALIDATE                 "ggml: enable Vulkan validation"                  OFF)
 option(GGML_VULKAN_RUN_TESTS                "ggml: run Vulkan tests"                          OFF)
 option(GGML_KOMPUTE                         "ggml: use Kompute"                               OFF)
 option(GGML_METAL                           "ggml: use Metal"                                 ${GGML_METAL_DEFAULT})
 option(GGML_METAL_NDEBUG                    "ggml: disable Metal debugging"                   OFF)
 option(GGML_METAL_SHADER_DEBUG              "ggml: compile Metal with -fno-fast-math"         OFF)
 option(GGML_METAL_EMBED_LIBRARY             "ggml: embed Metal library"                       ${GGML_METAL})
 set   (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
                                            "ggml: metal minimum macOS version")
 set   (GGML_METAL_STD "" CACHE STRING       "ggml: metal standard version (-std flag)")
 option(GGML_OPENMP                          "ggml: use OpenMP"                                ON)
 option(GGML_RPC                             "ggml: use RPC"                                   OFF)
 option(GGML_SYCL                            "ggml: use SYCL"                                  OFF)
 option(GGML_SYCL_F16                        "ggml: use 16 bit floats for sycl calculations"   OFF)
 set   (GGML_SYCL_TARGET "INTEL" CACHE STRING
                                            "ggml: sycl target device")
 # extra artifacts
 option(GGML_BUILD_TESTS    "ggml: build tests"    ${GGML_STANDALONE})
 option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
 #
 # dependencies
 #
 set(CMAKE_C_STANDARD 11)
 set(CMAKE_C_STANDARD_REQUIRED true)
 if (GGML_SYCL)
    set(CMAKE_CXX_STANDARD 17)
 else()
    set(CMAKE_CXX_STANDARD 11)
 endif()
 set(CMAKE_CXX_STANDARD_REQUIRED true)
 set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
 #
 # build the library
 #
 add_subdirectory(src)
 #
 # tests and examples
 #
 if (GGML_BUILD_TESTS)
    enable_testing()
    add_subdirectory(tests)
 endif ()
 if (GGML_BUILD_EXAMPLES)
    add_subdirectory(examples)
 endif ()
 #
 # install
 #
 include(GNUInstallDirs)
 include(CMakePackageConfigHelpers)
 set(GGML_PUBLIC_HEADERS
    include/ggml.h
    include/ggml-alloc.h
    include/ggml-backend.h
    "${GGML_HEADERS_CUDA}"
    "${GGML_HEADERS_METAL}"
    "${GGML_HEADERS_EXTRA}")
 set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
 #if (GGML_METAL)
 #    set_target_properties(ggml PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/src/ggml-metal.metal")
 #endif()
 install(TARGETS ggml PUBLIC_HEADER)
 if (BUILD_SHARED_LIBS)
    install(TARGETS ggml LIBRARY)
 endif()
 if (GGML_METAL)
    install(
        FILES src/ggml-metal.metal
        PERMISSIONS
            OWNER_READ
            OWNER_WRITE
            GROUP_READ
            WORLD_READ
        DESTINATION ${CMAKE_INSTALL_BINDIR})
    if (NOT GGML_METAL_EMBED_LIBRARY)
        install(
            FILES ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
            DESTINATION ${CMAKE_INSTALL_BINDIR}
        )
    endif()
 endif()
 if (GGML_STANDALONE)
    configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ggml.pc.in
        ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc
        @ONLY)
    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc
        DESTINATION share/pkgconfig)
 endif()
--- a/ggml/cmake/FindSIMD.cmake
+++ b/ggml/cmake/FindSIMD.cmake
@ -79,22 +79,22 @@ endmacro()
 # flags are for MSVC only!
 check_sse("AVX" " ;/arch:AVX")
 if (NOT ${AVX_FOUND})
-    set(LLAMA_AVX OFF)
+    set(GGML_AVX OFF)
 else()
-    set(LLAMA_AVX ON)
+    set(GGML_AVX ON)
 endif()
 check_sse("AVX2" " ;/arch:AVX2")
 check_sse("FMA" " ;/arch:AVX2")
 if ((NOT ${AVX2_FOUND}) OR (NOT ${FMA_FOUND}))
-    set(LLAMA_AVX2 OFF)
+    set(GGML_AVX2 OFF)
 else()
-    set(LLAMA_AVX2 ON)
+    set(GGML_AVX2 ON)
 endif()
 check_sse("AVX512" " ;/arch:AVX512")
 if (NOT ${AVX512_FOUND})
-    set(LLAMA_AVX512 OFF)
+    set(GGML_AVX512 OFF)
 else()
-    set(LLAMA_AVX512 ON)
+    set(GGML_AVX512 ON)
 endif()
--- a/ggml/ggml_vk_generate_shaders.py
+++ b/ggml/ggml_vk_generate_shaders.py
--- a/ggml/include/ggml-alloc.h
+++ b/ggml/include/ggml-alloc.h
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
--- a/ggml/include/ggml-blas.h
+++ b/ggml/include/ggml-blas.h
--- a/ggml/include/ggml-cuda.h
+++ b/ggml/include/ggml-cuda.h
--- a/Show more
+++ b/Show more
`@ -10,4 +10,3 @@ More info:`

	`https://github.com/ggerganov/llama.cpp/pull/4484`	`https://github.com/ggerganov/llama.cpp/pull/4484`
	`https://github.com/ggerganov/llama.cpp/issues/4226`	`https://github.com/ggerganov/llama.cpp/issues/4226`
`@ -34,4 +34,3 @@ fi`

	`#use multiple GPUs with same max compute units`	`#use multiple GPUs with same max compute units`
	`#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0`	`#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0`
`@ -7,5 +7,3 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"`


	`.\build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 33 -s 0`	`.\build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 33 -s 0`