Merge branch 'master' into last-working-old-file-format

Commit 416f98789b by Eric Sommerlade, 2023-09-04 10:26:17 +01:00 (committed via GitHub)
129 changed files with 23447 additions and 9146 deletions


@@ -0,0 +1,44 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6
# Target the ROCm build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
FROM ${BASE_ROCM_DEV_CONTAINER} as build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH=\
gfx803 \
gfx900 \
gfx906 \
gfx908 \
gfx90a \
gfx1010 \
gfx1030 \
gfx1100 \
gfx1101 \
gfx1102
COPY requirements.txt requirements.txt
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
WORKDIR /app
COPY . .
# Set ROCm GPU architecture targets
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV LLAMA_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++
RUN make
ENTRYPOINT ["/app/.devops/tools.sh"]
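A minimal usage sketch for this image; the image tag, Dockerfile path and model path are placeholders, and `--device /dev/kfd --device /dev/dri` is the usual way to expose ROCm GPUs to a container:

```sh
# Build from the repository root, optionally narrowing the fat arch list to one GPU
docker build -t local/llama.cpp:full-rocm --build-arg ROCM_DOCKER_ARCH=gfx1030 \
    -f path/to/this.Dockerfile .

# Run the tools.sh entrypoint against a mounted model directory
docker run --device /dev/kfd --device /dev/dri -v /path/to/models:/models \
    local/llama.cpp:full-rocm --run -m /models/ggml-model-q4_0.gguf -p "Hello" -n 64
```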


@@ -0,0 +1,84 @@
# SRPM for building from source and packaging an RPM for RPM-based distros.
# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
# Built and maintained by John Boero - boeroboy@gmail.com
# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
# Notes for llama.cpp:
# 1. Tags are currently based on hash - which will not sort asciibetically.
# We need to declare standard versioning if people want to sort latest releases.
# 2. Builds for CUDA/OpenCL support are separate, with different dependencies.
# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
# Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
# It is up to the user to install the correct vendor-specific support.
Name: llama.cpp-clblast
Version: %( date "+%%Y%%m%%d" )
Release: 1%{?dist}
Summary: OpenCL Inference of LLaMA model in C/C++
License: MIT
Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
BuildRequires: coreutils make gcc-c++ git mesa-libOpenCL-devel clblast-devel
Requires: clblast
URL: https://github.com/ggerganov/llama.cpp
%define debug_package %{nil}
%define source_date_epoch_from_changelog 0
%description
OpenCL-accelerated inference of Meta's LLaMA 2 models using default options.
%prep
%setup -n llama.cpp-master
%build
make -j LLAMA_CLBLAST=1
%install
mkdir -p %{buildroot}%{_bindir}/
cp -p main %{buildroot}%{_bindir}/llamaclblast
cp -p server %{buildroot}%{_bindir}/llamaclblastserver
cp -p simple %{buildroot}%{_bindir}/llamaclblastsimple
mkdir -p %{buildroot}/usr/lib/systemd/system
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamaclblast.service
[Unit]
Description=Llama.cpp server with CLBlast (OpenCL) support.
After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
[Service]
Type=simple
EnvironmentFile=/etc/sysconfig/llama
ExecStart=/usr/bin/llamaclblastserver $LLAMA_ARGS
ExecReload=/bin/kill -s HUP $MAINPID
Restart=no
[Install]
WantedBy=default.target
EOF
mkdir -p %{buildroot}/etc/sysconfig
%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
EOF
%clean
rm -rf %{buildroot}
rm -rf %{_builddir}/*
%files
%{_bindir}/llamaclblast
%{_bindir}/llamaclblastserver
%{_bindir}/llamaclblastsimple
/usr/lib/systemd/system/llamaclblast.service
%config /etc/sysconfig/llama
%pre
%post
%preun
%postun
%changelog
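A rough local build sketch for this spec, assuming rpmdevtools is installed and that the spec file is named `llama.cpp-clblast.srpm.spec` (the actual file name may differ):

```sh
rpmdev-setuptree                              # create the ~/rpmbuild tree
spectool -g -R llama.cpp-clblast.srpm.spec    # fetch Source0 (master tarball) into SOURCES
rpmbuild -ba llama.cpp-clblast.srpm.spec      # build the source and binary RPMs
```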


@@ -0,0 +1,83 @@
# SRPM for building from source and packaging an RPM for RPM-based distros.
# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
# Built and maintained by John Boero - boeroboy@gmail.com
# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
# Notes for llama.cpp:
# 1. Tags are currently based on hash - which will not sort asciibetically.
# We need to declare standard versioning if people want to sort latest releases.
# 2. Builds for CUDA/OpenCL support are separate, with different dependencies.
# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
# Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
# It is up to the user to install the correct vendor-specific support.
Name: llama.cpp-cublas
Version: %( date "+%%Y%%m%%d" )
Release: 1%{?dist}
Summary: CUDA Inference of LLaMA model in C/C++ (cuBLAS)
License: MIT
Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
BuildRequires: coreutils make gcc-c++ git cuda-toolkit
Requires: cuda-toolkit
URL: https://github.com/ggerganov/llama.cpp
%define debug_package %{nil}
%define source_date_epoch_from_changelog 0
%description
CUDA-accelerated inference of Meta's LLaMA 2 models using default options.
%prep
%setup -n llama.cpp-master
%build
make -j LLAMA_CUBLAS=1
%install
mkdir -p %{buildroot}%{_bindir}/
cp -p main %{buildroot}%{_bindir}/llamacppcublas
cp -p server %{buildroot}%{_bindir}/llamacppcublasserver
cp -p simple %{buildroot}%{_bindir}/llamacppcublassimple
mkdir -p %{buildroot}/usr/lib/systemd/system
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacublas.service
[Unit]
Description=Llama.cpp server with cuBLAS (CUDA) support.
After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
[Service]
Type=simple
EnvironmentFile=/etc/sysconfig/llama
ExecStart=/usr/bin/llamacppcublasserver $LLAMA_ARGS
ExecReload=/bin/kill -s HUP $MAINPID
Restart=no
[Install]
WantedBy=default.target
EOF
mkdir -p %{buildroot}/etc/sysconfig
%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
EOF
%clean
rm -rf %{buildroot}
rm -rf %{_builddir}/*
%files
%{_bindir}/llamacppcublas
%{_bindir}/llamacppcublasserver
%{_bindir}/llamacppcublassimple
/usr/lib/systemd/system/llamacublas.service
%config /etc/sysconfig/llama
%pre
%post
%preun
%postun
%changelog


@@ -0,0 +1,85 @@
# SRPM for building from source and packaging an RPM for RPM-based distros.
# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
# Built and maintained by John Boero - boeroboy@gmail.com
# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
# Notes for llama.cpp:
# 1. Tags are currently based on hash - which will not sort asciibetically.
# We need to declare standard versioning if people want to sort latest releases.
# In the meantime, YYYYMMDD format will be used.
# 2. Builds for CUDA/OpenCL support are separate, with different dependencies.
# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
# Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
# It is up to the user to install the correct vendor-specific support.
Name: llama.cpp
Version: %( date "+%%Y%%m%%d" )
Release: 1%{?dist}
Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
License: MIT
Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
BuildRequires: coreutils make gcc-c++ git libstdc++-devel
Requires: libstdc++
URL: https://github.com/ggerganov/llama.cpp
%define debug_package %{nil}
%define source_date_epoch_from_changelog 0
%description
CPU inference of Meta's LLaMA 2 models using default options.
Models are not included in this package and must be downloaded separately.
%prep
%setup -n llama.cpp-master
%build
make -j
%install
mkdir -p %{buildroot}%{_bindir}/
cp -p main %{buildroot}%{_bindir}/llama
cp -p server %{buildroot}%{_bindir}/llamaserver
cp -p simple %{buildroot}%{_bindir}/llamasimple
mkdir -p %{buildroot}/usr/lib/systemd/system
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llama.service
[Unit]
Description=Llama.cpp server, CPU only (no GPU support in this build).
After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
[Service]
Type=simple
EnvironmentFile=/etc/sysconfig/llama
ExecStart=/usr/bin/llamaserver $LLAMA_ARGS
ExecReload=/bin/kill -s HUP $MAINPID
Restart=no
[Install]
WantedBy=default.target
EOF
mkdir -p %{buildroot}/etc/sysconfig
%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
EOF
%clean
rm -rf %{buildroot}
rm -rf %{_builddir}/*
%files
%{_bindir}/llama
%{_bindir}/llamaserver
%{_bindir}/llamasimple
/usr/lib/systemd/system/llama.service
%config /etc/sysconfig/llama
%pre
%post
%preun
%postun
%changelog
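Once the RPM is installed, the bundled unit can be pointed at a model and started; the model path below is only a placeholder:

```sh
# /etc/sysconfig/llama is packaged as %config, so local edits are preserved on upgrade
echo 'LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin -c 2048"' | sudo tee /etc/sysconfig/llama
sudo systemctl daemon-reload
sudo systemctl enable --now llama.service
```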


@@ -0,0 +1,44 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6
# Target the ROCm build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
FROM ${BASE_ROCM_DEV_CONTAINER} as build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH=\
gfx803 \
gfx900 \
gfx906 \
gfx908 \
gfx90a \
gfx1010 \
gfx1030 \
gfx1100 \
gfx1101 \
gfx1102
COPY requirements.txt requirements.txt
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
WORKDIR /app
COPY . .
# Set ROCm GPU architecture targets
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV LLAMA_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++
RUN make
ENTRYPOINT [ "/app/main" ]
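Because this image's entrypoint is `main` itself, arguments after the image name go straight to the binary; a run sketch with placeholder paths and tag:

```sh
docker run --device /dev/kfd --device /dev/dri -v /path/to/models:/models \
    local/llama.cpp:main-rocm -m /models/ggml-model-q4_0.gguf \
    -p "Building a website can be done in 10 simple steps:" -n 128 -ngl 32
```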


@@ -7,15 +7,12 @@ arg1="$1"
 # Shift the arguments to remove the first one
 shift
-# Join the remaining arguments into a single string
-arg2="$@"
 if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
-    python3 ./convert.py "$arg2"
+    python3 ./convert.py "$@"
 elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
-    ./quantize "$arg2"
+    ./quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
-    ./main "$arg2"
+    ./main "$@"
 elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
     echo "Converting PTH to GGML..."
     for i in `ls $1/$2/ggml-model-f16.bin*`; do
@@ -27,7 +24,7 @@ elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
     fi
     done
 elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
-    ./server "$arg2"
+    ./server "$@"
 else
     echo "Unknown command: $arg1"
     echo "Available commands: "
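Passing `"$@"` instead of the collapsed `$arg2` keeps each remaining argument as its own word, so quoted prompts survive; for example (model path is a placeholder):

```sh
./tools.sh --run -m /models/ggml-model-q4_0.gguf -p "a prompt with spaces" -n 64
```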


@@ -5,14 +5,7 @@
 .vscode/
 .DS_Store
-build/
-build-em/
-build-debug/
-build-release/
-build-static/
-build-no-accel/
-build-sanitize-addr/
-build-sanitize-thread/
+build*/
 models/*


@@ -17,3 +17,6 @@ indent_style = tab
 [prompts/*.txt]
 insert_final_newline = unset
+
+[examples/server/public/*]
+indent_size = 2


@ -18,7 +18,6 @@ on:
env: env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }} BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
GGML_NLOOP: 3 GGML_NLOOP: 3
GGML_NITER: 1
GGML_N_THREADS: 1 GGML_N_THREADS: 1
jobs: jobs:
@ -41,6 +40,12 @@ jobs:
run: | run: |
CC=gcc-8 make CC=gcc-8 make
- name: Test
id: make_test
run: |
CC=gcc-8 make tests
make test
ubuntu-latest-cmake: ubuntu-latest-cmake:
runs-on: ubuntu-latest runs-on: ubuntu-latest
@ -157,6 +162,12 @@ jobs:
run: | run: |
make make
- name: Test
id: make_test
run: |
make tests
make test
macOS-latest-cmake: macOS-latest-cmake:
runs-on: macos-latest runs-on: macos-latest
@ -291,24 +302,32 @@ jobs:
cd build cd build
ctest -C Release --verbose --timeout 900 ctest -C Release --verbose --timeout 900
- name: Get commit hash - name: Determine tag name
id: commit id: tag
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} shell: bash
uses: pr-mpt/actions-commit-hash@v2 run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
else
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi
- name: Pack artifacts - name: Pack artifacts
id: pack_artifacts id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: | run: |
Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\* 7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
- name: Upload artifacts - name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v3 uses: actions/upload-artifact@v3
with: with:
path: | path: |
llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip
windows-latest-cmake-cublas: windows-latest-cmake-cublas:
runs-on: windows-latest runs-on: windows-latest
@ -338,23 +357,31 @@ jobs:
cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON
cmake --build . --config Release cmake --build . --config Release
- name: Get commit hash - name: Determine tag name
id: commit id: tag
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} shell: bash
uses: pr-mpt/actions-commit-hash@v2 run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
else
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi
- name: Pack artifacts - name: Pack artifacts
id: pack_artifacts id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: | run: |
7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\* 7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
- name: Upload artifacts - name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v3 uses: actions/upload-artifact@v3
with: with:
path: | path: |
llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
- name: Copy and pack Cuda runtime - name: Copy and pack Cuda runtime
if: ${{ matrix.cuda == '12.1.0' }} if: ${{ matrix.cuda == '12.1.0' }}
@ -400,21 +427,34 @@ jobs:
- windows-latest-cmake-cublas - windows-latest-cmake-cublas
steps: steps:
- name: Clone
id: checkout
uses: actions/checkout@v1
- name: Determine tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
else
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi
- name: Download artifacts - name: Download artifacts
id: download-artifact id: download-artifact
uses: actions/download-artifact@v3 uses: actions/download-artifact@v3
- name: Get commit hash
id: commit
uses: pr-mpt/actions-commit-hash@v2
- name: Create release - name: Create release
id: create_release id: create_release
uses: anzz1/action-create-release@v1 uses: anzz1/action-create-release@v1
env: env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with: with:
tag_name: ${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }} tag_name: ${{ steps.tag.outputs.name }}
- name: Upload release - name: Upload release
id: upload_release id: upload_release
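The new `Determine tag name` step can be reproduced locally to predict the artifact/release name; the same logic outside CI (here BRANCH_NAME is derived from the current checkout):

```sh
BRANCH_NAME=$(git rev-parse --abbrev-ref HEAD)
BUILD_NUMBER=$(git rev-list --count HEAD)
SHORT_HASH=$(git rev-parse --short=7 HEAD)
if [ "$BRANCH_NAME" = "master" ]; then
    echo "b${BUILD_NUMBER}"                               # e.g. b1041
else
    SAFE_NAME=$(echo "$BRANCH_NAME" | tr '/' '-')
    echo "${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}"    # branch builds keep the short hash
fi
```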

.github/workflows/code-coverage.yml (new file)

@@ -0,0 +1,36 @@
name: Code Coverage
on: [push, pull_request]
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
jobs:
run:
runs-on: ubuntu-20.04
steps:
- name: Checkout
uses: actions/checkout@v3
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential gcc-8 lcov
- name: Build
run: CC=gcc-8 make -j LLAMA_CODE_COVERAGE=1 tests
- name: Run tests
run: CC=gcc-8 make test
- name: Generate coverage report
run: |
make coverage
make lcov-report
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
with:
files: lcov-report/coverage.info
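The same coverage flow should work locally with the new Makefile targets, assuming `lcov`/`genhtml` are installed:

```sh
make clean
make -j LLAMA_CODE_COVERAGE=1 tests   # build instrumented test binaries
make test                             # run them, emitting .gcno/.gcda data
make lcov-report                      # capture into lcov-report/coverage.info and HTML
```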

.github/workflows/gguf-publish.yml (new file)

@@ -0,0 +1,43 @@
# This workflow will upload a Python Package using Twine when a GGUF release is created
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
# See `gguf-py/README.md` for how to make a release.
# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.
name: Upload Python Package
on:
workflow_dispatch:
push:
# Pattern matched against refs/tags
tags:
- 'gguf-v*' # Push events to every version tag
jobs:
deploy:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: '3.9.x'
- name: Install dependencies
run: |
cd gguf-py
python -m pip install poetry
poetry install
- name: Build package
run: poetry build
- name: Publish package
uses: pypa/gh-action-pypi-publish@release/v1
with:
password: ${{ secrets.PYPI_API_TOKEN }}
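A local dry run of the packaging step mirrors the workflow (publishing itself still needs the `PYPI_API_TOKEN` secret in CI):

```sh
cd gguf-py
python -m pip install poetry
poetry install
poetry build   # produces dist/*.whl and dist/*.tar.gz
```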

.gitignore

@ -1,7 +1,15 @@
*.o *.o
*.a *.a
*.so *.so
*.gguf
*.bin *.bin
*.exe
*.dll
*.log
*.gcov
*.gcno
*.gcda
*.dot
.DS_Store .DS_Store
.build/ .build/
.cache/ .cache/
@ -13,20 +21,10 @@
.vs/ .vs/
.vscode/ .vscode/
build/ lcov-report/
build-em/ gcovr-report/
build-debug/
build-release/ build*/
build-ci-debug/
build-ci-release/
build-static/
build-cublas/
build-opencl/
build-metal/
build-mpi/
build-no-accel/
build-sanitize-addr/
build-sanitize-thread/
out/ out/
tmp/ tmp/
@ -47,14 +45,20 @@ models-mnt
/server /server
/Pipfile /Pipfile
/embd-input-test /embd-input-test
/gguf
/gguf-llama-simple
/libllama.so /libllama.so
/llama-bench /llama-bench
/baby-llama
/beam-search
/save-load-state
build-info.h build-info.h
arm_neon.h arm_neon.h
compile_commands.json compile_commands.json
CMakeSettings.json CMakeSettings.json
__pycache__ __pycache__
dist
zig-out/ zig-out/
zig-cache/ zig-cache/
@ -65,18 +69,18 @@ perf-*.txt
examples/jeopardy/results.txt examples/jeopardy/results.txt
pyproject.toml
poetry.lock poetry.lock
poetry.toml poetry.toml
# Test binaries # Test binaries
tests/test-grammar-parser tests/test-grammar-parser
tests/test-llama-grammar
tests/test-double-float tests/test-double-float
tests/test-grad0 tests/test-grad0
tests/test-opt tests/test-opt
tests/test-quantize-fns tests/test-quantize-fns
tests/test-quantize-perf tests/test-quantize-perf
tests/test-sampling tests/test-sampling
tests/test-tokenizer-0 tests/test-tokenizer-0-llama
tests/test-tokenizer-0-falcon
tests/test-tokenizer-1


@ -74,6 +74,7 @@ set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kern
set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels") set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF) option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K") set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
option(LLAMA_CLBLAST "llama: use CLBlast" OFF) option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
option(LLAMA_METAL "llama: use Metal" OFF) option(LLAMA_METAL "llama: use Metal" OFF)
option(LLAMA_MPI "llama: use MPI" OFF) option(LLAMA_MPI "llama: use MPI" OFF)
@ -300,7 +301,7 @@ if (LLAMA_METAL)
set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h) set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)
add_compile_definitions(GGML_USE_METAL) add_compile_definitions(GGML_USE_METAL)
add_compile_definitions(GGML_METAL_NDEBUG) #add_compile_definitions(GGML_METAL_NDEBUG)
# get full path to the file # get full path to the file
#add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/") #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
@ -352,6 +353,43 @@ if (LLAMA_CLBLAST)
endif() endif()
endif() endif()
if (LLAMA_HIPBLAS)
list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang")
endif()
if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
endif()
find_package(hip)
find_package(hipblas)
find_package(rocblas)
if (${hipblas_FOUND} AND ${hip_FOUND})
message(STATUS "HIP and hipBLAS found")
add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
if (LLAMA_CUDA_FORCE_DMMV)
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV)
endif()
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
target_compile_definitions(ggml-rocm PRIVATE CC_TURING=1000000000)
set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
if (LLAMA_STATIC)
message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
endif()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ggml-rocm)
else()
message(WARNING "hipBLAS or HIP not found. Try setting CMAKE_PREFIX_PATH=/opt/rocm")
endif()
endif()
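A configuration sketch for the new `LLAMA_HIPBLAS` option, following the compiler hints in the warnings above and assuming a default ROCm install under /opt/rocm:

```sh
mkdir build && cd build
CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ \
    cmake .. -DLLAMA_HIPBLAS=ON -DCMAKE_PREFIX_PATH=/opt/rocm
cmake --build . --config Release
```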
if (LLAMA_ALL_WARNINGS) if (LLAMA_ALL_WARNINGS)
if (NOT MSVC) if (NOT MSVC)
set(c_flags set(c_flags
@ -364,6 +402,8 @@ if (LLAMA_ALL_WARNINGS)
-Wstrict-prototypes -Wstrict-prototypes
-Wpointer-arith -Wpointer-arith
-Wmissing-prototypes -Wmissing-prototypes
-Werror=implicit-int
-Wno-unused-function
) )
set(cxx_flags set(cxx_flags
-Wall -Wall
@ -373,6 +413,10 @@ if (LLAMA_ALL_WARNINGS)
-Wno-unused-function -Wno-unused-function
-Wno-multichar -Wno-multichar
) )
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
# g++ only
set(cxx_flags ${cxx_flags} -Wno-format-truncation)
endif()
else() else()
# todo : msvc # todo : msvc
endif() endif()
@ -503,9 +547,11 @@ else()
endif() endif()
# #
# Build libraries # libraries
# #
# ggml
add_library(ggml OBJECT add_library(ggml OBJECT
ggml.c ggml.c
ggml.h ggml.h
@ -530,10 +576,11 @@ if (BUILD_SHARED_LIBS)
install(TARGETS ggml_shared LIBRARY) install(TARGETS ggml_shared LIBRARY)
endif() endif()
# llama
add_library(llama add_library(llama
llama.cpp llama.cpp
llama.h llama.h
llama-util.h
) )
target_include_directories(llama PUBLIC .) target_include_directories(llama PUBLIC .)
@ -552,6 +599,10 @@ if (BUILD_SHARED_LIBS)
install(TARGETS llama LIBRARY) install(TARGETS llama LIBRARY)
endif() endif()
#
# install
#
include(GNUInstallDirs) include(GNUInstallDirs)
install( install(
FILES convert.py FILES convert.py
@ -590,6 +641,8 @@ endif()
# programs, examples and tests # programs, examples and tests
# #
add_subdirectory(common)
if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION) if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
include(CTest) include(CTest)
add_subdirectory(tests) add_subdirectory(tests)

Makefile

@ -1,11 +1,43 @@
# Define the default target now so that it is always the first target # Define the default target now so that it is always the first target
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple server embd-input-test llama-bench BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam-search tests/test-c.o
# Binaries only useful for tests # Binaries only useful for tests
TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0 TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1
# Code coverage output files
COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
default: $(BUILD_TARGETS) default: $(BUILD_TARGETS)
test:
@echo "Running tests..."
@for test_target in $(TEST_TARGETS); do \
if [ "$$test_target" = "tests/test-tokenizer-0-llama" ]; then \
./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
continue; \
elif [ "$$test_target" = "tests/test-tokenizer-1" ]; then \
continue; \
else \
./$$test_target; \
fi; \
done
@echo "All tests have been run."
all: $(BUILD_TARGETS) $(TEST_TARGETS)
coverage: ## Run code coverage
gcov -pb tests/*.cpp
lcov-report: coverage ## Generate lcov report
mkdir -p lcov-report
lcov --capture --directory . --output-file lcov-report/coverage.info
genhtml lcov-report/coverage.info --output-directory lcov-report
gcovr-report: coverage ## Generate gcovr report
mkdir -p gcovr-report
gcovr --root . --html --html-details --output gcovr-report/coverage.html
ifndef UNAME_S ifndef UNAME_S
UNAME_S := $(shell uname -s) UNAME_S := $(shell uname -s)
endif endif
@ -18,6 +50,11 @@ ifndef UNAME_M
UNAME_M := $(shell uname -m) UNAME_M := $(shell uname -m)
endif endif
ifdef RISCV_CROSS_COMPILE
CC := riscv64-unknown-linux-gnu-gcc
CXX := riscv64-unknown-linux-gnu-g++
endif
CCV := $(shell $(CC) --version | head -n 1) CCV := $(shell $(CC) --version | head -n 1)
CXXV := $(shell $(CXX) --version | head -n 1) CXXV := $(shell $(CXX) --version | head -n 1)
@ -45,53 +82,48 @@ OPT = -Ofast
else else
OPT = -O3 OPT = -O3
endif endif
CFLAGS = -I. $(OPT) -std=c11 -fPIC MK_CPPFLAGS = -I. -Icommon
CXXFLAGS = -I. -I./examples $(OPT) -std=c++11 -fPIC MK_CFLAGS = $(CPPFLAGS) $(OPT) -std=c11 -fPIC
LDFLAGS = MK_CXXFLAGS = $(CPPFLAGS) $(OPT) -std=c++11 -fPIC
MK_LDFLAGS =
ifdef LLAMA_DEBUG ifdef LLAMA_DEBUG
CFLAGS += -O0 -g MK_CFLAGS += -O0 -g
CXXFLAGS += -O0 -g MK_CXXFLAGS += -O0 -g
LDFLAGS += -g MK_LDFLAGS += -g
else else
CFLAGS += -DNDEBUG MK_CPPFLAGS += -DNDEBUG
CXXFLAGS += -DNDEBUG
endif endif
ifdef LLAMA_SERVER_VERBOSE ifdef LLAMA_SERVER_VERBOSE
CXXFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE) MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
endif endif
ifdef LLAMA_CODE_COVERAGE
CXXFLAGS += -fprofile-arcs -ftest-coverage -dumpbase ''
endif
ifdef LLAMA_DISABLE_LOGS
CFLAGS += -DLOG_DISABLE_LOGS
CXXFLAGS += -DLOG_DISABLE_LOGS
endif # LLAMA_DISABLE_LOGS
# warnings # warnings
CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \ MK_CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \
-Wmissing-prototypes -Wmissing-prototypes -Werror=implicit-int -Wno-unused-function
CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar MK_CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
ifeq '' '$(findstring clang++,$(CXX))'
# g++ only
CXXFLAGS += -Wno-format-truncation
endif
# OS specific # OS specific
# TODO: support Windows # TODO: support Windows
ifeq ($(UNAME_S),Linux) ifneq '' '$(filter $(UNAME_S),Linux Darwin FreeBSD NetBSD OpenBSD Haiku)'
CFLAGS += -pthread MK_CFLAGS += -pthread
CXXFLAGS += -pthread MK_CXXFLAGS += -pthread
endif
ifeq ($(UNAME_S),Darwin)
CFLAGS += -pthread
CXXFLAGS += -pthread
endif
ifeq ($(UNAME_S),FreeBSD)
CFLAGS += -pthread
CXXFLAGS += -pthread
endif
ifeq ($(UNAME_S),NetBSD)
CFLAGS += -pthread
CXXFLAGS += -pthread
endif
ifeq ($(UNAME_S),OpenBSD)
CFLAGS += -pthread
CXXFLAGS += -pthread
endif
ifeq ($(UNAME_S),Haiku)
CFLAGS += -pthread
CXXFLAGS += -pthread
endif endif
# detect Windows # detect Windows
@ -117,72 +149,84 @@ ifeq ($(_WIN32),1)
endif endif
ifdef LLAMA_GPROF ifdef LLAMA_GPROF
CFLAGS += -pg MK_CFLAGS += -pg
CXXFLAGS += -pg MK_CXXFLAGS += -pg
endif endif
ifdef LLAMA_PERF ifdef LLAMA_PERF
CFLAGS += -DGGML_PERF MK_CPPFLAGS += -DGGML_PERF
CXXFLAGS += -DGGML_PERF
endif endif
# Architecture specific # Architecture specific
# TODO: probably these flags need to be tweaked on some architectures # TODO: probably these flags need to be tweaked on some architectures
# feel free to update the Makefile for your architecture and send a pull request or issue # feel free to update the Makefile for your architecture and send a pull request or issue
ifndef RISCV
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64)) ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
# Use all CPU extensions that are available: # Use all CPU extensions that are available:
CFLAGS += -march=native -mtune=native MK_CFLAGS += -march=native -mtune=native
CXXFLAGS += -march=native -mtune=native MK_CXXFLAGS += -march=native -mtune=native
# Usage AVX-only # Usage AVX-only
#CFLAGS += -mfma -mf16c -mavx #MK_CFLAGS += -mfma -mf16c -mavx
#CXXFLAGS += -mfma -mf16c -mavx #MK_CXXFLAGS += -mfma -mf16c -mavx
# Usage SSSE3-only (Not is SSE3!) # Usage SSSE3-only (Not is SSE3!)
#CFLAGS += -mssse3 #MK_CFLAGS += -mssse3
#CXXFLAGS += -mssse3 #MK_CXXFLAGS += -mssse3
endif
# The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
# https://github.com/ggerganov/llama.cpp/issues/2922
ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))'
CFLAGS += -Xassembler -muse-unaligned-vector-move
CXXFLAGS += -Xassembler -muse-unaligned-vector-move
endif endif
ifneq ($(filter aarch64%,$(UNAME_M)),) ifneq ($(filter aarch64%,$(UNAME_M)),)
# Apple M1, M2, etc. # Apple M1, M2, etc.
# Raspberry Pi 3, 4, Zero 2 (64-bit) # Raspberry Pi 3, 4, Zero 2 (64-bit)
CFLAGS += -mcpu=native MK_CFLAGS += -mcpu=native
CXXFLAGS += -mcpu=native MK_CXXFLAGS += -mcpu=native
endif endif
ifneq ($(filter armv6%,$(UNAME_M)),) ifneq ($(filter armv6%,$(UNAME_M)),)
# Raspberry Pi 1, Zero # Raspberry Pi 1, Zero
CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access MK_CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
MK_CXXFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
endif endif
ifneq ($(filter armv7%,$(UNAME_M)),) ifneq ($(filter armv7%,$(UNAME_M)),)
# Raspberry Pi 2 # Raspberry Pi 2
CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations MK_CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
MK_CXXFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
endif endif
ifneq ($(filter armv8%,$(UNAME_M)),) ifneq ($(filter armv8%,$(UNAME_M)),)
# Raspberry Pi 3, 4, Zero 2 (32-bit) # Raspberry Pi 3, 4, Zero 2 (32-bit)
CFLAGS += -mfp16-format=ieee -mno-unaligned-access MK_CFLAGS += -mfp16-format=ieee -mno-unaligned-access
MK_CXXFLAGS += -mfp16-format=ieee -mno-unaligned-access
endif endif
ifneq ($(filter ppc64%,$(UNAME_M)),) ifneq ($(filter ppc64%,$(UNAME_M)),)
POWER9_M := $(shell grep "POWER9" /proc/cpuinfo) POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
ifneq (,$(findstring POWER9,$(POWER9_M))) ifneq (,$(findstring POWER9,$(POWER9_M)))
CFLAGS += -mcpu=power9 MK_CFLAGS += -mcpu=power9
CXXFLAGS += -mcpu=power9 MK_CXXFLAGS += -mcpu=power9
endif
# Require c++23's std::byteswap for big-endian support.
ifeq ($(UNAME_M),ppc64)
CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN
endif endif
endif endif
else
CFLAGS += -march=rv64gcv -mabi=lp64d
CXXFLAGS += -march=rv64gcv -mabi=lp64d
endif
ifndef LLAMA_NO_K_QUANTS ifndef LLAMA_NO_K_QUANTS
CFLAGS += -DGGML_USE_K_QUANTS MK_CPPFLAGS += -DGGML_USE_K_QUANTS
CXXFLAGS += -DGGML_USE_K_QUANTS
OBJS += k_quants.o OBJS += k_quants.o
ifdef LLAMA_QKK_64 ifdef LLAMA_QKK_64
CFLAGS += -DGGML_QKK_64 MK_CPPFLAGS += -DGGML_QKK_64
CXXFLAGS += -DGGML_QKK_64
endif endif
endif endif
@ -190,31 +234,32 @@ ifndef LLAMA_NO_ACCELERATE
# Mac M1 - include Accelerate framework. # Mac M1 - include Accelerate framework.
# `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time). # `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time).
ifeq ($(UNAME_S),Darwin) ifeq ($(UNAME_S),Darwin)
CFLAGS += -DGGML_USE_ACCELERATE MK_CPPFLAGS += -DGGML_USE_ACCELERATE
LDFLAGS += -framework Accelerate MK_LDFLAGS += -framework Accelerate
endif endif
endif # LLAMA_NO_ACCELERATE endif # LLAMA_NO_ACCELERATE
ifdef LLAMA_MPI ifdef LLAMA_MPI
CFLAGS += -DGGML_USE_MPI -Wno-cast-qual MK_CPPFLAGS += -DGGML_USE_MPI
CXXFLAGS += -DGGML_USE_MPI -Wno-cast-qual MK_CFLAGS += -Wno-cast-qual
MK_CXXFLAGS += -Wno-cast-qual
OBJS += ggml-mpi.o OBJS += ggml-mpi.o
endif # LLAMA_MPI endif # LLAMA_MPI
ifdef LLAMA_OPENBLAS ifdef LLAMA_OPENBLAS
CFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags openblas) MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
LDFLAGS += $(shell pkg-config --libs openblas) MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
MK_LDFLAGS += $(shell pkg-config --libs openblas)
endif # LLAMA_OPENBLAS endif # LLAMA_OPENBLAS
ifdef LLAMA_BLIS ifdef LLAMA_BLIS
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
LDFLAGS += -lblis -L/usr/local/lib MK_LDFLAGS += -lblis -L/usr/local/lib
endif # LLAMA_BLIS endif # LLAMA_BLIS
ifdef LLAMA_CUBLAS ifdef LLAMA_CUBLAS
CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include MK_LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
OBJS += ggml-cuda.o OBJS += ggml-cuda.o
NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
ifdef LLAMA_CUDA_NVCC ifdef LLAMA_CUDA_NVCC
@ -265,14 +310,15 @@ endif # LLAMA_CUBLAS
ifdef LLAMA_CLBLAST ifdef LLAMA_CLBLAST
CFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags clblast OpenCL) MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
CXXFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags clblast OpenCL) MK_CFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
# Mac provides OpenCL as a framework # Mac provides OpenCL as a framework
ifeq ($(UNAME_S),Darwin) ifeq ($(UNAME_S),Darwin)
LDFLAGS += -lclblast -framework OpenCL MK_LDFLAGS += -lclblast -framework OpenCL
else else
LDFLAGS += $(shell pkg-config --libs clblast OpenCL) MK_LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
endif endif
OBJS += ggml-opencl.o OBJS += ggml-opencl.o
@ -280,11 +326,33 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
$(CXX) $(CXXFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
endif # LLAMA_CLBLAST endif # LLAMA_CLBLAST
ifdef LLAMA_HIPBLAS
ROCM_PATH ?= /opt/rocm
HIPCC ?= $(ROCM_PATH)/bin/hipcc
GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
LLAMA_CUDA_DMMV_X ?= 32
LLAMA_CUDA_MMV_Y ?= 1
LLAMA_CUDA_KQUANTS_ITER ?= 2
MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS))
HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
HIPFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
HIPFLAGS += -DCC_TURING=1000000000
ifdef LLAMA_CUDA_FORCE_DMMV
HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
endif # LLAMA_CUDA_FORCE_DMMV
OBJS += ggml-cuda.o
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
endif # LLAMA_HIPBLAS
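The Makefile path needs only environment variables; a sketch, where the GPU_TARGETS override is optional because it defaults to whatever `amdgpu-arch` detects:

```sh
CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ \
    make -j LLAMA_HIPBLAS=1 GPU_TARGETS=gfx1030
```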
ifdef LLAMA_METAL ifdef LLAMA_METAL
CFLAGS += -DGGML_USE_METAL -DGGML_METAL_NDEBUG MK_CPPFLAGS += -DGGML_USE_METAL #-DGGML_METAL_NDEBUG
CXXFLAGS += -DGGML_USE_METAL MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit
LDFLAGS += -framework Foundation -framework Metal -framework MetalKit OBJS += ggml-metal.o
OBJS += ggml-metal.o
endif # LLAMA_METAL endif # LLAMA_METAL
ifdef LLAMA_METAL ifdef LLAMA_METAL
@ -297,11 +365,17 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
$(CC) $(CFLAGS) -c $< -o $@ $(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_MPI endif # LLAMA_MPI
ifdef LLAMA_NO_K_QUANTS ifndef LLAMA_NO_K_QUANTS
k_quants.o: k_quants.c k_quants.h k_quants.o: k_quants.c k_quants.h
$(CC) $(CFLAGS) -c $< -o $@ $(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_NO_K_QUANTS endif # LLAMA_NO_K_QUANTS
# combine build flags with cmdline overrides
override CPPFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS)
override CFLAGS := $(MK_CFLAGS) $(CFLAGS)
override CXXFLAGS := $(MK_CXXFLAGS) $(CXXFLAGS)
override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
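With the MK_* split, flags passed on the command line are now appended to the defaults instead of replacing them; for example:

```sh
# MK_CPPFLAGS (includes, -DNDEBUG, feature defines) are kept and the extra define is added on top
make -j CPPFLAGS="-DGGML_PERF"
```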
# #
# Print build information # Print build information
# #
@ -329,23 +403,23 @@ ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
OBJS += ggml-alloc.o OBJS += ggml-alloc.o
llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h llama-util.h llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
common.o: examples/common.cpp examples/common.h common.o: common/common.cpp common/common.h build-info.h common/log.h
$(CXX) $(CXXFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
console.o: examples/console.cpp examples/console.h console.o: common/console.cpp common/console.h
$(CXX) $(CXXFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
grammar-parser.o: examples/grammar-parser.cpp examples/grammar-parser.h grammar-parser.o: common/grammar-parser.cpp common/grammar-parser.h
$(CXX) $(CXXFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
libllama.so: llama.o ggml.o $(OBJS) libllama.so: llama.o ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
clean: clean:
rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch convert-llama2c-to-ggml embd-input-test llama-bench build-info.h $(TEST_TARGETS) rm -vrf *.o tests/*.o *.so *.dll benchmark-matmult build-info.h *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
# #
# Examples # Examples
@ -385,15 +459,33 @@ $(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-in
embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS) embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput $(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS) gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp build-info.h ggml.o llama.o $(OBJS) train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS) llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
ifneq '' '$(or $(filter clean,$(MAKECMDGOALS)),$(LLAMA_METAL))'
BUILD_TARGETS += metal
endif
ifdef LLAMA_METAL
metal: examples/metal/metal.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
endif
build-info.h: $(wildcard .git/index) scripts/build-info.sh build-info.h: $(wildcard .git/index) scripts/build-info.sh
@sh scripts/build-info.sh > $@.tmp @sh scripts/build-info.sh > $@.tmp
@if ! cmp -s $@.tmp $@; then \ @if ! cmp -s $@.tmp $@; then \
@ -415,29 +507,38 @@ benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o
vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS) vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o llama.o common.o $(OBJS) tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o common.o grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-grammar-parser: tests/test-grammar-parser.cpp examples/grammar-parser.cpp build-info.h ggml.o llama.o common.o $(OBJS) tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o common.o $(OBJS) tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-grad0: tests/test-grad0.cpp build-info.h ggml.o llama.o common.o $(OBJS) tests/test-grad0: tests/test-grad0.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-opt: tests/test-opt.cpp build-info.h ggml.o llama.o common.o $(OBJS) tests/test-opt: tests/test-opt.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o common.o $(OBJS) tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h ggml.o llama.o common.o $(OBJS) tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-sampling: tests/test-sampling.cpp build-info.h ggml.o llama.o common.o $(OBJS) tests/test-sampling: tests/test-sampling.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-tokenizer-0: tests/test-tokenizer-0.cpp build-info.h ggml.o llama.o common.o $(OBJS) tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-tokenizer-1: tests/test-tokenizer-1.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-c.o: tests/test-c.c llama.h
$(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@


@ -12,9 +12,18 @@ let package = Package(
name: "llama", name: "llama",
path: ".", path: ".",
exclude: ["ggml-metal.metal"], exclude: ["ggml-metal.metal"],
sources: ["ggml.c", "llama.cpp"], sources: [
"ggml.c",
"llama.cpp",
"ggml-alloc.c",
"k_quants.c"
],
publicHeadersPath: "spm-headers", publicHeadersPath: "spm-headers",
cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"]), .define("GGML_USE_ACCELERATE")], cSettings: [
.unsafeFlags(["-Wno-shorten-64-to-32"]),
.define("GGML_USE_K_QUANTS"),
.define("GGML_USE_ACCELERATE")
],
linkerSettings: [ linkerSettings: [
.linkedFramework("Accelerate") .linkedFramework("Accelerate")
] ]
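With `ggml-alloc.c` and `k_quants.c` added to the sources and `GGML_USE_K_QUANTS` defined, the package should build directly with SwiftPM, e.g.:

```sh
swift build -c release
```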

README.md

@ -9,11 +9,23 @@
Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
### 🚧 Incoming breaking change + refactoring: ### Hot topics
See PR https://github.com/ggerganov/llama.cpp/pull/2398 for more info. - #### IMPORTANT: Tokenizer fixes and API change (developers and projects using `llama.cpp` built-in tokenization must read): https://github.com/ggerganov/llama.cpp/pull/2810
To devs: avoid making big changes to `llama.h` / `llama.cpp` until merged - GGUFv2 adds support for 64-bit sizes + backwards compatible: https://github.com/ggerganov/llama.cpp/pull/2821
- Added support for Falcon models: https://github.com/ggerganov/llama.cpp/pull/2717
- A new file format has been introduced: [GGUF](https://github.com/ggerganov/llama.cpp/pull/2398)
Last revision compatible with the old format: [dadbed9](https://github.com/ggerganov/llama.cpp/commit/dadbed99e65252d79f81101a392d0d6497b86caa)
### Current `master` should be considered in Beta - expect some issues for a few days!
### Be prepared to re-convert and / or re-quantize your GGUF models while this notice is up!
### Issues with non-GGUF models will be considered with low priority!
---- ----
@ -33,6 +45,7 @@ To devs: avoid making big changes to `llama.h` / `llama.cpp` until merged
<li><a href="#memorydisk-requirements">Memory/Disk Requirements</a></li> <li><a href="#memorydisk-requirements">Memory/Disk Requirements</a></li>
<li><a href="#quantization">Quantization</a></li> <li><a href="#quantization">Quantization</a></li>
<li><a href="#interactive-mode">Interactive mode</a></li> <li><a href="#interactive-mode">Interactive mode</a></li>
<li><a href="#constrained-output-with-grammars">Constrained output with grammars</a></li>
<li><a href="#instruction-mode-with-alpaca">Instruction mode with Alpaca</a></li> <li><a href="#instruction-mode-with-alpaca">Instruction mode with Alpaca</a></li>
<li><a href="#using-openllama">Using OpenLLaMA</a></li> <li><a href="#using-openllama">Using OpenLLaMA</a></li>
<li><a href="#using-gpt4all">Using GPT4All</a></li> <li><a href="#using-gpt4all">Using GPT4All</a></li>
@ -59,12 +72,11 @@ The main goal of `llama.cpp` is to run the LLaMA model using 4-bit integer quant
- Apple silicon first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks - Apple silicon first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
- AVX, AVX2 and AVX512 support for x86 architectures - AVX, AVX2 and AVX512 support for x86 architectures
- Mixed F16 / F32 precision - Mixed F16 / F32 precision
- 4-bit, 5-bit and 8-bit integer quantization support - 2-bit, 3-bit, 4-bit, 5-bit, 6-bit and 8-bit integer quantization support
- Supports OpenBLAS/Apple BLAS/ARM Performance Lib/ATLAS/BLIS/Intel MKL/NVHPC/ACML/SCSL/SGIMATH and [more](https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors) in BLAS - CUDA, Metal and OpenCL GPU backend support
- cuBLAS and CLBlast support
The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022). The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022).
Since then, the project has improved significantly thanks to many contributions. This project is for educational purposes and serves Since then, the project has improved significantly thanks to many contributions. This project is mainly for educational purposes and serves
as the main playground for developing new features for the [ggml](https://github.com/ggerganov/ggml) library. as the main playground for developing new features for the [ggml](https://github.com/ggerganov/ggml) library.
**Supported platforms:** **Supported platforms:**
@ -78,6 +90,7 @@ as the main playground for developing new features for the [ggml](https://github
- [X] LLaMA 🦙 - [X] LLaMA 🦙
- [x] LLaMA 2 🦙🦙 - [x] LLaMA 2 🦙🦙
- [X] Falcon
- [X] [Alpaca](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca) - [X] [Alpaca](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca)
- [X] [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all) - [X] [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all)
- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2) - [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
@ -94,104 +107,101 @@ as the main playground for developing new features for the [ggml](https://github
- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python) - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp) - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
- Node.js: [hlhr202/llama-node](https://github.com/hlhr202/llama-node) - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp), [hlhr202/llama-node](https://github.com/hlhr202/llama-node)
- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb) - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
- Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp) - Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
- C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp) - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
- Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s) - Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
- Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj) - Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
- React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
- Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
**UI:** **UI:**
- [nat/openplayground](https://github.com/nat/openplayground) - [nat/openplayground](https://github.com/nat/openplayground)
- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) - [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui)
- [withcatai/catai](https://github.com/withcatai/catai)
--- ---
Here is a typical run using LLaMA v2 13B on M2 Ultra:
```java
$ make -j && ./main -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
I llama.cpp build info:
I UNAME_S:  Darwin
I UNAME_P:  arm
I UNAME_M:  arm64
I CFLAGS:   -I. -O3 -std=c11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -pthread -DGGML_USE_K_QUANTS -DGGML_USE_ACCELERATE
I CXXFLAGS: -I. -I./common -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_K_QUANTS
I LDFLAGS:  -framework Accelerate
I CC:       Apple clang version 14.0.3 (clang-1403.0.22.14.1)
I CXX:      Apple clang version 14.0.3 (clang-1403.0.22.14.1)
make: Nothing to be done for `default'.
main: build = 1041 (cf658ad)
main: seed  = 1692823051
llama_model_loader: loaded meta data with 16 key-value pairs and 363 tensors from models/llama-13b-v2/ggml-model-q4_0.gguf (version GGUF V1 (latest))
llama_model_loader: - type  f32:   81 tensors
llama_model_loader: - type q4_0:  281 tensors
llama_model_loader: - type q6_K:    1 tensors
llm_load_print_meta: format         = GGUF V1 (latest)
llm_load_print_meta: arch           = llama
llm_load_print_meta: vocab type     = SPM
llm_load_print_meta: n_vocab        = 32000
llm_load_print_meta: n_merges       = 0
llm_load_print_meta: n_ctx_train    = 4096
llm_load_print_meta: n_ctx          = 512
llm_load_print_meta: n_embd         = 5120
llm_load_print_meta: n_head         = 40
llm_load_print_meta: n_head_kv      = 40
llm_load_print_meta: n_layer        = 40
llm_load_print_meta: n_rot          = 128
llm_load_print_meta: n_gqa          = 1
llm_load_print_meta: f_norm_eps     = 1.0e-05
llm_load_print_meta: f_norm_rms_eps = 1.0e-05
llm_load_print_meta: n_ff           = 13824
llm_load_print_meta: freq_base      = 10000.0
llm_load_print_meta: freq_scale     = 1
llm_load_print_meta: model type     = 13B
llm_load_print_meta: model ftype    = mostly Q4_0
llm_load_print_meta: model size     = 13.02 B
llm_load_print_meta: general.name   = LLaMA v2
llm_load_print_meta: BOS token = 1 '<s>'
llm_load_print_meta: EOS token = 2 '</s>'
llm_load_print_meta: UNK token = 0 '<unk>'
llm_load_print_meta: LF token  = 13 '<0x0A>'
llm_load_tensors: ggml ctx size =    0.11 MB
llm_load_tensors: mem required  = 7024.01 MB (+  400.00 MB per state)
...................................................................................................
llama_new_context_with_model: kv self size  =  400.00 MB
llama_new_context_with_model: compute buffer total size =   75.41 MB

system_info: n_threads = 16 / 24 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 |
sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.800000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000
generate: n_ctx = 512, n_batch = 512, n_predict = 400, n_keep = 0

Building a website can be done in 10 simple steps:
Step 1: Find the right website platform.
Step 2: Choose your domain name and hosting plan.
Step 3: Design your website layout.
Step 4: Write your website content and add images.
Step 5: Install security features to protect your site from hackers or spammers
Step 6: Test your website on multiple browsers, mobile devices, operating systems etc…
Step 7: Test it again with people who are not related to you personally friends or family members will work just fine!
Step 8: Start marketing and promoting the website via social media channels or paid ads
Step 9: Analyze how many visitors have come to your site so far, what type of people visit more often than others (e.g., men vs women) etc…
Step 10: Continue to improve upon all aspects mentioned above by following trends in web design and staying up-to-date on new technologies that can enhance user experience even further!
How does a Website Work?
A website works by having pages, which are made of HTML code. This code tells your computer how to display the content on each page you visit whether its an image or text file (like PDFs). In order for someone elses browser not only be able but also want those same results when accessing any given URL; some additional steps need taken by way of programming scripts that will add functionality such as making links clickable!
The most common type is called static HTML pages because they remain unchanged over time unless modified manually (either through editing files directly or using an interface such as WordPress). They are usually served up via HTTP protocols this means anyone can access them without having any special privileges like being part of a group who is allowed into restricted areas online; however, there may still exist some limitations depending upon where one lives geographically speaking.
How to

llama_print_timings:        load time =   576.45 ms
llama_print_timings:      sample time =   283.10 ms /   400 runs   (    0.71 ms per token,  1412.91 tokens per second)
llama_print_timings: prompt eval time =   599.83 ms /    19 tokens (   31.57 ms per token,    31.68 tokens per second)
llama_print_timings:        eval time = 24513.59 ms /   399 runs   (   61.44 ms per token,    16.28 tokens per second)
llama_print_timings:       total time = 25431.49 ms
```
And here is another demo of running both LLaMA-7B and [whisper.cpp](https://github.com/ggerganov/whisper.cpp) on a single M1 Pro MacBook:
@ -291,7 +301,7 @@ When built with Metal support, you can enable GPU inference with the `--gpu-laye
Any value larger than 0 will offload the computation to the GPU. For example:
```bash
./main -m ./models/7B/ggml-model-q4_0.gguf -n 128 -ngl 1
```
### MPI Build
@ -330,7 +340,7 @@ The above will distribute the computation across 2 processes on the first host a
Finally, you're ready to run a computation using `mpirun`:
```bash
mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
```
### BLAS Build
@ -419,6 +429,35 @@ Building the program with BLAS support may lead to some performance improvements
| LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
- #### hipBLAS
This provides BLAS acceleration on HIP-supported GPUs such as AMD GPUs.
Make sure to have ROCm installed.
You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html).
Windows support is coming soon...
- Using `make`:
```bash
make LLAMA_HIPBLAS=1
```
- Using `CMake`:
```bash
mkdir build
cd build
CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ cmake .. -DLLAMA_HIPBLAS=ON
cmake --build .
```
The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
If your GPU is not officially supported, you can set the environment variable `HSA_OVERRIDE_GFX_VERSION` to the version of a similar supported GPU, for example 10.3.0 on RDNA2 or 11.0.0 on RDNA3.
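For example, a single-GPU run on a card that needs the override might look like the sketch below (the device index, override value, model path and layer count are illustrative assumptions, not recommendations):
```bash
# expose only the first GPU, report it as gfx1030 (10.3.0), and offload 32 layers (example values)
HIP_VISIBLE_DEVICES=0 HSA_OVERRIDE_GFX_VERSION=10.3.0 ./main -m ./models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 128 -ngl 32
```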
The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
| Option | Legal values | Default | Description |
|-------------------------|------------------------|---------|-------------|
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
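As a minimal sketch, these options can be passed as `make` variables alongside `LLAMA_HIPBLAS=1`; the values below are only examples and assume the Makefile forwards them as compile definitions:
```bash
# example tuning build; tweak or drop the extra variables after benchmarking on your GPU
make LLAMA_HIPBLAS=1 LLAMA_CUDA_DMMV_X=64 LLAMA_CUDA_MMV_Y=2
```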
- #### CLBlast
OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.
@ -426,6 +465,8 @@ Building the program with BLAS support may lead to some performance improvements
You will need the [OpenCL SDK](https://github.com/KhronosGroup/OpenCL-SDK).
- For Ubuntu or Debian, the packages `opencl-headers`, `ocl-icd` may be needed.
- For Windows, a pre-built SDK is available on the [OpenCL Releases](https://github.com/KhronosGroup/OpenCL-SDK/releases) page.
- <details>
<summary>Installing the OpenCL SDK from source</summary>
@ -443,10 +484,27 @@ Building the program with BLAS support may lead to some performance improvements
```
</details>
##### Installing CLBlast
Pre-built CLBlast binaries may be found on the [CLBlast Releases](https://github.com/CNugteren/CLBlast/releases) page. For Unix variants, it may also be found in your operating system's packages.
Alternatively, they may be built from source.
- <details>
<summary>Windows:</summary>
```cmd
set OPENCL_SDK_ROOT="C:/OpenCL-SDK-v2023.04.17-Win-x64"
git clone https://github.com/CNugteren/CLBlast.git
mkdir CLBlast\build
cd CLBlast\build
cmake .. -DBUILD_SHARED_LIBS=OFF -DOVERRIDE_MSVC_FLAGS_TO_MT=OFF -DTUNERS=OFF -DOPENCL_ROOT=%OPENCL_SDK_ROOT% -G "Visual Studio 17 2022" -A x64
cmake --build . --config Release
cmake --install . --prefix C:/CLBlast
```
- <details>
<summary>Unix:</summary>
```sh
git clone https://github.com/CNugteren/CLBlast.git
@ -460,21 +518,32 @@ Building the program with BLAS support may lead to some performance improvements
Where `/some/path` is where the built library will be installed (default is `/usr/local`).
</details>
##### Building Llama with CLBlast
- Build with make:
```sh
make LLAMA_CLBLAST=1
```
- CMake (Unix):
```sh
mkdir build
cd build
cmake .. -DLLAMA_CLBLAST=ON -DCLBlast_dir=/some/path
cmake --build . --config Release
```
- CMake (Windows):
```cmd
set CL_BLAST_CMAKE_PKG="C:/CLBlast/lib/cmake/CLBlast"
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
mkdir build
cd build
cmake .. -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=%CL_BLAST_CMAKE_PKG% -G "Visual Studio 17 2022" -A x64
cmake --build . --config Release
cmake --install . --prefix C:/LlamaCPP
```
##### Running Llama with CLBlast
The CLBlast build supports `--gpu-layers|-ngl` like the CUDA version does.
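For instance, a partially offloaded run might look like this (the layer count of 32 is just an example; pick a value that fits your GPU memory):
```bash
# offload 32 layers to the OpenCL device (example value)
./main -m ./models/7B/ggml-model-q4_0.gguf -n 128 -ngl 32
```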
@ -513,10 +582,10 @@ python3 convert.py models/7B/
python convert.py models/7B/ --vocabtype bpe
# quantize the model to 4-bits (using q4_0 method)
./quantize ./models/7B/ggml-model-f16.gguf ./models/7B/ggml-model-q4_0.gguf q4_0
# run the inference
./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
```
When running the larger models, make sure you have enough disk space to store all the intermediate files.
@ -536,6 +605,8 @@ As the models are currently fully loaded into memory, you will need adequate dis
Several quantization methods are supported. They differ in the resulting model disk size and inference speed.
*(outdated)*
| Model | Measure      |    F16 |   Q4_0 |   Q4_1 |   Q5_0 |   Q5_1 |   Q8_0 |
|------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|
|    7B | perplexity   | 5.9066 | 6.1565 | 6.0912 | 5.9862 | 5.9481 | 5.9070 |
@ -572,7 +643,7 @@ Here is an example of a few-shot interaction, invoked with the command
./examples/chat-13B.sh
# custom arguments using a 13B model
./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
```
Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `main` example program.
@ -598,6 +669,16 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
CHAT_SAVE_DIR=./chat/bob ./examples/chat-persistent.sh
```
### Constrained output with grammars
`llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only:
```bash
./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
```
The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
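Small grammars can also be written inline; assuming the `--grammar` flag accepts a grammar string directly (as a counterpart to `--grammar-file`), a toy yes/no constraint could look like:
```bash
# constrain the reply to the literal strings "yes" or "no" with an inline GBNF grammar (illustrative)
./main -m ./models/13B/ggml-model-q4_0.gguf -n 8 --grammar 'root ::= "yes" | "no"' -p 'Question: Is the sky blue? Answer:'
```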
### Instruction mode with Alpaca
1. First, download the `ggml` Alpaca model into the `./models` folder
@ -635,6 +716,8 @@ OpenLLaMA is an openly licensed reproduction of Meta's original LLaMA model. It
### Using [GPT4All](https://github.com/nomic-ai/gpt4all)
*Note: these instructions are likely obsoleted by the GGUF update*
- Obtain the `tokenizer.model` file from LLaMA model and put it to `models`
- Obtain the `added_tokens.json` file from Alpaca model and put it to `models`
- Obtain the `gpt4all-lora-quantized.bin` file from GPT4All model and put it to `models/gpt4all-7B`
@ -678,8 +761,6 @@ python3 convert.py pygmalion-7b/ --outtype q4_1
- [LLaMA 2 7B chat](https://huggingface.co/TheBloke/Llama-2-7B-chat-GGML)
- [LLaMA 2 13B chat](https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML)
- [LLaMA 2 70B chat](https://huggingface.co/TheBloke/Llama-2-70B-chat-GGML)
### Verifying the model files
@ -710,7 +791,7 @@ If your issue is with model generation quality, then please at least scan the fo
#### How to run
1. Download/extract: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
2. Run `./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
3. Output:
```
perplexity : calculating perplexity over 655 chunks
@ -809,13 +890,13 @@ docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-
On completion, you are ready to play!
```bash
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
```
or with a light image:
```bash
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
```
### Docker With CUDA
@ -846,8 +927,8 @@ The resulting images, are essentially the same as the non-CUDA images:
After building locally, usage is similar to the non-CUDA examples, but you'll need to add the `--gpus` flag. You will also want to use the `--n-gpu-layers` flag.
```bash
docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
```
### Contributing
@ -877,3 +958,4 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /
- [BLIS](./docs/BLIS.md)
- [Performance troubleshooting](./docs/token_generation_performance_tips.md)
- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
- [GBNF grammars](./grammars/README.md)
ci/run.sh Normal file → Executable file
View file
@ -159,17 +159,17 @@ function gg_run_open_llama_3b_v2 {
python3 ../convert.py ${path_models}
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
model_q4_0="${path_models}/ggml-model-q4_0.gguf"
model_q4_1="${path_models}/ggml-model-q4_1.gguf"
model_q5_0="${path_models}/ggml-model-q5_0.gguf"
model_q5_1="${path_models}/ggml-model-q5_1.gguf"
model_q2_k="${path_models}/ggml-model-q2_k.gguf"
model_q3_k="${path_models}/ggml-model-q3_k.gguf"
model_q4_k="${path_models}/ggml-model-q4_k.gguf"
model_q5_k="${path_models}/ggml-model-q5_k.gguf"
model_q6_k="${path_models}/ggml-model-q6_k.gguf"
wiki_test_60="${path_wiki}/wiki.test-60.raw"
@ -196,17 +196,17 @@ function gg_run_open_llama_3b_v2 {
(time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
function check_ppl {
qnt="$1"
@ -233,6 +233,48 @@ function gg_run_open_llama_3b_v2 {
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
# lora
function compare_ppl {
qnt="$1"
ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
printf ' - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2"
return 20
fi
printf ' - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
return 0
}
path_lora="../models-mnt/open-llama/3B-v2/lora"
path_shakespeare="../models-mnt/shakespeare"
shakespeare="${path_shakespeare}/shakespeare.txt"
lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_config.json
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_model.bin
gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/shakespeare.txt
python3 ../convert-lora-to-ggml.py ${path_lora}
# f16
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
# q8_0
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
# q8_0 + f16 lora-base
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
set +e
}
@ -242,6 +284,7 @@ function gg_sum_open_llama_3b_v2 {
gg_printf 'OpenLLaMA 3B-v2:\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
@ -253,6 +296,11 @@ function gg_sum_open_llama_3b_v2 {
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
}
# open_llama_7b_v2
@ -285,17 +333,17 @@ function gg_run_open_llama_7b_v2 {
python3 ../convert.py ${path_models}
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
model_q4_0="${path_models}/ggml-model-q4_0.gguf"
model_q4_1="${path_models}/ggml-model-q4_1.gguf"
model_q5_0="${path_models}/ggml-model-q5_0.gguf"
model_q5_1="${path_models}/ggml-model-q5_1.gguf"
model_q2_k="${path_models}/ggml-model-q2_k.gguf"
model_q3_k="${path_models}/ggml-model-q3_k.gguf"
model_q4_k="${path_models}/ggml-model-q4_k.gguf"
model_q5_k="${path_models}/ggml-model-q5_k.gguf"
model_q6_k="${path_models}/ggml-model-q6_k.gguf"
wiki_test="${path_wiki}/wiki.test.raw"
@ -310,17 +358,17 @@ function gg_run_open_llama_7b_v2 {
./bin/quantize ${model_f16} ${model_q5_k} q5_k
./bin/quantize ${model_f16} ${model_q6_k} q6_k
(time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
@ -359,6 +407,48 @@ function gg_run_open_llama_7b_v2 {
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
# lora
function compare_ppl {
qnt="$1"
ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
printf ' - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2"
return 20
fi
printf ' - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
return 0
}
path_lora="../models-mnt/open-llama/7B-v2/lora"
path_shakespeare="../models-mnt/shakespeare"
shakespeare="${path_shakespeare}/shakespeare.txt"
lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_config.json
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_model.bin
gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/shakespeare.txt
python3 ../convert-lora-to-ggml.py ${path_lora}
# f16
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
# currently not supported by the CUDA backend
# q8_0
#(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
#(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
#compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
# q8_0 + f16 lora-base
#(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
#compare_ppl "q8_0 / f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
set +e
}
@ -368,6 +458,7 @@ function gg_sum_open_llama_7b_v2 {
gg_printf 'OpenLLaMA 7B-v2:\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
@ -379,6 +470,11 @@ function gg_sum_open_llama_7b_v2 {
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
#gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
#gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
#gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
}
## main
@ -391,6 +487,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
ln -sfn ${mnt_models} ${SRC}/models-mnt
python3 -m pip install -r ${SRC}/requirements.txt
python3 -m pip install --editable gguf-py
fi
ret=0
codecov.yml Normal file
View file
@ -0,0 +1,14 @@
comment: off
coverage:
status:
project:
default:
target: auto
threshold: 0
base: auto
patch:
default:
target: auto
threshold: 0
base: auto
common/CMakeLists.txt Normal file
View file
@ -0,0 +1,20 @@
# common
set(TARGET common)
add_library(${TARGET} OBJECT
common.h
common.cpp
console.h
console.cpp
grammar-parser.h
grammar-parser.cpp
)
if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
target_include_directories(${TARGET} PUBLIC .)
target_compile_features(${TARGET} PUBLIC cxx_std_11)
target_link_libraries(${TARGET} PRIVATE llama)
common/common.cpp
View file
@ -1,15 +1,21 @@
#include "common.h" #include "common.h"
#include "build-info.h"
#include "llama.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iterator>
#include <iostream>
#include <regex>
#include <sstream>
#include <string>
#include <unordered_set>
#include <vector>
#include <cinttypes>
#if defined(__APPLE__) && defined(__MACH__)
#include <sys/types.h>
@ -18,12 +24,17 @@
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
# define NOMINMAX
#endif
#include <codecvt>
#include <locale>
#include <windows.h>
#include <fcntl.h>
#include <io.h>
#else
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <unistd.h>
#endif
@ -93,7 +104,6 @@ void process_escapes(std::string& input) {
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
bool invalid_param = false;
std::string arg;
gpt_params default_params;
const std::string arg_prefix = "--";
@ -125,8 +135,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break;
}
params.prompt = argv[i];
} else if (arg == "-e" || arg == "--escape") {
params.escape = true;
} else if (arg == "--prompt-cache") {
if (++i >= argc) {
invalid_param = true;
@ -170,18 +180,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break;
}
params.n_ctx = std::stoi(argv[i]);
} else if (arg == "--rope-freq-base") {
if (++i >= argc) {
invalid_param = true;
@ -301,13 +299,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break;
}
params.n_batch = std::stoi(argv[i]);
} else if (arg == "--keep") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_keep = std::stoi(argv[i]);
} else if (arg == "--draft") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_draft = std::stoi(argv[i]);
} else if (arg == "--chunks") {
if (++i >= argc) {
invalid_param = true;
@ -320,6 +323,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break;
}
params.model = argv[i];
} else if (arg == "-md" || arg == "--model-draft") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.model_draft = argv[i];
} else if (arg == "-a" || arg == "--alias") {
if (++i >= argc) {
invalid_param = true;
@ -400,11 +409,11 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
#else
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
#endif // GGML_USE_CUBLAS
} else if (arg == "--no-mul-mat-q" || arg == "-nommq") {
#ifdef GGML_USE_CUBLAS
params.mul_mat_q = false;
#else
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n");
#endif // GGML_USE_CUBLAS
} else if (arg == "--low-vram" || arg == "-lv") {
#ifdef GGML_USE_CUBLAS
@ -428,8 +437,30 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break;
}
params.antiprompt.push_back(argv[i]);
} else if (arg == "-ld" || arg == "--logdir") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.logdir = argv[i];
if (params.logdir.back() != DIRECTORY_SEPARATOR) {
params.logdir += DIRECTORY_SEPARATOR;
}
} else if (arg == "--perplexity") {
params.perplexity = true;
} else if (arg == "--ppl-stride") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.ppl_stride = std::stoi(argv[i]);
} else if (arg == "--ppl-output-type") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.ppl_output_type = std::stoi(argv[i]);
} else if (arg == "--hellaswag") {
params.hellaswag = true;
} else if (arg == "--hellaswag-tasks") {
@ -439,7 +470,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
}
params.hellaswag_tasks = std::stoi(argv[i]);
} else if (arg == "--ignore-eos") {
params.ignore_eos = true;
} else if (arg == "--no-penalize-nl") {
params.penalize_nl = false;
} else if (arg == "-l" || arg == "--logit-bias") {
@ -463,6 +494,9 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
}
} else if (arg == "-h" || arg == "--help") {
gpt_print_usage(argc, argv, default_params);
#ifndef LOG_DISABLE_LOGS
log_print_usage();
#endif // LOG_DISABLE_LOGS
exit(0);
} else if (arg == "--random-prompt") {
params.random_prompt = true;
@ -502,6 +536,25 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
std::istreambuf_iterator<char>(),
std::back_inserter(params.grammar)
);
#ifndef LOG_DISABLE_LOGS
// Parse args for logging parameters
} else if ( log_param_single_parse( argv[i] ) ) {
// Do nothing, log_param_single_parse automatically does its thing
// and returns if a match was found and parsed.
} else if ( log_param_pair_parse( /*check_but_dont_parse*/ true, argv[i] ) ) {
// We have a matching known parameter requiring an argument,
// now we need to check if there is anything after this argv
// and flag invalid_param or parse it.
if (++i >= argc) {
invalid_param = true;
break;
}
if( !log_param_pair_parse( /*check_but_dont_parse*/ false, argv[i-1], argv[i]) ) {
invalid_param = true;
break;
}
// End of Parse args for logging parameters
#endif // LOG_DISABLE_LOGS
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
gpt_print_usage(argc, argv, default_params);
@ -521,7 +574,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
exit(1);
}
if (params.escape) {
process_escapes(params.prompt);
process_escapes(params.input_prefix);
process_escapes(params.input_suffix);
@ -547,7 +600,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stdout, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
fprintf(stdout, " -p PROMPT, --prompt PROMPT\n");
fprintf(stdout, " prompt to start generation with (default: empty)\n");
fprintf(stdout, " -e, --escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
fprintf(stdout, " --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n");
fprintf(stdout, " --prompt-cache-all if specified, saves user input and generations to cache as well.\n");
fprintf(stdout, " not supported with --interactive or other interactive options\n");
@ -561,8 +614,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stdout, " -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
fprintf(stdout, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
fprintf(stdout, " --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
fprintf(stdout, " --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
fprintf(stdout, " --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
@ -599,6 +650,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stdout, " --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
fprintf(stdout, " --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
fprintf(stdout, " --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
fprintf(stdout, " --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
fprintf(stdout, " --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
if (llama_mlock_supported()) {
fprintf(stdout, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
@ -614,11 +666,13 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stdout, " number of layers to store in VRAM\n"); fprintf(stdout, " number of layers to store in VRAM\n");
fprintf(stdout, " -ts SPLIT --tensor-split SPLIT\n"); fprintf(stdout, " -ts SPLIT --tensor-split SPLIT\n");
fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" ); fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n" ); fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n");
fprintf(stdout, " -mmq, --mul-mat-q use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n" ); #ifdef GGML_USE_CUBLAS
fprintf(stdout, " Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n" ); fprintf(stdout, " -nommq, --no-mul-mat-q\n");
fprintf(stdout, " is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n" ); fprintf(stdout, " use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
fprintf(stdout, " Not recommended since this is both slower and uses more VRAM.\n");
#endif // GGML_USE_CUBLAS
#endif
fprintf(stdout, " --mtest compute maximum memory usage\n");
fprintf(stdout, " --export export the computation graph to 'llama.ggml'\n");
@ -628,6 +682,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stdout, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); fprintf(stdout, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
fprintf(stdout, " -m FNAME, --model FNAME\n"); fprintf(stdout, " -m FNAME, --model FNAME\n");
fprintf(stdout, " model path (default: %s)\n", params.model.c_str()); fprintf(stdout, " model path (default: %s)\n", params.model.c_str());
fprintf(stdout, " -md FNAME, --model-draft FNAME\n");
fprintf(stdout, " draft model for speculative decoding (default: %s)\n", params.model.c_str());
fprintf(stdout, " -ld LOGDIR, --logdir LOGDIR\n");
fprintf(stdout, " path under which to save YAML logs (no logging if unset)\n");
fprintf(stdout, "\n"); fprintf(stdout, "\n");
} }
@ -650,24 +708,15 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
return "The"; return "The";
} }
// TODO: not great allocating this every time //
std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) { // Model utils
// initialize to prompt numer of chars, since n_tokens <= n_prompt_chars //
std::vector<llama_token> res(text.size() + (int) add_bos);
const int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
assert(n >= 0);
res.resize(n);
return res;
}
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
auto lparams = llama_context_default_params();
lparams.n_ctx = params.n_ctx;
lparams.n_batch = params.n_batch;
lparams.n_gqa = params.n_gqa;
lparams.rms_norm_eps = params.rms_norm_eps;
lparams.n_gpu_layers = params.n_gpu_layers;
lparams.main_gpu = params.main_gpu;
lparams.tensor_split = params.tensor_split;
@ -685,7 +734,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
return lparams;
}
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params) {
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
auto lparams = llama_context_params_from_gpt_params(params);
llama_model * model = llama_load_model_from_file(params.model.c_str(), lparams);
@ -714,5 +763,497 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
}
}
if (params.ignore_eos) {
params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
}
{
LOG("warming up the model with an empty run\n");
const std::vector<llama_token> tmp = { llama_token_bos(lctx), };
llama_eval(lctx, tmp.data(), tmp.size(), 0, params.n_threads);
llama_reset_timings(lctx);
}
return std::make_tuple(model, lctx);
}
//
// Vocab utils
//
std::vector<llama_token> llama_tokenize(
struct llama_context * ctx,
const std::string & text,
bool add_bos) {
// upper limit for the number of tokens
int n_tokens = text.length() + add_bos;
std::vector<llama_token> result(n_tokens);
n_tokens = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
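// note: a negative return value signals that the buffer was too small,
// and its magnitude is the number of tokens actually required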
if (n_tokens < 0) {
result.resize(-n_tokens);
int check = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
GGML_ASSERT(check == -n_tokens);
} else {
result.resize(n_tokens);
}
return result;
}
std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
std::vector<char> result(8, 0);
const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
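// same convention as llama_tokenize above: a negative return value is the required buffer size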
if (n_tokens < 0) {
result.resize(-n_tokens);
int check = llama_token_to_piece(ctx, token, result.data(), result.size());
GGML_ASSERT(check == -n_tokens);
} else {
result.resize(n_tokens);
}
return std::string(result.data(), result.size());
}
std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_token> & tokens) {
const llama_token bos_id = llama_token_bos(ctx);
std::string piece;
std::string result;
for (size_t i = 0; i < tokens.size(); ++i) {
piece = llama_token_to_piece(ctx, tokens[i]);
// remove the leading space of the first non-BOS token
if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') {
piece = piece.substr(1);
}
result += piece;
}
return result;
}
std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_token> & tokens) {
std::string piece;
std::string result;
for (size_t i = 0; i < tokens.size(); ++i) {
piece = llama_token_to_piece(ctx, tokens[i]);
result += piece;
}
return result;
}
//
// Sampling utils
//
llama_token llama_sample_token(
struct llama_context * ctx,
struct llama_context * ctx_guidance,
struct llama_grammar * grammar,
const struct gpt_params & params,
const std::vector<llama_token> & last_tokens,
std::vector<llama_token_data> & candidates,
int idx) {
const int n_ctx = llama_n_ctx(ctx);
const int n_vocab = llama_n_vocab(ctx);
const float temp = params.temp;
const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
const float top_p = params.top_p;
const float tfs_z = params.tfs_z;
const float typical_p = params.typical_p;
const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
const float repeat_penalty = params.repeat_penalty;
const float alpha_presence = params.presence_penalty;
const float alpha_frequency = params.frequency_penalty;
const int mirostat = params.mirostat;
const float mirostat_tau = params.mirostat_tau;
const float mirostat_eta = params.mirostat_eta;
const bool penalize_nl = params.penalize_nl;
llama_token id = 0;
float * logits = llama_get_logits(ctx) + idx * n_vocab;
// Apply params.logit_bias map
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
logits[it->first] += it->second;
}
candidates.clear();
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
if (ctx_guidance) {
llama_sample_classifier_free_guidance(ctx, &cur_p, ctx_guidance, params.cfg_scale);
}
// apply penalties
if (!last_tokens.empty()) {
const float nl_logit = logits[llama_token_nl(ctx)];
const int last_n_repeat = std::min(std::min((int)last_tokens.size(), repeat_last_n), n_ctx);
llama_sample_repetition_penalty(ctx, &cur_p,
last_tokens.data() + last_tokens.size() - last_n_repeat,
last_n_repeat, repeat_penalty);
llama_sample_frequency_and_presence_penalties(ctx, &cur_p,
last_tokens.data() + last_tokens.size() - last_n_repeat,
last_n_repeat, alpha_frequency, alpha_presence);
if (!penalize_nl) {
for (size_t idx = 0; idx < cur_p.size; idx++) {
if (cur_p.data[idx].id == llama_token_nl(ctx)) {
cur_p.data[idx].logit = nl_logit;
break;
}
}
}
}
if (grammar != NULL) {
llama_sample_grammar(ctx, &cur_p, grammar);
}
if (temp <= 0) {
// Greedy sampling
id = llama_sample_token_greedy(ctx, &cur_p);
} else {
if (mirostat == 1) {
static float mirostat_mu = 2.0f * mirostat_tau;
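// note: mu is static, so it persists (and keeps adapting) across calls to this function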
const int mirostat_m = 100;
llama_sample_temperature(ctx, &cur_p, temp);
id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
} else if (mirostat == 2) {
static float mirostat_mu = 2.0f * mirostat_tau;
llama_sample_temperature(ctx, &cur_p, temp);
id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu);
} else {
// Temperature sampling
llama_sample_top_k (ctx, &cur_p, top_k, 1);
llama_sample_tail_free (ctx, &cur_p, tfs_z, 1);
llama_sample_typical (ctx, &cur_p, typical_p, 1);
llama_sample_top_p (ctx, &cur_p, top_p, 1);
llama_sample_temperature(ctx, &cur_p, temp);
{
const int n_top = 10;
LOG("top %d candidates:\n", n_top);
for (int i = 0; i < n_top; i++) {
const llama_token id = cur_p.data[i].id;
LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx, id).c_str(), cur_p.data[i].p);
}
}
id = llama_sample_token(ctx, &cur_p);
LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx, id).c_str());
}
}
// printf("`%d`", candidates_p.size);
if (grammar != NULL) {
llama_grammar_accept_token(ctx, grammar, id);
}
return id;
}
//
// YAML utils
//
// returns true if successful, false otherwise
bool create_directory_with_parents(const std::string & path) {
#ifdef _WIN32
std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
std::wstring wpath = converter.from_bytes(path);
// if the path already exists, check whether it's a directory
const DWORD attributes = GetFileAttributesW(wpath.c_str());
if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
return true;
}
size_t pos_slash = 0;
// process path from front to back, procedurally creating directories
while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
const std::wstring subpath = wpath.substr(0, pos_slash);
const wchar_t * test = subpath.c_str();
const bool success = CreateDirectoryW(test, NULL);
if (!success) {
const DWORD error = GetLastError();
// if the path already exists, ensure that it's a directory
if (error == ERROR_ALREADY_EXISTS) {
const DWORD attributes = GetFileAttributesW(subpath.c_str());
if (attributes == INVALID_FILE_ATTRIBUTES || !(attributes & FILE_ATTRIBUTE_DIRECTORY)) {
return false;
}
} else {
return false;
}
}
pos_slash += 1;
}
return true;
#else
// if the path already exists, check whether it's a directory
struct stat info;
if (stat(path.c_str(), &info) == 0) {
return S_ISDIR(info.st_mode);
}
size_t pos_slash = 1; // skip leading slashes for directory creation
// process path from front to back, procedurally creating directories
while ((pos_slash = path.find('/', pos_slash)) != std::string::npos) {
const std::string subpath = path.substr(0, pos_slash);
struct stat info;
// if the path already exists, ensure that it's a directory
if (stat(subpath.c_str(), &info) == 0) {
if (!S_ISDIR(info.st_mode)) {
return false;
}
} else {
// create parent directories
const int ret = mkdir(subpath.c_str(), 0755);
if (ret != 0) {
return false;
}
}
pos_slash += 1;
}
return true;
#endif // _WIN32
}
void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data) {
if (data.empty()) {
fprintf(stream, "%s:\n", prop_name);
return;
}
fprintf(stream, "%s: [", prop_name);
for (size_t i = 0; i < data.size() - 1; ++i) {
fprintf(stream, "%e, ", data[i]);
}
fprintf(stream, "%e]\n", data.back());
}
void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data) {
if (data.empty()) {
fprintf(stream, "%s:\n", prop_name);
return;
}
fprintf(stream, "%s: [", prop_name);
for (size_t i = 0; i < data.size() - 1; ++i) {
fprintf(stream, "%d, ", data[i]);
}
fprintf(stream, "%d]\n", data.back());
}
void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data) {
std::string data_str(data == NULL ? "" : data);
if (data_str.empty()) {
fprintf(stream, "%s:\n", prop_name);
return;
}
size_t pos_start = 0;
size_t pos_found = 0;
if (!data_str.empty() && (std::isspace(data_str[0]) || std::isspace(data_str.back()))) {
data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
data_str = "\"" + data_str + "\"";
fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
return;
}
if (data_str.find('\n') == std::string::npos) {
fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
return;
}
fprintf(stream, "%s: |\n", prop_name);
while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) {
fprintf(stream, " %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str());
pos_start = pos_found + 1;
}
}
std::string get_sortable_timestamp() {
using clock = std::chrono::system_clock;
const clock::time_point current_time = clock::now();
const time_t as_time_t = clock::to_time_t(current_time);
char timestamp_no_ns[100];
std::strftime(timestamp_no_ns, 100, "%Y_%m_%d-%H_%M_%S", std::localtime(&as_time_t));
const int64_t ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
current_time.time_since_epoch() % 1000000000).count();
char timestamp_ns[11];
snprintf(timestamp_ns, 11, "%09" PRId64, ns);
return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
}
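// The returned string has the form "YYYY_MM_DD-HH_MM_SS.nnnnnnnnn" (local time plus a
// 9-digit nanosecond suffix), so lexicographic order matches chronological order.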
void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
fprintf(stream, "build_commit: %s\n", BUILD_COMMIT);
fprintf(stream, "build_number: %d\n", BUILD_NUMBER);
fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
#ifdef NDEBUG
fprintf(stream, "debug: false\n");
#else
fprintf(stream, "debug: true\n");
#endif // NDEBUG
fprintf(stream, "model_desc: %s\n", model_desc);
fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(lctx));
#ifdef __OPTIMIZE__
fprintf(stream, "optimize: true\n");
#else
fprintf(stream, "optimize: false\n");
#endif // __OPTIMIZE__
fprintf(stream, "time: %s\n", timestamp.c_str());
fprintf(stream, "\n");
fprintf(stream, "###############\n");
fprintf(stream, "# User Inputs #\n");
fprintf(stream, "###############\n");
fprintf(stream, "\n");
fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
dump_string_yaml_multiline(stream, "cfg_negative_prompt", params.cfg_negative_prompt.c_str());
fprintf(stream, "cfg_scale: %f # default: 1.0\n", params.cfg_scale);
fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
fprintf(stream, "export: %s # default: false\n", params.export_cgraph ? "true" : "false");
fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", params.frequency_penalty);
dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str());
fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
const auto logit_bias_eos = params.logit_bias.find(llama_token_eos(lctx));
const bool ignore_eos = logit_bias_eos != params.logit_bias.end() && logit_bias_eos->second == -INFINITY;
fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");
dump_string_yaml_multiline(stream, "in_prefix", params.input_prefix.c_str());
fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
dump_string_yaml_multiline(stream, "in_suffix", params.input_prefix.c_str());
fprintf(stream, "instruct: %s # default: false\n", params.instruct ? "true" : "false");
fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
fprintf(stream, "logit_bias:\n");
for (std::pair<llama_token, float> lb : params.logit_bias) {
if (ignore_eos && lb.first == logit_bias_eos->first) {
continue;
}
fprintf(stream, " %d: %f", lb.first, lb.second);
}
fprintf(stream, "lora: %s\n", params.lora_adapter.c_str());
fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
fprintf(stream, "low_vram: %s # default: false\n", params.low_vram ? "true" : "false");
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", params.mirostat);
fprintf(stream, "mirostat_ent: %f # default: 5.0\n", params.mirostat_tau);
fprintf(stream, "mirostat_lr: %f # default: 0.1\n", params.mirostat_eta);
fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str());
fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
fprintf(stream, "mtest: %s # default: false\n", params.mem_test ? "true" : "false");
fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
fprintf(stream, "n_gpu_layers: %d # default: 0\n", params.n_gpu_layers);
fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", params.n_probs);
fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false");
fprintf(stream, "no_penalize_nl: %s # default: false\n", !params.penalize_nl ? "true" : "false");
fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false");
fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
fprintf(stream, "presence_penalty: %f # default: 0.0\n", params.presence_penalty);
dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str());
fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens);
fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
fprintf(stream, "repeat_penalty: %f # default: 1.1\n", params.repeat_penalty);
fprintf(stream, "reverse_prompt:\n");
for (std::string ap : params.antiprompt) {
size_t pos = 0;
while ((pos = ap.find('\n', pos)) != std::string::npos) {
ap.replace(pos, 1, "\\n");
pos += 1;
}
fprintf(stream, " - %s\n", ap.c_str());
}
fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed);
fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
fprintf(stream, "temp: %f # default: 0.8\n", params.temp);
const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES);
dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
fprintf(stream, "tfs: %f # default: 1.0\n", params.tfs_z);
fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency());
fprintf(stream, "top_k: %d # default: 40\n", params.top_k);
fprintf(stream, "top_p: %f # default: 0.95\n", params.top_p);
fprintf(stream, "typical_p: %f # default: 1.0\n", params.typical_p);
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
}

View file

@ -4,6 +4,9 @@
#include "llama.h" #include "llama.h"
#define LOG_NO_FILE_LINE_FUNCTION
#include "log.h"
#include <string>
#include <vector>
#include <random>
@ -11,6 +14,12 @@
#include <unordered_map>
#include <tuple>
#ifdef _WIN32
#define DIRECTORY_SEPARATOR '\\'
#else
#define DIRECTORY_SEPARATOR '/'
#endif // _WIN32
//
// CLI argument parsing
//
@ -22,19 +31,18 @@ struct gpt_params {
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 512; // context size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_gqa = 1; // grouped-query attention factor (TODO: move to hparams)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_draft = 16; // number of tokens to draft during speculative decoding
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
int32_t n_gpu_layers = 0; // number of layers to store in VRAM
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS; // rms norm epsilon
int32_t n_beams = 0; // if non-zero then use beam search of given width.
float rope_freq_base = 10000.0f; // RoPE base frequency
float rope_freq_scale = 1.0f; // RoPE frequency scaling factor
// sampling parameters
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
int32_t top_k = 40; // <= 0 to use vocab size
float top_p = 0.95f; // 1.0 = disabled
float tfs_z = 1.00f; // 1.0 = disabled
@ -48,12 +56,15 @@ struct gpt_params {
float mirostat_tau = 5.00f; // target entropy
float mirostat_eta = 0.10f; // learning rate
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
// Classifier-Free Guidance
// https://arxiv.org/abs/2306.17806
std::string cfg_negative_prompt; // string to help guidance
float cfg_scale = 1.f; // How strong is guidance
std::string model = "models/7B/ggml-model.bin"; // model path
std::string model = "models/7B/ggml-model-f16.gguf"; // model path
std::string model_draft = ""; // draft model for speculative decoding
std::string model_alias = "unknown"; // model alias
std::string prompt = "";
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
@ -61,15 +72,20 @@ struct gpt_params {
std::string input_suffix = ""; // string to suffix user inputs with
std::string grammar = ""; // optional BNF-like grammar to constrain sampling
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
std::string logdir = ""; // directory in which to save YAML log files
std::string lora_adapter = ""; // lora adapter path
std::string lora_base = ""; // base model path for the lora adapter
int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
// (which is more convenient to use for plotting)
//
bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
bool low_vram = false; // if true, reduce VRAM usage at the cost of performance
bool mul_mat_q = false; // if true, use experimental mul_mat_q kernels
bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
bool memory_f16 = true; // use f16 instead of f32 for memory kv
bool random_prompt = false; // do not randomize prompt if none provided
bool use_color = false; // use color to distinguish generations and inputs
@ -78,11 +94,13 @@ struct gpt_params {
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
bool embedding = false; // get only sentence embedding
bool escape = false; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
bool interactive_first = false; // wait for user input immediately
bool multiline_input = false; // reverse the usage of `\`
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool ignore_eos = false; // ignore generated EOS tokens
bool instruct = false; // instruction mode (used for Alpaca models)
bool penalize_nl = true; // consider newlines as a repeatable token
bool perplexity = false; // compute perplexity over the prompt
@ -100,15 +118,86 @@ void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
std::string gpt_random_prompt(std::mt19937 & rng);
//
// Vocab utils
//
std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);
//
// Model utils
//
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params);
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
//
// Vocab utils
//
// tokenizes a string into a vector of tokens
// should work similar to Python's `tokenizer.encode`
std::vector<llama_token> llama_tokenize(
struct llama_context * ctx,
const std::string & text,
bool add_bos);
// tokenizes a token into a piece
// should work similar to Python's `tokenizer.id_to_piece`
std::string llama_token_to_piece(
const struct llama_context * ctx,
llama_token token);
// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
// that takes into account the tokenizer type and decides how to handle the leading space
//
// detokenizes a vector of tokens into a string
// should work similar to Python's `tokenizer.decode`
// removes the leading space from the first non-BOS token
std::string llama_detokenize_spm(
llama_context * ctx,
const std::vector<llama_token> & tokens);
// detokenizes a vector of tokens into a string
// should work similar to Python's `tokenizer.decode`
std::string llama_detokenize_bpe(
llama_context * ctx,
const std::vector<llama_token> & tokens);
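// Example (illustrative sketch, not part of this change): round-tripping a prompt with the
// helpers above, assuming `ctx` is an already initialized llama_context for an SPM-based model:
//
//   const std::vector<llama_token> toks = llama_tokenize(ctx, "Hello world", /*add_bos=*/true);
//   const std::string text = llama_detokenize_spm(ctx, toks); // typically yields "Hello world" again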
//
// Sampling utils
//
// this is a common sampling function used across the examples for convenience
// it can serve as a starting point for implementing your own sampling function
//
// required:
// - ctx: context to use for sampling
// - params: sampling parameters
//
// optional:
// - ctx_guidance: context to use for classifier-free guidance, ignore if NULL
// - grammar: grammar to use for sampling, ignore if NULL
// - last_tokens: needed for repetition penalty, ignore if empty
// - idx: sample from llama_get_logits(ctx) + idx * n_vocab
//
// returns:
// - token: sampled token
// - candidates: vector of candidate tokens
//
llama_token llama_sample_token(
struct llama_context * ctx,
struct llama_context * ctx_guidance,
struct llama_grammar * grammar,
const struct gpt_params & params,
const std::vector<llama_token> & last_tokens,
std::vector<llama_token_data> & candidates,
int idx = 0);
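// Example (illustrative sketch, not part of this change): a minimal call pattern, assuming
// `ctx` holds freshly evaluated logits and `params` is a filled gpt_params:
//
//   std::vector<llama_token_data> candidates;
//   candidates.reserve(llama_n_vocab(ctx));
//   std::vector<llama_token> last_tokens; // empty -> no repetition penalty
//   const llama_token id = llama_sample_token(ctx, NULL, NULL, params, last_tokens, candidates);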
//
// YAML utils
//
bool create_directory_with_parents(const std::string & path);
void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data);
void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data);
void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data);
std::string get_sortable_timestamp();
void dump_non_result_info_yaml(
FILE * stream, const gpt_params & params, const llama_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
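// Example (illustrative sketch, not part of this change): how these helpers can be combined
// to write a run log under params.logdir; `lctx`, `prompt_tokens` and the "unknown" model
// description are assumed/placeholder values from the surrounding program:
//
//   if (!params.logdir.empty() && create_directory_with_parents(params.logdir)) {
//       const std::string timestamp = get_sortable_timestamp();
//       const std::string path = params.logdir + DIRECTORY_SEPARATOR + timestamp + ".yml";
//       FILE * logfile = fopen(path.c_str(), "w");
//       if (logfile) {
//           dump_non_result_info_yaml(logfile, params, lctx, timestamp, prompt_tokens, "unknown");
//           fclose(logfile);
//       }
//   }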

View file

@ -235,6 +235,7 @@ namespace console {
int estimateWidth(char32_t codepoint) {
#if defined(_WIN32)
(void)codepoint;
return 1;
#else
return wcwidth(codepoint);

common/log.h Normal file
View file

@ -0,0 +1,643 @@
#pragma once
#include <chrono>
#include <cstring>
#include <sstream>
#include <iostream>
#include <thread>
#include <vector>
#include <algorithm>
#include <cinttypes>
// --------------------------------
//
// Basic usage:
//
// --------
//
// The LOG() and LOG_TEE() macros are ready to go by default
// they do not require any initialization.
//
// LOGLN() and LOG_TEELN() are variants which automatically
// include \n character at the end of the log string.
//
// LOG() behaves exactly like printf, by default writing to a logfile.
// LOG_TEE() additionally prints to the screen ( mimics the Unix tee command ).
//
// Default logfile is named
// "llama.<threadID>.log"
// Default LOG_TEE() secondary output target is
// stderr
//
// Logs can be dynamically disabled or enabled using functions:
// log_disable()
// and
// log_enable()
//
// A log target can be changed with:
// log_set_target( string )
// creating and opening, or re-opening a file by string filename
// or
// log_set_target( FILE* )
// allowing it to point at stderr, stdout, or any valid FILE* handle.
//
// --------
//
// End of Basic usage.
//
// --------------------------------
// Specifies a log target.
// default uses log_handler() with "llama.log" log file
// this can be changed, by defining LOG_TARGET
// like so:
//
// #define LOG_TARGET (a valid FILE*)
// #include "log.h"
//
// or it can be simply redirected to stdout or stderr
// like so:
//
// #define LOG_TARGET stderr
// #include "log.h"
//
// The log target can also be redirected to a different function
// like so:
//
// #define LOG_TARGET log_handler_different()
// #include "log.h"
//
// FILE* log_handler_different()
// {
// return stderr;
// }
//
// or:
//
// #define LOG_TARGET log_handler_another_one("somelog.log")
// #include "log.h"
//
// FILE* log_handler_another_one(char*filename)
// {
// static FILE* logfile = nullptr;
// (...)
// if( !logfile )
// {
// fopen(...)
// }
// (...)
// return logfile
// }
//
#ifndef LOG_TARGET
#define LOG_TARGET log_handler()
#endif
#ifndef LOG_TEE_TARGET
#define LOG_TEE_TARGET stderr
#endif
// Utility to obtain "pid" like unique process id and use it when creating log files.
inline std::string log_get_pid()
{
static std::string pid;
if (pid.empty())
{
// std::this_thread::get_id() is the most portable way of obtaining a "process id"
// it's not the same as "pid" but is unique enough to solve multiple instances
// trying to write to the same log.
std::stringstream ss;
ss << std::this_thread::get_id();
pid = ss.str();
}
return pid;
}
// Utility function for generating log file names with unique id based on thread id.
// invocation with log_filename_generator( "llama", "log" ) creates a string "llama.<number>.log"
// where the number is a runtime id of the current thread.
#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(log_file_basename, log_file_extension)
// INTERNAL, DO NOT USE
inline std::string log_filename_generator_impl(const std::string & log_file_basename, const std::string & log_file_extension)
{
std::stringstream buf;
buf << log_file_basename;
buf << ".";
buf << log_get_pid();
buf << ".";
buf << log_file_extension;
return buf.str();
}
#ifndef LOG_DEFAULT_FILE_NAME
#define LOG_DEFAULT_FILE_NAME log_filename_generator("llama", "log")
#endif
// Utility for turning #define values into string literals
// so we can have a define for stderr and
// we can print "stderr" instead of literal stderr, etc.
#define LOG_STRINGIZE1(s) #s
#define LOG_STRINGIZE(s) LOG_STRINGIZE1(s)
#define LOG_TEE_TARGET_STRING LOG_STRINGIZE(LOG_TEE_TARGET)
// Allows disabling timestamps.
// in order to disable, define LOG_NO_TIMESTAMPS
// like so:
//
// #define LOG_NO_TIMESTAMPS
// #include "log.h"
//
#ifndef LOG_NO_TIMESTAMPS
#ifndef _MSC_VER
#define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
#define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
#else
#define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
#define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
#endif
#else
#define LOG_TIMESTAMP_FMT "%s"
#define LOG_TIMESTAMP_VAL ,""
#endif
#ifdef LOG_TEE_TIMESTAMPS
#ifndef _MSC_VER
#define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
#define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
#else
#define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
#define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
#endif
#else
#define LOG_TEE_TIMESTAMP_FMT "%s"
#define LOG_TEE_TIMESTAMP_VAL ,""
#endif
// Allows disabling file/line/function prefix
// in order to disable, define LOG_NO_FILE_LINE_FUNCTION
// like so:
//
// #define LOG_NO_FILE_LINE_FUNCTION
// #include "log.h"
//
#ifndef LOG_NO_FILE_LINE_FUNCTION
#ifndef _MSC_VER
#define LOG_FLF_FMT "[%24s:%5d][%24s] "
#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
#else
#define LOG_FLF_FMT "[%24s:%5ld][%24s] "
#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
#endif
#else
#define LOG_FLF_FMT "%s"
#define LOG_FLF_VAL ,""
#endif
#ifdef LOG_TEE_FILE_LINE_FUNCTION
#ifndef _MSC_VER
#define LOG_TEE_FLF_FMT "[%24s:%5d][%24s] "
#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
#else
#define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] "
#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
#endif
#else
#define LOG_TEE_FLF_FMT "%s"
#define LOG_TEE_FLF_VAL ,""
#endif
// Utility for synchronizing log configuration state
// since std::optional was introduced only in c++17
enum LogTriState
{
LogTriStateSame,
LogTriStateFalse,
LogTriStateTrue
};
// INTERNAL, DO NOT USE
// USE LOG() INSTEAD
//
#ifndef _MSC_VER
#define LOG_IMPL(str, ...) \
{ \
if (LOG_TARGET != nullptr) \
{ \
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
fflush(LOG_TARGET); \
} \
}
#else
#define LOG_IMPL(str, ...) \
{ \
if (LOG_TARGET != nullptr) \
{ \
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
fflush(LOG_TARGET); \
} \
}
#endif
// INTERNAL, DO NOT USE
// USE LOG_TEE() INSTEAD
//
#ifndef _MSC_VER
#define LOG_TEE_IMPL(str, ...) \
{ \
if (LOG_TARGET != nullptr) \
{ \
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
fflush(LOG_TARGET); \
} \
if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \
{ \
fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL, __VA_ARGS__); \
fflush(LOG_TEE_TARGET); \
} \
}
#else
#define LOG_TEE_IMPL(str, ...) \
{ \
if (LOG_TARGET != nullptr) \
{ \
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
fflush(LOG_TARGET); \
} \
if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \
{ \
fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL "", ##__VA_ARGS__); \
fflush(LOG_TEE_TARGET); \
} \
}
#endif
// The '\0' as a last argument is a trick to bypass the silly
// "warning: ISO C++11 requires at least one argument for the "..." in a variadic macro"
// so we can have a single macro which can be called just like printf.
// Main LOG macro.
// behaves like printf, and supports arguments the exact same way.
//
#ifndef _MSC_VER
#define LOG(...) LOG_IMPL(__VA_ARGS__, "")
#else
#define LOG(str, ...) LOG_IMPL("%s" str, "", __VA_ARGS__, "")
#endif
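// For example (non-MSVC), LOG("n = %d\n", 42) expands to LOG_IMPL("n = %d\n", 42, ""),
// which prints the (optional) timestamp and file/line/function prefixes followed by "n = 42",
// with the trailing "" consumed by the "%s" appended to the format string.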
// Main TEE macro.
// does the same as LOG
// and
// simultaneously writes stderr.
//
// Secondary target can be changed just like LOG_TARGET
// by defining LOG_TEE_TARGET
//
#ifndef _MSC_VER
#define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
#else
#define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "")
#endif
// LOG macro variants with auto endline.
#ifndef _MSC_VER
#define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
#define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
#else
#define LOGLN(str, ...) LOG_IMPL("%s" str, "", __VA_ARGS__, "\n")
#define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "\n")
#endif
// INTERNAL, DO NOT USE
inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr)
{
static bool _initialized{false};
static bool _disabled{(filename.empty() && target == nullptr)};
static std::string log_current_filename{filename};
static FILE *log_current_target{target};
static FILE *logfile = nullptr;
if (change)
{
if (disable == LogTriStateTrue)
{
// Disable primary target
_disabled = true;
}
// If previously disabled, only enable, and keep previous target
else if (disable == LogTriStateFalse)
{
_disabled = false;
}
// Otherwise, process the arguments
else if (log_current_filename != filename || log_current_target != target)
{
_initialized = false;
}
}
if (_disabled)
{
// Log is disabled
return nullptr;
}
if (_initialized)
{
// with fallback in case something went wrong
return logfile ? logfile : stderr;
}
// do the (re)initialization
if (target != nullptr)
{
if (logfile != nullptr && logfile != stdout && logfile != stderr)
{
fclose(logfile);
}
log_current_filename = LOG_DEFAULT_FILE_NAME;
log_current_target = target;
logfile = target;
}
else
{
if (log_current_filename != filename)
{
if (logfile != nullptr && logfile != stdout && logfile != stderr)
{
fclose(logfile);
}
}
logfile = fopen(filename.c_str(), "w");
}
if (!logfile)
{
// Verify whether the file was opened, otherwise fallback to stderr
logfile = stderr;
fprintf(stderr, "Failed to open logfile '%s' with error '%s'\n", filename.c_str(), std::strerror(errno));
fflush(stderr);
// At this point we let the init flag be set to true below, and let the target fall back to stderr
// otherwise we would repeatedly fopen() which was already unsuccessful
}
_initialized = true;
return logfile ? logfile : stderr;
}
// INTERNAL, DO NOT USE
inline FILE *log_handler2_impl(bool change = false, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME)
{
return log_handler1_impl(change, disable, filename, target);
}
// Disables logs entirely at runtime.
// Makes LOG() and LOG_TEE() produce no output,
// until enabled again.
#define log_disable() log_disable_impl()
// INTERNAL, DO NOT USE
inline FILE *log_disable_impl()
{
return log_handler1_impl(true, LogTriStateTrue);
}
// Enables logs at runtime.
#define log_enable() log_enable_impl()
// INTERNAL, DO NOT USE
inline FILE *log_enable_impl()
{
return log_handler1_impl(true, LogTriStateFalse);
}
// Sets target for logs, either by a file name or FILE* pointer (stdout, stderr, or any valid FILE*)
#define log_set_target(target) log_set_target_impl(target)
// INTERNAL, DO NOT USE
inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, filename); }
inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, target); }
// INTERNAL, DO NOT USE
inline FILE *log_handler() { return log_handler1_impl(); }
inline void log_test()
{
log_disable();
LOG("01 Hello World to nobody, because logs are disabled!\n")
log_enable();
LOG("02 Hello World to default output, which is \"%s\" ( Yaaay, arguments! )!\n", LOG_STRINGIZE(LOG_TARGET))
LOG_TEE("03 Hello World to **both** default output and " LOG_TEE_TARGET_STRING "!\n")
log_set_target(stderr);
LOG("04 Hello World to stderr!\n")
LOG_TEE("05 Hello World TEE with double printing to stderr prevented!\n")
log_set_target(LOG_DEFAULT_FILE_NAME);
LOG("06 Hello World to default log file!\n")
log_set_target(stdout);
LOG("07 Hello World to stdout!\n")
log_set_target(LOG_DEFAULT_FILE_NAME);
LOG("08 Hello World to default log file again!\n")
log_disable();
LOG("09 Hello World _1_ into the void!\n")
log_enable();
LOG("10 Hello World back from the void ( you should not see _1_ in the log or the output )!\n")
log_disable();
log_set_target("llama.anotherlog.log");
LOG("11 Hello World _2_ to nobody, new target was selected but logs are still disabled!\n")
log_enable();
LOG("12 Hello World this time in a new file ( you should not see _2_ in the log or the output )?\n")
log_set_target("llama.yetanotherlog.log");
LOG("13 Hello World this time in yet new file?\n")
log_set_target(log_filename_generator("llama_autonamed", "log"));
LOG("14 Hello World in log with generated filename!\n")
#ifdef _MSC_VER
LOG_TEE("15 Hello msvc TEE without arguments\n")
LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test")
LOG_TEELN("17 Hello msvc TEELN without arguments\n")
LOG_TEELN("18 Hello msvc TEELN with (%d)(%s) arguments\n", 1, "test")
LOG("19 Hello msvc LOG without arguments\n")
LOG("20 Hello msvc LOG with (%d)(%s) arguments\n", 1, "test")
LOGLN("21 Hello msvc LOGLN without arguments\n")
LOGLN("22 Hello msvc LOGLN with (%d)(%s) arguments\n", 1, "test")
#endif
}
inline bool log_param_single_parse(const std::string & param)
{
if ( param == "--log-test")
{
log_test();
return true;
}
if ( param == "--log-disable")
{
log_disable();
return true;
}
if ( param == "--log-enable")
{
log_enable();
return true;
}
return false;
}
inline bool log_param_pair_parse(bool check_but_dont_parse, const std::string & param, const std::string & next = std::string())
{
if ( param == "--log-file")
{
if (!check_but_dont_parse)
{
log_set_target(log_filename_generator(next.empty() ? "unnamed" : next, "log"));
}
return true;
}
return false;
}
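// Example (illustrative sketch): wiring the two parsers above into an existing argument loop:
//
//   if (log_param_single_parse(argv[i])) {
//       continue;
//   }
//   if (log_param_pair_parse(false, argv[i], i + 1 < argc ? argv[i + 1] : "")) {
//       i++; // the value (if any) was consumed
//       continue;
//   }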
inline void log_print_usage()
{
fprintf(stdout, "log options:\n");
/* format
fprintf(stdout, " -h, --help show this help message and exit\n");*/
/* spacing
fprintf(stdout, "__-param----------------Description\n");*/
fprintf(stdout, " --log-test Run simple logging test\n");
fprintf(stdout, " --log-disable Disable trace logs\n");
fprintf(stdout, " --log-enable Enable trace logs\n");
fprintf(stdout, " --log-file Specify a log filename (without extension)\n");
fprintf(stdout, " Log file will be tagged with unique ID and written as \"<name>.<ID>.log\"\n"); /* */
}
#define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv)
// INTERNAL, DO NOT USE
inline void log_dump_cmdline_impl(int argc, char **argv)
{
std::stringstream buf;
for (int i = 0; i < argc; ++i)
{
if (std::string(argv[i]).find(' ') != std::string::npos)
{
buf << " \"" << argv[i] <<"\"";
}
else
{
buf << " " << argv[i];
}
}
LOGLN("Cmd:%s", buf.str().c_str())
}
#define log_tostr(var) log_var_to_string_impl(var).c_str()
inline std::string log_var_to_string_impl(bool var)
{
return var ? "true" : "false";
}
inline std::string log_var_to_string_impl(std::string var)
{
return var;
}
inline std::string log_var_to_string_impl(const std::vector<int> & var)
{
std::stringstream buf;
buf << "[ ";
bool first = true;
for (auto e : var)
{
if (first)
{
first = false;
}
else
{
buf << ", ";
}
buf << std::to_string(e);
}
buf << " ]";
return buf.str();
}
#define LOG_TOKENS_TOSTR_PRETTY(ctx, tokens) \
[&tokens, &ctx]() \
{ \
std::stringstream buf; \
buf << "[ "; \
\
bool first = true; \
for (const auto &token : tokens) \
{ \
if (!first) \
buf << ", "; \
else \
first = false; \
\
auto detokenized = llama_token_to_piece(ctx, token); \
\
detokenized.erase( \
std::remove_if( \
detokenized.begin(), \
detokenized.end(), \
[](const unsigned char c) { return !std::isprint(c); }), \
detokenized.end()); \
\
buf \
<< "'" << detokenized << "'" \
<< ":" << std::to_string(token); \
} \
buf << " ]"; \
\
return buf.str(); \
}() \
.c_str()
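// Example usage (illustrative): pretty-print a token list into a log line, where `embd_inp`
// is any iterable of llama_token (e.g. std::vector<llama_token>):
//
//   LOG("prompt tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp));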
#ifdef LOG_DISABLE_LOGS
#undef LOG
#define LOG(...) // dummy stub
#undef LOGLN
#define LOGLN(...) // dummy stub
#undef LOG_TEE
#define LOG_TEE(...) fprintf(stderr, __VA_ARGS__); // convert to normal fprintf
#undef LOG_TEELN
#define LOG_TEELN(...) fprintf(stderr, __VA_ARGS__); // convert to normal fprintf
#undef LOG_DISABLE
#define LOG_DISABLE() // dummy stub
#undef LOG_ENABLE
#define LOG_ENABLE() // dummy stub
#undef LOG_ENABLE
#define LOG_ENABLE() // dummy stub
#undef LOG_SET_TARGET
#define LOG_SET_TARGET(...) // dummy stub
#undef LOG_DUMP_CMDLINE
#define LOG_DUMP_CMDLINE(...) // dummy stub
#endif // LOG_DISABLE_LOGS

convert-falcon-hf-to-gguf.py Executable file
View file

@ -0,0 +1,267 @@
#!/usr/bin/env python3
# HF falcon--> gguf conversion
from __future__ import annotations
import argparse
import json
import os
import struct
import sys
from pathlib import Path
from typing import Any
import numpy as np
import torch
from transformers import AutoTokenizer # type: ignore[import]
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf
def bytes_to_unicode():
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
"""
Returns list of utf-8 byte and a corresponding list of unicode strings.
The reversible bpe codes work on unicode strings.
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
This is a significant percentage of your normal, say, 32K bpe vocab.
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
And avoids mapping to whitespace/control characters the bpe code barfs on.
"""
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8+n)
n += 1
return dict(zip(bs, (chr(n) for n in cs)))
def count_model_parts(dir_model: Path) -> int:
num_parts = 0
for filename in os.listdir(dir_model):
if filename.startswith("pytorch_model-"):
num_parts += 1
if num_parts > 0:
print("gguf: found " + str(num_parts) + " model parts")
return num_parts
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Convert a Falcon model to a GGML compatible file")
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
parser.add_argument("ftype", type=int, choices=[0, 1], help="output format - use 0 for float32, 1 for float16", default = 1)
return parser.parse_args()
args = parse_args()
dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
print(f'Error: {args.model} is not a directory', file = sys.stderr)
sys.exit(1)
# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16
# map from ftype to string
ftype_str = ["f32", "f16"]
if args.outfile is not None:
fname_out = args.outfile
else:
# output in the same directory as the model by default
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
print("gguf: loading model "+dir_model.name)
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
if hparams["architectures"][0] != "RWForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit(1)
# get number of model parts
num_parts = count_model_parts(dir_model)
ARCH=gguf.MODEL_ARCH.FALCON
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
print("gguf: get model metadata")
block_count = hparams["n_layer"]
gguf_writer.add_name("Falcon")
gguf_writer.add_context_length(2048) # not in config.json
gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
gguf_writer.add_embedding_length(hparams["hidden_size"])
gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_head_count(hparams["n_head"])
if "n_head_kv" in hparams:
gguf_writer.add_head_count_kv(hparams["n_head_kv"])
else:
gguf_writer.add_head_count_kv(1)
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
gguf_writer.add_file_type(ftype)
# TOKENIZATION
print("gguf: get tokenizer metadata")
tokens: list[bytearray] = []
scores: list[float] = []
toktypes: list[int] = []
tokenizer_json_file = dir_model / 'tokenizer.json'
if not tokenizer_json_file.is_file():
print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
sys.exit(1)
# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")
with open(tokenizer_json_file, "r", encoding="utf-8") as f:
tokenizer_json = json.load(f)
print("gguf: get gpt2 tokenizer vocab")
vocab_size = len(tokenizer_json["model"]["vocab"])
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}
for i in range(vocab_size):
if i in reverse_vocab:
try:
text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
except KeyError:
text = bytearray()
for c in reverse_vocab[i]:
if ord(c) < 256: # single byte character
text.append(byte_decoder[ord(c)])
else: # multibyte special token character
text.extend(c.encode('utf-8'))
else:
print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
pad_token = f"[PAD{i}]".encode("utf8")
text = bytearray(pad_token)
tokens.append(text)
scores.append(0.0) # dummy
toktypes.append(gguf.TokenType.NORMAL) # dummy
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
special_vocab.add_to_gguf(gguf_writer)
# TENSORS
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
# params for qkv transform
n_head = hparams["n_head"]
n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
head_dim = hparams["hidden_size"] // n_head
# tensor info
print("gguf: get tensor metadata")
if num_parts == 0:
part_names = iter(("pytorch_model.bin",))
else:
part_names = (
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
)
for part_name in part_names:
if args.vocab_only:
break
print("gguf: loading model part '" + part_name + "'")
model_part = torch.load(dir_model / part_name, map_location="cpu")
for name in model_part.keys():
data = model_part[name]
old_dtype = data.dtype
# convert any unsupported data types to float32
if data.dtype != torch.float16 and data.dtype != torch.float32:
data = data.to(torch.float32)
# QKV tensor transform
# The original query_key_value tensor contains n_head_kv "kv groups",
# each consisting of n_head/n_head_kv query weights followed by one key
# and one value weight (shared by all query heads in the kv group).
# This layout makes it a big pain to work with in GGML.
# So we rearrange them here, so that we have n_head query weights
# followed by n_head_kv key weights followed by n_head_kv value weights,
# in contiguous fashion.
# ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
if "query_key_value" in name:
qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head)
k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
data = torch.cat((q,k,v)).reshape_as(data)
data = data.squeeze().numpy()
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
if new_name is None:
print("Can not map tensor '" + name + "'")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why can't we use these float16 values as-is? There should be no reason to store float16 as float32
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.add_tensor(new_name, data)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
print(f"gguf: model successfully exported to '{fname_out}'")
print("")

239
convert-gptneox-hf-to-gguf.py Executable file
View file

@ -0,0 +1,239 @@
#!/usr/bin/env python3
# HF gptneox --> gguf conversion
from __future__ import annotations
import argparse
import json
import os
import struct
import sys
from pathlib import Path
from typing import Any
import numpy as np
import torch
from transformers import AutoTokenizer # type: ignore[import]
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
def bytes_to_unicode():
"""
Returns list of utf-8 byte and a corresponding list of unicode strings.
The reversible bpe codes work on unicode strings.
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
This is a significant percentage of your normal, say, 32K bpe vocab.
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
It also avoids mapping to whitespace/control characters that the bpe code barfs on.
"""
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8+n)
n += 1
return dict(zip(bs, (chr(n) for n in cs)))
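A small usage sketch of this table (the variable names mirror the ones used further down in this script): mapping a typical GPT-2 style token back to its raw bytes.

```python
# Sketch: decode a GPT-2 style token string back into raw bytes.
byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}

token = "Ġhello"                               # 'Ġ' is the printable stand-in for the space byte
print(bytes(byte_decoder[c] for c in token))   # b' hello'
```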
def count_model_parts(dir_model: Path) -> int:
num_parts = 0
for filename in os.listdir(dir_model):
if filename.startswith("pytorch_model-"):
num_parts += 1
if num_parts > 0:
print("gguf: found " + str(num_parts) + " model parts")
return num_parts
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Convert a GPT-NeoX model to a GGML compatible file")
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
parser.add_argument("ftype", type=int, choices=[0, 1], help="output format - use 0 for float32, 1 for float16", default = 1)
return parser.parse_args()
args = parse_args()
dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
print(f'Error: {args.model} is not a directory', file = sys.stderr)
sys.exit(1)
# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16
# map from ftype to string
ftype_str = ["f32", "f16"]
if args.outfile is not None:
fname_out = args.outfile
else:
# output in the same directory as the model by default
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
print("gguf: loading model "+dir_model.name)
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
if hparams["architectures"][0] != "GPTNeoXForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit()
# get number of model parts
num_parts = count_model_parts(dir_model)
ARCH=gguf.MODEL_ARCH.GPTNEOX
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
print("gguf: get model metadata")
block_count = hparams["num_hidden_layers"]
gguf_writer.add_name(dir_model.name)
gguf_writer.add_context_length(hparams["max_position_embeddings"])
gguf_writer.add_embedding_length(hparams["hidden_size"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
gguf_writer.add_rope_dimension_count(int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"])))
gguf_writer.add_head_count(hparams["num_attention_heads"])
gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
gguf_writer.add_layer_norm_eps(hparams["layer_norm_eps"])
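The rope dimension count written above is just rotary_pct times the per-head dimension; with hypothetical GPT-NeoX-style hyperparameters the arithmetic looks like this:

```python
# Hypothetical hparams, only to illustrate the rope-dimension arithmetic above.
rotary_pct, hidden_size, num_attention_heads = 0.25, 2048, 16
head_dim = hidden_size // num_attention_heads   # 128
print(int(rotary_pct * head_dim))               # 32 rotary dimensions
```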
# TOKENIZATION
print("gguf: get tokenizer metadata")
tokens: list[bytearray] = []
tokenizer_json_file = dir_model / 'tokenizer.json'
if not tokenizer_json_file.is_file():
print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
sys.exit(1)
# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")
with open(tokenizer_json_file, "r", encoding="utf-8") as f:
tokenizer_json = json.load(f)
print("gguf: get gpt2 tokenizer vocab")
vocab_size = len(tokenizer_json["model"]["vocab"])
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}
for i in range(vocab_size):
if i in reverse_vocab:
try:
text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
except KeyError:
text = bytearray()
for c in reverse_vocab[i]:
if ord(c) < 256: # single byte character
text.append(byte_decoder[ord(c)])
else: # multibyte special token character
text.extend(c.encode('utf-8'))
else:
print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
pad_token = f"[PAD{i}]".encode("utf8")
text = bytearray(pad_token)
tokens.append(text)
gguf_writer.add_token_list(tokens)
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
special_vocab.add_to_gguf(gguf_writer)
# TENSORS
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
# tensor info
print("gguf: get tensor metadata")
if num_parts == 0:
part_names = iter(("pytorch_model.bin",))
else:
part_names = (
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
)
for part_name in part_names:
if args.vocab_only:
break
print("gguf: loading model part '" + part_name + "'")
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
for name in model_part.keys():
data = model_part[name]
# we don't need these
if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"):
continue
old_dtype = data.dtype
# convert any unsupported data types to float32
if data.dtype != torch.float16 and data.dtype != torch.float32:
data = data.to(torch.float32)
data = data.squeeze().numpy()
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
if new_name is None:
print("Can not map tensor '" + name + "'")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why can't we use these float16 values as-is? There should be no reason to store float16 as float32
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.add_tensor(new_name, data)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
print(f"gguf: model successfully exported to '{fname_out}'")
print("")

353
convert-llama-ggmlv3-to-gguf.py Executable file
View file

@ -0,0 +1,353 @@
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import math
import struct
import sys
from pathlib import Path
import numpy as np
import os
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf
# Note: Does not support GGML_QKK_64
QK_K = 256
# Items here are (block size, type size)
GGML_QUANT_SIZES = {
gguf.GGMLQuantizationType.F32 : (1, 4),
gguf.GGMLQuantizationType.F16 : (1, 2),
gguf.GGMLQuantizationType.Q4_0 : (32, 2 + 16),
gguf.GGMLQuantizationType.Q4_1 : (32, 2 + 2 + 16),
gguf.GGMLQuantizationType.Q5_0 : (32, 2 + 4 + 16),
gguf.GGMLQuantizationType.Q5_1 : (32, 2 + 2 + 4 + 16),
gguf.GGMLQuantizationType.Q8_0 : (32, 2 + 32),
gguf.GGMLQuantizationType.Q8_1 : (32, 4 + 4 + 32),
gguf.GGMLQuantizationType.Q2_K : (256, 2 + 2 + QK_K // 16 + QK_K // 4),
gguf.GGMLQuantizationType.Q3_K : (256, 2 + QK_K // 4 + QK_K // 8 + 12),
gguf.GGMLQuantizationType.Q4_K : (256, 2 + 2 + QK_K // 2 + 12),
gguf.GGMLQuantizationType.Q5_K : (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12),
gguf.GGMLQuantizationType.Q6_K : (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16),
gguf.GGMLQuantizationType.Q8_K : (256, 4 + QK_K + QK_K // 8),
}
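These (block size, type size) pairs are what the tensor loader below uses to work out how many bytes a quantized tensor occupies. A quick sketch of that calculation, reusing the table and gguf import defined above, for a hypothetical 4096 x 4096 Q4_0 tensor:

```python
# Sketch: bytes occupied by a hypothetical 4096 x 4096 Q4_0 tensor.
blksize, tysize = GGML_QUANT_SIZES[gguf.GGMLQuantizationType.Q4_0]   # (32, 18)
n_elems = 4096 * 4096
print(n_elems * tysize // blksize)                                   # 9437184 bytes
```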
class Hyperparameters:
def __init__(self):
self.n_vocab = self.n_embd = self.n_mult = self.n_head = self.n_layer = self.n_rot = self.ftype = 0
self.n_ff = 0
def set_n_ff(self, model):
ff_tensor_idx = model.tensor_map.get(b'layers.0.feed_forward.w1.weight')
assert ff_tensor_idx is not None, 'Missing layer 0 FF tensor'
ff_tensor = model.tensors[ff_tensor_idx]
self.n_ff = ff_tensor.dims[1]
def load(self, data, offset):
(
self.n_vocab,
self.n_embd,
self.n_mult,
self.n_head,
self.n_layer,
self.n_rot,
self.ftype,
) = struct.unpack('<7I', data[offset:offset + (4 * 7)])
return 4 * 7
def __str__(self):
return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype}>'
class Vocab:
def __init__(self):
self.items = []
def load(self, data, offset, n_vocab):
orig_offset = offset
for _ in range(n_vocab):
itemlen = struct.unpack('<I', data[offset:offset + 4])[0]
assert itemlen < 4096, 'Absurd vocab item length'
offset += 4
vocab = bytes(data[offset:offset + itemlen])
offset += itemlen
score = struct.unpack('<f', data[offset:offset + 4])[0]
offset += 4
self.items.append((vocab, score))
return offset - orig_offset
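Each vocabulary record parsed above is a little-endian u32 length, the token bytes, then a float32 score. A minimal sketch of one such record:

```python
import struct

# Sketch: one GGML vocab record as read by Vocab.load above.
record = struct.pack('<I', 5) + b'hello' + struct.pack('<f', -3.5)
itemlen = struct.unpack('<I', record[:4])[0]
token = record[4:4 + itemlen]
score = struct.unpack('<f', record[4 + itemlen:8 + itemlen])[0]
print(token, score)   # b'hello' -3.5
```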
class Tensor:
def __init__(self):
self.name = None
self.dims: tuple[int, ...] = ()
self.dtype = None
self.start_offset = 0
self.len_bytes = np.int64(0)
def load(self, data, offset):
orig_offset = offset
(n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12])
assert n_dims >= 0 and n_dims <= 4, f'Invalid tensor dimensions {n_dims}'
assert name_len < 4096, 'Absurd tensor name length'
quant = GGML_QUANT_SIZES.get(dtype)
assert quant is not None, 'Unknown tensor type'
(blksize, tysize) = quant
offset += 12
self.dtype = dtype
self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)])
offset += 4 * n_dims
self.name = bytes(data[offset:offset + name_len])
offset += name_len
pad = ((offset + 31) & ~31) - offset
offset += pad
n_elems = np.prod(self.dims)
n_bytes = np.int64(np.int64(n_elems) * np.int64(tysize)) // np.int64(blksize)
self.start_offset = offset
self.len_bytes = n_bytes
offset += n_bytes
# print(n_dims, name_len, dtype, self.dims, self.name, pad)
return offset - orig_offset
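The `pad` line above rounds the data offset up to the next 32-byte boundary, which is how GGJT aligns tensor data. For example:

```python
# Sketch: the 32-byte alignment used in Tensor.load above.
offset = 1234
pad = ((offset + 31) & ~31) - offset
print(pad, offset + pad)   # 14 1248  (1248 is the next multiple of 32)
```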
class GGMLV3Model:
def __init__(self):
self.hyperparameters = None
self.vocab = None
self.tensor_map = {}
self.tensors = []
def validate_header(self, data, offset):
if bytes(data[offset:offset + 4]) != b'tjgg' or struct.unpack('<I', data[offset + 4:offset + 8])[0] != 3:
raise ValueError('Only GGJTv3 supported')
return 8
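The header check compares against b'tjgg' because the u32 magic 'ggjt' is stored little-endian in the file; a one-line sketch:

```python
import struct
# Sketch: the little-endian GGJT magic as it appears on disk.
print(struct.pack('<I', 0x67676a74))   # b'tjgg'
```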
def load(self, data, offset):
offset += self.validate_header(data, offset)
hp = Hyperparameters()
offset += hp.load(data, offset)
vocab = Vocab()
offset += vocab.load(data, offset, hp.n_vocab)
tensors: list[Tensor] = []
tensor_map = {}
while offset < len(data):
tensor = Tensor()
offset += tensor.load(data, offset)
tensor_map[tensor.name] = len(tensors)
tensors.append(tensor)
self.hyperparameters = hp
self.vocab = vocab
self.tensors = tensors
self.tensor_map = tensor_map
hp.set_n_ff(self)
return offset
class GGMLToGGUF:
def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None, special_vocab = None):
hp = ggml_model.hyperparameters
self.model = ggml_model
self.data = data
self.cfg = cfg
self.params_override = params_override
self.vocab_override = vocab_override
self.special_vocab = special_vocab
if params_override is not None:
n_kv_head = params_override.n_head_kv
else:
if cfg.gqa == 1:
n_kv_head = hp.n_head
else:
gqa = float(cfg.gqa)
n_kv_head = None
for x in range(1, 256):
if float(hp.n_head) / float(x) == gqa:
n_kv_head = x
assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param"
print(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
self.n_kv_head = n_kv_head
self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer)
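When no parameter override is given, n_kv_head is guessed by searching for a divisor of n_head that matches the --gqa factor. A worked sketch with the LLaMA2-70B-style values mentioned in the --gqa help text:

```python
# Sketch of the GQA guess above: n_head = 64 with --gqa 8 yields n_kv_head = 8.
n_head, gqa = 64, 8.0
n_kv_head = next(x for x in range(1, 256) if n_head / x == gqa)
print(n_kv_head)   # 8
```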
def save(self):
print('* Preparing to save GGUF file')
gguf_writer = gguf.GGUFWriter(self.cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
self.add_params(gguf_writer)
self.add_vocab(gguf_writer)
if self.special_vocab is not None:
self.special_vocab.add_to_gguf(gguf_writer)
self.add_tensors(gguf_writer)
print(" gguf: write header")
gguf_writer.write_header_to_file()
print(" gguf: write metadata")
gguf_writer.write_kv_data_to_file()
print(" gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
def add_params(self, gguf_writer):
hp = self.model.hyperparameters
cfg = self.cfg
desc = cfg.desc if cfg.desc is not None else 'converted from legacy GGJTv3 format'
try:
# Filenames aren't necessarily valid UTF8.
name = cfg.name if cfg.name is not None else cfg.input.name
except UnicodeDecodeError:
name = None
print('* Adding model parameters and KV items')
if name is not None:
gguf_writer.add_name(name)
gguf_writer.add_description(desc)
if self.params_override is not None:
po = self.params_override
assert po.n_embd == hp.n_embd, 'Model hyperparams mismatch'
assert po.n_layer == hp.n_layer, 'Model hyperparams mismatch'
assert po.n_head == hp.n_head, 'Model hyperparams mismatch'
gguf_writer.add_context_length (po.n_ctx)
gguf_writer.add_embedding_length (po.n_embd)
gguf_writer.add_block_count (po.n_layer)
gguf_writer.add_feed_forward_length (po.n_ff)
gguf_writer.add_rope_dimension_count(po.n_embd // po.n_head)
gguf_writer.add_head_count (po.n_head)
gguf_writer.add_head_count_kv (po.n_head_kv)
gguf_writer.add_layer_norm_rms_eps (po.f_norm_eps)
return
gguf_writer.add_context_length(cfg.context_length)
gguf_writer.add_embedding_length(hp.n_embd)
gguf_writer.add_block_count(hp.n_layer)
gguf_writer.add_feed_forward_length(hp.n_ff)
gguf_writer.add_rope_dimension_count(hp.n_embd // hp.n_head)
gguf_writer.add_head_count(hp.n_head)
gguf_writer.add_head_count_kv(self.n_kv_head)
gguf_writer.add_layer_norm_rms_eps(float(cfg.eps))
def add_vocab(self, gguf_writer):
hp = self.model.hyperparameters
gguf_writer.add_tokenizer_model('llama')
tokens = []
scores = []
toktypes = []
if self.vocab_override is not None:
vo = self.vocab_override
print('* Adding vocab item(s)')
for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
tokens.append(vbytes)
scores.append(score)
toktypes.append(ttype)
assert len(tokens) == hp.n_vocab, f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
if len(toktypes) > 0:
gguf_writer.add_token_types(toktypes)
return
print(f'* Adding {hp.n_vocab} vocab item(s)')
assert len(self.model.vocab.items) >= 3, 'Cannot handle unexpectedly short model vocab'
for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
tt = 1 # Normal
# Special handling for UNK, BOS, EOS tokens.
if tokid <= 2:
if tokid == 0:
vbytes = b'<unk>'
tt = 2
elif tokid == 1:
vbytes = b'<s>'
tt = 3
else:
vbytes = b'</s>'
tt = 3
elif len(vbytes) == 0:
tt = 3 # Control
elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1:
vbytes = bytes(f'<0x{vbytes[0]:02X}>', encoding = 'UTF-8')
tt = 6 # Byte
else:
vbytes = vbytes.replace(b' ', b'\xe2\x96\x81')
toktypes.append(tt)
tokens.append(vbytes)
scores.append(vscore)
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
gguf_writer.add_unk_token_id(0)
gguf_writer.add_bos_token_id(1)
gguf_writer.add_eos_token_id(2)
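For single-byte entries (token ids 3..258) the loop above rewrites the raw byte as a `<0xXX>` placeholder, and ordinary tokens get their spaces replaced by U+2581. A small sketch of both rewrites:

```python
# Sketch of the two rewrites applied in add_vocab above.
vbytes = b'A'
print(bytes(f'<0x{vbytes[0]:02X}>', encoding='UTF-8'))   # b'<0x41>'  (byte token)
print(b' hello'.replace(b' ', b'\xe2\x96\x81'))          # b'\xe2\x96\x81hello'  (U+2581)
```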
def add_tensors(self, gguf_writer):
tensor_map = self.name_map
data = self.data
print(f'* Adding {len(self.model.tensors)} tensor(s)')
for tensor in self.model.tensors:
name = str(tensor.name, 'UTF-8')
mapped_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
assert mapped_name is not None, f'Bad name {name}'
tempdims = list(tensor.dims[:])
if len(tempdims) > 1:
temp = tempdims[1]
tempdims[1] = tempdims[0]
tempdims[0] = temp
# print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}')
gguf_writer.add_tensor(mapped_name, data[tensor.start_offset:tensor.start_offset + tensor.len_bytes], raw_shape = tempdims, raw_dtype = tensor.dtype)
def handle_metadata(cfg, hp):
import convert
assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory'
hf_config_path = cfg.model_metadata_dir / "config.json"
orig_config_path = cfg.model_metadata_dir / "params.json"
# We pass a fake model here. "original" mode will check the shapes of some
# tensors if information is missing in the .json file: other than that, the
# model data isn't used so this should be safe (at least for now).
fakemodel = {
'tok_embeddings.weight': convert.LazyTensor.__new__(convert.LazyTensor),
'layers.0.feed_forward.w1.weight': convert.LazyTensor.__new__(convert.LazyTensor),
}
fakemodel['tok_embeddings.weight'].shape = [hp.n_vocab]
fakemodel['layers.0.feed_forward.w1.weight'].shape = [hp.n_ff]
if hf_config_path.exists():
params = convert.Params.loadHFTransformerJson(fakemodel, hf_config_path)
elif orig_config_path.exists():
params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
else:
raise ValueError('Unable to load metadata')
vocab = convert.load_vocab(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir, cfg.vocabtype)
# FIXME: Respect cfg.vocab_dir?
svocab = gguf.SpecialVocab(cfg.model_metadata_dir)
convert.check_vocab_size(params, vocab)
return (params, vocab, svocab)
def handle_args():
parser = argparse.ArgumentParser(description = 'Convert GGMLv3 models to GGUF')
parser.add_argument('--input', '-i', type = Path, required = True, help = 'Input GGMLv3 filename')
parser.add_argument('--output', '-o', type = Path, required = True, help ='Output GGUF filename')
parser.add_argument('--name', help = 'Set model name')
parser.add_argument('--desc', help = 'Set model description')
parser.add_argument('--gqa', type = int, default = 1, help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
parser.add_argument('--eps', default = '5.0e-06', help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
parser.add_argument('--context-length', '-c', type=int, default = 2048, help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
parser.add_argument('--model-metadata-dir', '-m', type = Path, help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)", default="spm")
return parser.parse_args()
def main():
cfg = handle_args()
print(f'* Using config: {cfg}')
print('\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n')
data = np.memmap(cfg.input, mode = 'r')
model = GGMLV3Model()
print('* Scanning GGML input file')
offset = model.load(data, 0)
print(f'* GGML model hyperparameters: {model.hyperparameters}')
vocab_override = None
params_override = None
special_vocab = None
if cfg.model_metadata_dir is not None:
(params_override, vocab_override, special_vocab) = handle_metadata(cfg, model.hyperparameters)
print('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
print(f'* Overriding params: {params_override}')
print(f'* Overriding vocab: {vocab_override}')
print(f'* Special vocab: {special_vocab}')
else:
print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
converter = GGMLToGGUF(model, data, cfg, params_override = params_override, vocab_override = vocab_override, special_vocab = special_vocab)
converter.save()
print(f'* Successful completion. Output saved to: {cfg.output}')
if __name__ == '__main__':
main()

View file

@ -1,28 +1,29 @@
#!/usr/bin/env python #!/usr/bin/env python3
from __future__ import annotations
import json import json
import os import os
import re import re
import struct import struct
import sys import sys
from typing import Any, Dict, Sequence, TextIO from typing import Any, BinaryIO, Sequence
import numpy as np
import torch import torch
from convert import DATA_TYPE_TO_FTYPE, NUMPY_TYPE_TO_DATA_TYPE, DataType NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}
HF_SUBLAYER_TO_GGML = { HF_SUBLAYER_TO_GGML = {
"self_attn.q_proj": "attention.wq", "self_attn.q_proj": "attn_q",
"self_attn.k_proj": "attention.wk", "self_attn.k_proj": "attn_k",
"self_attn.v_proj": "attention.wv", "self_attn.v_proj": "attn_v",
"self_attn.o_proj": "attention.wo", "self_attn.o_proj": "attn_output",
"mlp.gate_proj": "feed_forward.w1", "mlp.gate_proj": "ffn_gate",
"mlp.down_proj": "feed_forward.w2", "mlp.down_proj": "ffn_down",
"mlp.up_proj": "feed_forward.w3", "mlp.up_proj": "ffn_up",
"input_layernorm": "attention_norm", "input_layernorm": "attn_norm",
"post_attention_layernorm": "ffn_norm", "post_attention_layernorm": "ffn_norm",
# "norm": "norm",
# "embed_tokens": "tok_embeddings",
# "lm_head": "output",
} }
@ -39,7 +40,7 @@ def translate_tensor_name(t: str) -> str:
sys.exit(1) sys.exit(1)
output_string = ( output_string = (
f"layers.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}" f"blk.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}"
) )
return output_string return output_string
else: else:
@ -47,19 +48,21 @@ def translate_tensor_name(t: str) -> str:
sys.exit(1) sys.exit(1)
def write_file_header(fout: TextIO, params: Dict[str, Any]) -> None: def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
fout.write(b"ggla"[::-1]) # magic (ggml lora) fout.write(b"ggla"[::-1]) # magic (ggml lora)
fout.write(struct.pack("i", 1)) # file version fout.write(struct.pack("i", 1)) # file version
fout.write(struct.pack("i", params["r"])) fout.write(struct.pack("i", params["r"]))
# https://opendelta.readthedocs.io/en/latest/modules/deltas.html says that `lora_alpha` is an int # https://opendelta.readthedocs.io/en/latest/modules/deltas.html says that `lora_alpha` is an int
# but some models ship a float value instead # but some models ship a float value instead
# let's convert to int, but fail if lossless conversion is not possible # let's convert to int, but fail if lossless conversion is not possible
assert int(params["lora_alpha"]) == params["lora_alpha"], "cannot convert float to int losslessly" assert (
int(params["lora_alpha"]) == params["lora_alpha"]
), "cannot convert float to int losslessly"
fout.write(struct.pack("i", int(params["lora_alpha"]))) fout.write(struct.pack("i", int(params["lora_alpha"])))
def write_tensor_header( def write_tensor_header(
self, name: str, shape: Sequence[int], data_type: DataType self, name: str, shape: Sequence[int], data_type: np.dtype[Any]
) -> None: ) -> None:
sname = name.encode("utf-8") sname = name.encode("utf-8")
fout.write( fout.write(
@ -67,7 +70,7 @@ def write_tensor_header(
"iii", "iii",
len(shape), len(shape),
len(sname), len(sname),
DATA_TYPE_TO_FTYPE[NUMPY_TYPE_TO_DATA_TYPE[data_type]], NUMPY_TYPE_TO_FTYPE[data_type.name],
) )
) )
fout.write(struct.pack("i" * len(shape), *shape[::-1])) fout.write(struct.pack("i" * len(shape), *shape[::-1]))

View file

@ -1,13 +0,0 @@
# Compatibility stub
import argparse
import convert
parser = argparse.ArgumentParser(
description="""[DEPRECATED - use `convert.py` instead]
Convert a LLaMA model checkpoint to a ggml compatible file""")
parser.add_argument('dir_model', help='directory containing the model checkpoint')
parser.add_argument('ftype', help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)
args = parser.parse_args()
convert.main(['--outtype', 'f16' if args.ftype == 1 else 'f32', '--', args.dir_model])

1415
convert.py Normal file → Executable file

File diff suppressed because it is too large

View file

@ -3,7 +3,7 @@
## Verifying that the model is running on the GPU with cuBLAS ## Verifying that the model is running on the GPU with cuBLAS
Make sure you compiled llama with the correct env variables according to [this guide](../README.md#cublas), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example: Make sure you compiled llama with the correct env variables according to [this guide](../README.md#cublas), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
```shell ```shell
./main -m "path/to/model.bin" -ngl 200000 -p "Please sir, may I have some " ./main -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
``` ```
When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines: When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
@ -25,9 +25,9 @@ GPU: A6000 (48GB VRAM)
CPU: 7 physical cores CPU: 7 physical cores
RAM: 32GB RAM: 32GB
Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.ggmlv3.q4_0.bin` (30B parameters, 4bit quantization, GGML) Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.q4_0.gguf` (30B parameters, 4bit quantization, GGML)
Run command: `./main -m "path/to/model.bin" -p "-p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]` Run command: `./main -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
Result: Result:

View file

@ -6,27 +6,6 @@ find_package(Threads REQUIRED)
# ... # ...
# common
set(TARGET common)
add_library(${TARGET} OBJECT
common.h
common.cpp
console.h
console.cpp
grammar-parser.h
grammar-parser.cpp
)
if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
target_include_directories(${TARGET} PUBLIC .)
target_compile_features(${TARGET} PUBLIC cxx_std_11)
target_link_libraries(${TARGET} PRIVATE llama)
# examples # examples
include_directories(${CMAKE_CURRENT_SOURCE_DIR}) include_directories(${CMAKE_CURRENT_SOURCE_DIR})
@ -44,8 +23,10 @@ else()
add_subdirectory(train-text-from-scratch) add_subdirectory(train-text-from-scratch)
add_subdirectory(convert-llama2c-to-ggml) add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(simple) add_subdirectory(simple)
add_subdirectory(speculative)
add_subdirectory(embd-input) add_subdirectory(embd-input)
add_subdirectory(llama-bench) add_subdirectory(llama-bench)
add_subdirectory(beam-search)
if (LLAMA_METAL) if (LLAMA_METAL)
add_subdirectory(metal) add_subdirectory(metal)
endif() endif()

View file

@ -1617,15 +1617,10 @@ int main(int argc, char ** argv) {
float error_before_opt = ggml_get_f32_1d(e, 0); float error_before_opt = ggml_get_f32_1d(e, 0);
struct ggml_opt_params opt_params_adam = ggml_opt_default_params(GGML_OPT_ADAM);
struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS); struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS);
opt_params_adam.print_forward_graph = false;
opt_params_adam.print_backward_graph = false;
opt_params_lbfgs.print_forward_graph = false; opt_params_lbfgs.print_forward_graph = false;
opt_params_lbfgs.print_backward_graph = false; opt_params_lbfgs.print_backward_graph = false;
opt_params_adam.adam.n_iter = 16;
opt_params_lbfgs.lbfgs.n_iter = 16; opt_params_lbfgs.lbfgs.n_iter = 16;
// ggml_opt(ctx0, opt_params_adam, e);
ggml_opt(ctx0, opt_params_lbfgs, e); ggml_opt(ctx0, opt_params_lbfgs, e);
// //
ggml_build_forward_expand(&gf, e); ggml_build_forward_expand(&gf, e);

View file

@ -0,0 +1,8 @@
set(TARGET beam-search)
add_executable(${TARGET} beam-search.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View file

@ -0,0 +1,190 @@
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include "common.h"
#include "llama.h"
#include "build-info.h"
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
# define NOMINMAX
#endif
#include <windows.h>
#include <signal.h>
#endif
// Used for debugging to print out beam tokens.
struct ostream_beam_view {
llama_context * ctx;
llama_beam_view beam_view;
};
std::ostream& operator<<(std::ostream& os, const ostream_beam_view & obv) {
os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);
}
return os << ')';
}
// Put here anything you want back in beam_search_callback().
struct beam_search_callback_data {
llama_context * ctx;
std::vector<llama_token> response;
};
// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
// For example, eob can be flagged due to maximum token length, stop words, etc.
bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, const size_t n_tokens) {
return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx);
}
// Function matching type llama_beam_search_callback_fn_t.
// Custom callback example is called each time the beams lengths increase:
// * Show progress by printing ',' following by number of convergent beam tokens if any.
// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
// This is also called when the stop condition is met.
// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr);
// Mark beams as EOS as needed.
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
llama_beam_view& beam_view = beams_state.beam_views[i];
if (!beam_view.eob && is_at_eob(callback_data, beam_view.tokens, beam_view.n_tokens)) {
beam_view.eob = true;
}
}
printf(","); // Show progress
if (const size_t n = beams_state.common_prefix_length) {
callback_data.response.resize(callback_data.response.size() + n);
assert(0u < beams_state.n_beams);
const llama_token * tokens = beams_state.beam_views[0].tokens;
std::copy(tokens, tokens + n, callback_data.response.end() - n);
printf("%zu", n);
}
fflush(stdout);
#if 1 // DEBUG: print current beams for this iteration
std::cout << "\n\nCurrent beams (last_call=" << beams_state.last_call << "):\n";
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
std::cout << "beams["<<i<<"]: " << ostream_beam_view{callback_data.ctx,beams_state.beam_views[i]} << std::endl;
}
#endif
}
int main(int argc, char ** argv)
{
gpt_params params;
//params.n_gpu_layers = 200;
//---------------------------------
// Print help :
//---------------------------------
if ( argc < 2 || argv[1][0] == '-' )
{
printf( "Usage: %s MODEL_PATH [BEAM_WIDTH=2] [PROMPT]\n" , argv[0] );
return 1 ;
}
//---------------------------------
// Load parameters :
//---------------------------------
params.model = argv[1];
params.n_beams = 2 < argc ? std::stoi(argv[2]) : 2;
if ( argc > 3 )
{
params.prompt = argv[3];
}
if ( params.prompt.empty() )
{
params.prompt = "### Request:\nHow many countries are there?\n\n### Response:\n";
}
//---------------------------------
// Init LLM :
//---------------------------------
llama_backend_init(params.numa);
llama_model * model;
llama_context * ctx;
std::tie(model, ctx) = llama_init_from_gpt_params( params );
if ( model == NULL )
{
fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
return 1;
}
//---------------------------------
// Tokenize the prompt :
//---------------------------------
std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);
const size_t max_context_size = llama_n_ctx( ctx );
const size_t max_tokens_list_size = max_context_size - 4 ;
if (tokens_list.size() > max_tokens_list_size)
{
fprintf( stderr , "%s: error: prompt too long (%zu tokens, max %zu)\n" ,
__func__ , tokens_list.size() , max_tokens_list_size );
return 1;
}
fprintf( stderr, "\n\n" );
// Print the tokens from the prompt :
for( auto id : tokens_list )
{
std::cout << llama_token_to_piece(ctx, id);
}
std::cout << std::flush;
int n_past = llama_get_kv_cache_token_count(ctx);
if (llama_eval(ctx, tokens_list.data(), tokens_list.size(), n_past, params.n_threads))
{
fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
return 1;
}
n_past += tokens_list.size();
beam_search_callback_data callback_data{ctx, {}};
size_t const beam_width = static_cast<size_t>(params.n_beams);
int const n_predict = 256;
llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict, params.n_threads);
std::cout << "\n\n";
for (llama_token const token_id : callback_data.response) {
std::cout << llama_token_to_piece(ctx,token_id);
}
std::cout << std::endl;
llama_free( ctx );
llama_free_model( model );
llama_backend_free();
return 0;
}

View file

@ -11,6 +11,6 @@ cd ..
# #
# "--keep 48" is based on the contents of prompts/chat-with-bob.txt # "--keep 48" is based on the contents of prompts/chat-with-bob.txt
# #
./main -m ./models/7B/ggml-model-q4_0.bin -c 512 -b 1024 -n 256 --keep 48 \ ./main -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
--repeat_penalty 1.0 --color -i \ --repeat_penalty 1.0 --color -i \
-r "User:" -f prompts/chat-with-bob.txt -r "User:" -f prompts/chat-with-bob.txt

View file

@ -12,15 +12,15 @@ usage: ./convert-llama2c-to-ggml [options]
options: options:
-h, --help show this help message and exit -h, --help show this help message and exit
--copy-vocab-from-model FNAME model path from which to copy vocab (default 'models/ggml-vocab.bin') --copy-vocab-from-model FNAME path of gguf llama model or llama2.c vocabulary from which to copy vocab (default 'models/7B/ggml-model-f16.gguf')
--llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model --llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model
--llama2c-output-model FNAME model path to save the converted llama2.c model (default ak_llama_model.bin') --llama2c-output-model FNAME model path to save the converted llama2.c model (default ak_llama_model.bin')
``` ```
An example command is as follows: An example command using a model from [karpathy/tinyllamas](https://huggingface.co/karpathy/tinyllamas) is as follows:
`$ ./convert-llama2c-to-ggml --copy-vocab-from-model <ggml-vocab.bin> --llama2c-model <llama2.c model path> --llama2c-output-model <ggml output model path>` `$ ./convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin`
Now you can use the model with command like: Now you can use the model with a command like:
`$ ./main -m <ggml output model path> -p "One day, Lily met a Shoggoth" -n 500 -c 256 -eps 1e-5` `$ ./main -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`

View file

@ -1,5 +1,6 @@
#include "ggml.h" #include "ggml.h"
#include "llama.h" #include "llama.h"
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
#include <cassert> #include <cassert>
@ -9,13 +10,60 @@
#include <ctime> #include <ctime>
#include <random> #include <random>
#include <stdexcept> #include <stdexcept>
#include <sstream>
#include <algorithm> #include <algorithm>
#include <string> #include <string>
// GGUF keys & tensor names.
#define KV_GENERAL_ARCHITECTURE "general.architecture"
#define KV_GENERAL_NAME "general.name"
#define KV_TOKENIZER_MODEL "tokenizer.ggml.model"
#define KV_TOKENIZER_LIST "tokenizer.ggml.tokens"
#define KV_TOKENIZER_TOKEN_TYPE "tokenizer.ggml.token_type"
#define KV_TOKENIZER_SCORES "tokenizer.ggml.scores"
#define KV_TOKENIZER_BOS_ID "tokenizer.ggml.bos_token_id"
#define KV_TOKENIZER_EOS_ID "tokenizer.ggml.eos_token_id"
#define KV_TOKENIZER_UNK_ID "tokenizer.ggml.unknown_token_id"
#define KV_TOKENIZER_SEP_ID "tokenizer.ggml.seperator_token_id"
#define KV_TOKENIZER_PAD_ID "tokenizer.ggml.padding_token_id"
#define KV_TOKENIZER_HF_JSON "tokenizer.huggingface.json"
#define KV_CONTEXT_LENGTH "llama.context_length"
#define KV_EMBEDDING_LENGTH "llama.embedding_length"
#define KV_BLOCK_COUNT "llama.block_count"
#define KV_FEED_FORWARD_LENGTH "llama.feed_forward_length"
#define KV_ATTENTION_HEAD_COUNT "llama.attention.head_count"
#define KV_ATTENTION_HEAD_COUNT_KV "llama.attention.head_count_kv"
#define KV_ATTENTION_LAYERNORM_RMS_EPS "llama.attention.layer_norm_rms_epsilon"
#define KV_ROPE_DIMENSION_COUNT "llama.rope.dimension_count"
#define TN_TOKEN_EMBD "token_embd.weight"
#define TN_OUTPUT_NORM "output_norm.weight"
#define TN_OUTPUT "output.weight"
#define TN_ATTN_NORM "blk.%d.attn_norm.weight"
#define TN_ATTN_Q "blk.%d.attn_q.weight"
#define TN_ATTN_K "blk.%d.attn_k.weight"
#define TN_ATTN_V "blk.%d.attn_v.weight"
#define TN_ATTN_OUTPUT "blk.%d.attn_output.weight"
#define TN_FFN_NORM "blk.%d.ffn_norm.weight"
#define TN_FFN_GATE "blk.%d.ffn_gate.weight"
#define TN_FFN_DOWN "blk.%d.ffn_down.weight"
#define TN_FFN_UP "blk.%d.ffn_up.weight"
#if defined(_MSC_VER) #if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data #pragma warning(disable: 4244 4267) // possible loss of data
#endif #endif
#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
#define LLAMA_FILE_VERSION_GGJT_V3 3
#define TOKENIZER_NAME "llama"
#define UNKNOWN_TOKEN_ID 0
#define BOS_TOKEN_ID 1
#define EOS_TOKEN_ID 2
//////////////////////////////////////// llama2.c model structs and functions to load models, alloc memory etc. //////////////////////////////////////// llama2.c model structs and functions to load models, alloc memory etc.
typedef struct { typedef struct {
int dim; // transformer dimension int dim; // transformer dimension
@ -27,7 +75,7 @@ typedef struct {
int seq_len; // max sequence length int seq_len; // max sequence length
} Config; } Config;
typedef struct { struct TransformerWeights {
// token embedding table // token embedding table
float* token_embedding_table; // (vocab_size, dim) float* token_embedding_table; // (vocab_size, dim)
// weights for rmsnorms // weights for rmsnorms
@ -48,10 +96,25 @@ typedef struct {
// float* freq_cis_real; // (seq_len, dim/2) // float* freq_cis_real; // (seq_len, dim/2)
// float* freq_cis_imag; // (seq_len, dim/2) // float* freq_cis_imag; // (seq_len, dim/2)
// (optional) classifier weights for the logits, on the last layer // (optional) classifier weights for the logits, on the last layer
//float* wcls; float* wcls;
} TransformerWeights;
void malloc_weights(TransformerWeights* w, Config* p) { ~TransformerWeights() {
delete[] token_embedding_table;
delete[] rms_att_weight;
delete[] rms_ffn_weight;
delete[] wq;
delete[] wk;
delete[] wv;
delete[] wo;
delete[] w1;
delete[] w2;
delete[] w3;
delete[] rms_final_weight;
delete[] wcls;
}
};
void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
// we calloc instead of malloc to keep valgrind happy // we calloc instead of malloc to keep valgrind happy
w->token_embedding_table = new float[p->vocab_size * p->dim](); w->token_embedding_table = new float[p->vocab_size * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
@ -85,9 +148,16 @@ void malloc_weights(TransformerWeights* w, Config* p) {
w->rms_final_weight = new float[p->dim](); w->rms_final_weight = new float[p->dim]();
printf("[%s:AK] Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim); printf("[%s:AK] Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
if (shared_weights) {
w->wcls = NULL;
} else {
w->wcls = new float[p->vocab_size * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
}
} }
int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f) { int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) {
if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1; if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1; if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1; if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
@ -99,21 +169,23 @@ int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f) {
if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->hidden_dim * p->dim)) return 1; if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->hidden_dim * p->dim)) return 1;
if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1; if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != static_cast<size_t>(p->dim)) return 1; if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != static_cast<size_t>(p->dim)) return 1;
return 0;
}
void free_weights(TransformerWeights* w) { // Skip freq_cis_real & freq_cis_imag
delete w->token_embedding_table; int head_size = p->dim / p->n_heads;
delete w->rms_att_weight; fseek(f, p->seq_len * head_size * sizeof(float), SEEK_CUR);
delete w->rms_ffn_weight;
delete w->wq; if (!shared_weights && fread(w->wcls, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
delete w->wk;
delete w->wv; // Check we didn't forget to read anything
delete w->wo; auto curr = ftell(f);
delete w->w1; fseek(f, 0, SEEK_END);
delete w->w2; auto end = ftell(f);
delete w->w3; if (curr != end) {
delete w->rms_final_weight; printf("Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", curr, end);
return 1;
}
return 0;
} }
void print_sample_weights(TransformerWeights *w){ void print_sample_weights(TransformerWeights *w){
@ -130,6 +202,7 @@ void print_sample_weights(TransformerWeights *w){
printf("%f\n", w->w2[0]); printf("%f\n", w->w2[0]);
printf("%f\n", w->w3[0]); printf("%f\n", w->w3[0]);
printf("%f\n", w->rms_att_weight[0]); printf("%f\n", w->rms_att_weight[0]);
if (w->wcls) printf("%f\n", w->wcls[0]);
} }
//////////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -138,20 +211,23 @@ void print_sample_weights(TransformerWeights *w){
struct llama_vocab { struct llama_vocab {
using id = int32_t; using id = int32_t;
using token = std::string; using token = std::string;
using ttype = llama_token_type;
struct token_score { struct token_data {
token tok; token text;
float score; float score;
ttype type;
}; };
std::unordered_map<token, id> token_to_id; std::unordered_map<token, id> token_to_id;
std::vector<token_score> id_to_token; std::vector<token_data> id_to_token;
}; };
struct my_llama_hparams { struct my_llama_hparams {
uint32_t n_vocab = 32000; uint32_t n_vocab = 32000;
uint32_t n_ctx = 512; // this is provided as user input? uint32_t n_ctx = 512; // this is provided as user input?
uint32_t n_embd = 4096; uint32_t n_embd = 4096;
uint32_t n_ff = 11008;
uint32_t n_mult = 4; uint32_t n_mult = 4;
uint32_t n_head = 32; uint32_t n_head = 32;
uint32_t n_layer = 32; uint32_t n_layer = 32;
@ -183,6 +259,8 @@ struct my_llama_layer {
struct my_llama_model { struct my_llama_model {
struct ggml_context * ctx = NULL; struct ggml_context * ctx = NULL;
std::string name;
my_llama_hparams hparams; my_llama_hparams hparams;
struct ggml_tensor * tok_embeddings; struct ggml_tensor * tok_embeddings;
@ -245,18 +323,13 @@ struct train_params {
int mem_compute1_gb; int mem_compute1_gb;
}; };
uint32_t get_n_ff(const struct my_llama_hparams* hparams) {
const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult;
return n_ff;
}
void print_params(struct my_llama_hparams * params) { void print_params(struct my_llama_hparams * params) {
printf("%s: n_vocab: %d\n", __func__, params->n_vocab); printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
printf("%s: n_ctx: %d\n", __func__, params->n_ctx); printf("%s: n_ctx: %d\n", __func__, params->n_ctx);
printf("%s: n_embd: %d\n", __func__, params->n_embd); printf("%s: n_embd: %d\n", __func__, params->n_embd);
printf("%s: n_mult: %d\n", __func__, params->n_mult); printf("%s: n_mult: %d\n", __func__, params->n_mult);
printf("%s: n_head: %d\n", __func__, params->n_head); printf("%s: n_head: %d\n", __func__, params->n_head);
printf("%s: n_ff: %d\n", __func__, get_n_ff(params)); printf("%s: n_ff: %d\n", __func__, params->n_ff);
printf("%s: n_layer: %d\n", __func__, params->n_layer); printf("%s: n_layer: %d\n", __func__, params->n_layer);
printf("%s: n_rot: %d\n", __func__, params->n_rot); printf("%s: n_rot: %d\n", __func__, params->n_rot);
} }
@ -268,7 +341,7 @@ void init_model(struct my_llama_model * model) {
const uint32_t n_layer = hparams.n_layer; const uint32_t n_layer = hparams.n_layer;
const uint32_t n_vocab = hparams.n_vocab; const uint32_t n_vocab = hparams.n_vocab;
const uint32_t n_ff = get_n_ff(&hparams); const uint32_t n_ff = hparams.n_ff;
struct ggml_context * ctx = model->ctx; struct ggml_context * ctx = model->ctx;
model->train_its = 0; model->train_its = 0;
@ -450,21 +523,6 @@ struct llama_file {
return std::string(chars.data(), len); return std::string(chars.data(), len);
} }
void write_raw(const void * ptr, size_t size) {
if (size == 0) {
return;
}
errno = 0;
size_t ret = std::fwrite(ptr, size, 1, fp);
if (ret != 1) {
throw std::runtime_error(format("write error: %s", strerror(errno)));
}
}
void write_u32(std::uint32_t val) {
write_raw(&val, sizeof(val));
}
~llama_file() { ~llama_file() {
if (fp) { if (fp) {
std::fclose(fp); std::fclose(fp);
@ -472,84 +530,114 @@ struct llama_file {
} }
}; };
void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) {
if (tensor == NULL) {
file->write_u32(0);
file->write_u32(0);
file->write_u32(GGML_TYPE_F32);
file->seek((0-file->tell()) & 31, SEEK_CUR);
return;
}
const char * name = ggml_get_name(tensor);
uint32_t name_len = strlen(name);
uint32_t nd = tensor->n_dims;
uint32_t ne[4] = { (uint32_t)tensor->ne[0],
(uint32_t)tensor->ne[1],
(uint32_t)tensor->ne[2],
(uint32_t)tensor->ne[3] };
file->write_u32(nd);
file->write_u32(name_len);
file->write_u32(tensor->type);
file->write_raw(ne, sizeof(ne[0]) * nd);
file->write_raw(name, name_len);
file->seek((0-file->tell()) & 31, SEEK_CUR);
file->write_raw(tensor->data, ggml_nbytes(tensor));
}
bool is_ggml_file(const char *filename) { bool is_ggml_file(const char *filename) {
llama_file file(filename, "rb"); llama_file file(filename, "rb");
if (file.size < 4) { if (file.size < 4) {
return false; return false;
} }
uint32_t magic = file.read_u32(); uint32_t magic = file.read_u32();
return magic == LLAMA_FILE_MAGIC; return magic == GGUF_MAGIC;
}
static std::string llama_escape_whitespaces(const std::string& text) {
std::ostringstream out;
for (char c : text) {
if (c == ' ') out << "\xe2\x96\x81";
else out << c;
}
return out.str();
} }
void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) { void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
// heuristic to infer whether vocab is from ggml or from llama2.c vocabulary
if (is_ggml_file(filename)) { if (is_ggml_file(filename)) {
struct ggml_context * ctx_data = NULL;
struct llama_context_params llama_params = llama_context_default_params(); struct gguf_init_params params = {
llama_params.vocab_only = true; /*.no_alloc = */ false,
/*.ctx = */ &ctx_data,
};
struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params); struct gguf_context * ctx = gguf_init_from_file(filename, params);
struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params); GGML_ASSERT(ctx != NULL);
const int model_idx = gguf_find_key(ctx, KV_TOKENIZER_MODEL);
GGML_ASSERT(model_idx >= 0);
std::string tokenizer_name = gguf_get_val_str(ctx, model_idx);
GGML_ASSERT(tokenizer_name == TOKENIZER_NAME);
const int token_idx = gguf_find_key(ctx, KV_TOKENIZER_LIST);
GGML_ASSERT(token_idx >= 0);
const int score_idx = gguf_find_key(ctx, KV_TOKENIZER_SCORES);
GGML_ASSERT(score_idx >= 0);
const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
const int toktype_idx = gguf_find_key(ctx, KV_TOKENIZER_TOKEN_TYPE);
GGML_ASSERT(toktype_idx >= 0);
const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
std::vector<const char *> strings;
std::vector<float> scores;
int n_vocab = llama_n_vocab(lctx);
strings.resize(n_vocab, NULL);
scores.resize(n_vocab, 0);
n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab);
GGML_ASSERT(n_vocab == llama_n_vocab(lctx));
vocab->id_to_token.resize(n_vocab); vocab->id_to_token.resize(n_vocab);
for (int i=0; i<n_vocab; ++i) {
std::string tok = std::string(strings[i]); for (uint32_t i = 0; i < n_vocab; i++) {
float score = scores[i]; std::string word = gguf_get_arr_str(ctx, token_idx, i);
vocab->id_to_token[i].tok = tok;
vocab->id_to_token[i].score = score; vocab->token_to_id[word] = i;
vocab->token_to_id.emplace(tok, i);
auto & token_data = vocab->id_to_token[i];
token_data.text = std::move(word);
token_data.score = scores[i];
token_data.type = (llama_token_type) toktypes[i];
} }
llama_free(lctx); ggml_free(ctx_data);
llama_free_model(lmodel); gguf_free(ctx);
} else { // assume llama2.c vocabulary } else {
printf("Assuming llama2.c vocabulary since %s is not a ggml file\n", filename); // assume llama2.c vocabulary
printf("Assuming llama2.c vocabulary since %s is not a gguf file\n", filename);
llama_file file(filename, "rb"); llama_file file(filename, "rb");
uint32_t n_vocab = config->vocab_size; if (!file.fp) {
fprintf(stderr, "error: %s: %s\n", strerror(errno), filename);
exit(1);
}
const int n_vocab = config->vocab_size;
/* uint32_t max_token_length = */ file.read_u32(); // unused /* uint32_t max_token_length = */ file.read_u32(); // unused
vocab->id_to_token.resize(n_vocab); vocab->id_to_token.resize(n_vocab);
for (uint32_t i=0; i<n_vocab; ++i) { for (llama_vocab::id id=0; id<n_vocab; ++id) {
float_t score = file.read_f32(); float_t score = file.read_f32();
uint32_t len = file.read_u32(); uint32_t len = file.read_u32();
std::string tok = file.read_string(len); std::string text = file.read_string(len);
vocab->id_to_token[i].tok = tok;
vocab->id_to_token[i].score = score; unsigned char byte_val;
vocab->token_to_id.emplace(tok, i); llama_vocab::ttype type = LLAMA_TOKEN_TYPE_NORMAL;
if (id == UNKNOWN_TOKEN_ID) {
text = "<unk>";
type = LLAMA_TOKEN_TYPE_UNKNOWN;
} else if (id == BOS_TOKEN_ID) {
text = "<s>";
type = LLAMA_TOKEN_TYPE_CONTROL;
} else if (id == EOS_TOKEN_ID) {
text = "</s>";
type = LLAMA_TOKEN_TYPE_CONTROL;
} else if (text.empty()) {
type = LLAMA_TOKEN_TYPE_CONTROL;
} else if (sscanf(text.c_str(), "<0x%02hhX>", &byte_val) == 1) {
// Text of byte tokens is already in the expected format.
type = LLAMA_TOKEN_TYPE_BYTE;
} else {
type = LLAMA_TOKEN_TYPE_NORMAL;
}
text = llama_escape_whitespaces(text);
vocab->id_to_token[id].text = text;
vocab->id_to_token[id].score = score;
vocab->id_to_token[id].type = type;
vocab->token_to_id.emplace(text, id);
} }
} }
} }
void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, float * karpathy_weights){ void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
int ct; int ct;
switch (gg_weights->n_dims){ switch (gg_weights->n_dims){
case 1: case 1:
@ -586,84 +674,120 @@ void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, float * kar
} }
 void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename) {
-    struct llama_file file(filename, "wb");
-    if (file.fp == NULL) {
-        return;
-    }
-    // write_magic
-    file.write_u32(LLAMA_FILE_MAGIC);   // magic
-    file.write_u32(LLAMA_FILE_VERSION); // version
-    // write_hparams
-    file.write_u32(model->hparams.n_vocab);
-    file.write_u32(model->hparams.n_embd);
-    file.write_u32(model->hparams.n_mult);
-    file.write_u32(model->hparams.n_head);
-    file.write_u32(model->hparams.n_layer);
-    file.write_u32(model->hparams.n_rot);
-    file.write_u32(LLAMA_FTYPE_ALL_F32);
-    // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
-    uint32_t n_vocab = model->hparams.n_vocab;
-    for (uint32_t i = 0; i < n_vocab; i++) {
-        const auto & token_score = vocab->id_to_token.at(i);
-        file.write_u32((uint32_t) token_score.tok.size());
-        file.write_raw(token_score.tok.data(), token_score.tok.size());
-        file.write_raw(&token_score.score, sizeof(token_score.score));
-    }
-    // stuff AK weights into GG weights one by one.
+    // convert AK weights into GG weights one by one.
     // w->token_embedding_table -> model->tok_embeddings
     // float* -> struct ggml_tensor
-    stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
-    stuff_karpathy_weights_into_gg(model->output, w->token_embedding_table);
-    stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
+    convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table);
+    convert_weights_ak_to_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table);
+    convert_weights_ak_to_gg(model->norm, w->rms_final_weight);
     //print_row(model->norm, 0);
     // for rms-att-weight
     int row_length = model->hparams.n_embd;
-    const auto & hparams = model->hparams;
-    //int n_ff = model->hparams.n_embd;
-    int n_ff = get_n_ff(&hparams);
+    int n_ff = model->hparams.n_ff;
     for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
         auto & layer = model->layers[i];
         // 1d
-        stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
-        stuff_karpathy_weights_into_gg(layer.ffn_norm      , &w->rms_ffn_weight[i*row_length]);
+        convert_weights_ak_to_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
+        convert_weights_ak_to_gg(layer.ffn_norm      , &w->rms_ffn_weight[i*row_length]);
         // from 3d matrix layer x dim x dim to 2d matrix dim x dim
-        stuff_karpathy_weights_into_gg(layer.wq            , &w->wq[i*row_length*row_length]);
-        stuff_karpathy_weights_into_gg(layer.wk            , &w->wk[i*row_length*row_length]);
-        stuff_karpathy_weights_into_gg(layer.wv            , &w->wv[i*row_length*row_length]);
-        stuff_karpathy_weights_into_gg(layer.wo            , &w->wo[i*row_length*row_length]);
-        stuff_karpathy_weights_into_gg(layer.w1            , &w->w1[i*row_length*n_ff]);
-        stuff_karpathy_weights_into_gg(layer.w2            , &w->w2[i*n_ff*row_length]);
-        stuff_karpathy_weights_into_gg(layer.w3            , &w->w3[i*row_length*n_ff]);
+        convert_weights_ak_to_gg(layer.wq            , &w->wq[i*row_length*row_length]);
+        convert_weights_ak_to_gg(layer.wk            , &w->wk[i*row_length*row_length]);
+        convert_weights_ak_to_gg(layer.wv            , &w->wv[i*row_length*row_length]);
+        convert_weights_ak_to_gg(layer.wo            , &w->wo[i*row_length*row_length]);
+        convert_weights_ak_to_gg(layer.w1            , &w->w1[i*row_length*n_ff]);
+        convert_weights_ak_to_gg(layer.w2            , &w->w2[i*n_ff*row_length]);
+        convert_weights_ak_to_gg(layer.w3            , &w->w3[i*row_length*n_ff]);
     }
struct gguf_context * ctx = gguf_init_empty();
std::vector<const char*> tokens;
std::vector<float> scores;
std::vector<llama_token_type> token_types;
for (const llama_vocab::token_data & token_data : vocab->id_to_token) {
tokens.push_back(token_data.text.c_str());
scores.push_back(token_data.score);
token_types.push_back(token_data.type);
}
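    // note: the three arrays above are index-aligned, so token id i maps to tokens[i], scores[i] and token_types[i] in the arrays written below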
gguf_set_arr_str(ctx, KV_TOKENIZER_LIST, tokens.data(), tokens.size());
gguf_set_arr_data(ctx, KV_TOKENIZER_SCORES, GGUF_TYPE_FLOAT32, scores.data(), scores.size());
gguf_set_arr_data(ctx, KV_TOKENIZER_TOKEN_TYPE, GGUF_TYPE_INT32, token_types.data(), token_types.size());
gguf_set_val_str(ctx, KV_TOKENIZER_MODEL, TOKENIZER_NAME);
gguf_set_val_str(ctx, KV_GENERAL_ARCHITECTURE, "llama");
gguf_set_val_str(ctx, KV_GENERAL_NAME, "llama");
// special tokens
gguf_set_val_u32(ctx, KV_TOKENIZER_UNK_ID, UNKNOWN_TOKEN_ID);
gguf_set_val_u32(ctx, KV_TOKENIZER_BOS_ID, BOS_TOKEN_ID);
gguf_set_val_u32(ctx, KV_TOKENIZER_EOS_ID, EOS_TOKEN_ID);
gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, -1);
gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, -1);
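    // SEP and PAD are not defined by this vocabulary, so they are written as -1 ("not set")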
gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.n_ctx);
gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd);
gguf_set_val_u32(ctx, KV_FEED_FORWARD_LENGTH, model->hparams.n_ff);
gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head);
// n_head_kv is optional, default to n_head
// gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, ...);
gguf_set_val_u32(ctx, KV_BLOCK_COUNT, model->hparams.n_layer);
gguf_set_val_u32(ctx, KV_ROPE_DIMENSION_COUNT, model->hparams.n_rot);
gguf_set_val_f32(ctx, KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f);
     // write tensors
-    write_tensor(&file, model->tok_embeddings);
-    write_tensor(&file, model->norm);
-    write_tensor(&file, model->output); // ?
+    ggml_set_name(model->tok_embeddings, TN_TOKEN_EMBD);
+    gguf_add_tensor(ctx, model->tok_embeddings);
+
+    ggml_set_name(model->norm, TN_OUTPUT_NORM);
+    gguf_add_tensor(ctx, model->norm);
+
+    ggml_set_name(model->output, TN_OUTPUT);
+    gguf_add_tensor(ctx, model->output);
+
     for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
         auto & layer = model->layers[i];

-        write_tensor(&file, layer.attention_norm);
-        write_tensor(&file, layer.wq);
-        write_tensor(&file, layer.wk);
-        write_tensor(&file, layer.wv);
-        write_tensor(&file, layer.wo);
-        write_tensor(&file, layer.ffn_norm);
-        write_tensor(&file, layer.w1);
-        write_tensor(&file, layer.w2);
-        write_tensor(&file, layer.w3);
+        ggml_format_name(layer.wq, TN_ATTN_Q, i);
+        gguf_add_tensor(ctx, layer.wq);
+
+        ggml_format_name(layer.wk, TN_ATTN_K, i);
+        gguf_add_tensor(ctx, layer.wk);
+
+        ggml_format_name(layer.wv, TN_ATTN_V, i);
+        gguf_add_tensor(ctx, layer.wv);
+
+        ggml_format_name(layer.wo, TN_ATTN_OUTPUT, i);
+        gguf_add_tensor(ctx, layer.wo);
+
+        ggml_format_name(layer.attention_norm, TN_ATTN_NORM, i);
+        gguf_add_tensor(ctx, layer.attention_norm);
+
+        ggml_format_name(layer.w1, TN_FFN_GATE, i);
+        gguf_add_tensor(ctx, layer.w1);
+
+        ggml_format_name(layer.w2, TN_FFN_DOWN, i);
+        gguf_add_tensor(ctx, layer.w2);
+
+        ggml_format_name(layer.w3, TN_FFN_UP, i);
+        gguf_add_tensor(ctx, layer.w3);
+
+        ggml_format_name(layer.ffn_norm, TN_FFN_NORM, i);
+        gguf_add_tensor(ctx, layer.ffn_norm);
     }
gguf_write_to_file(ctx, filename, false);
gguf_free(ctx);
}
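As a side note for readers new to the GGUF key/value API used above: the metadata written by `save_as_llama_model` can be read back with the corresponding `gguf_get_*` calls (the same API the gguf example below uses). A rough sketch, assuming the literal key strings match the `KV_*` constants above:

```cpp
#include "ggml.h"
#include <cstdio>

// Open a converted model and print two of the keys written above.
static void inspect_converted_model(const char * fname) {
    struct gguf_init_params params = {
        /*.no_alloc = */ true,  // metadata only, skip tensor data
        /*.ctx      = */ NULL,
    };
    struct gguf_context * ctx = gguf_init_from_file(fname, params);
    if (ctx == NULL) {
        fprintf(stderr, "failed to open %s\n", fname);
        return;
    }
    const int idx_arch = gguf_find_key(ctx, "general.architecture");
    if (idx_arch != -1) {
        printf("architecture:   %s\n", gguf_get_val_str(ctx, idx_arch));
    }
    const int idx_n_ctx = gguf_find_key(ctx, "llama.context_length");
    if (idx_n_ctx != -1) {
        printf("context length: %u\n", gguf_get_val_u32(ctx, idx_n_ctx));
    }
    gguf_free(ctx);
}
```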
 struct train_params get_default_train_params() {
     struct train_params params;
-    params.fn_vocab_model = "models/ggml-vocab.bin";
+    params.fn_vocab_model = "models/7B/ggml-model-f16.gguf";
     params.fn_llama2c_output_model = "ak_llama_model.bin";
     params.fn_train_data = "shakespeare.txt";
     params.fn_checkpoint_in = "checkpoint.bin";
@@ -716,7 +840,7 @@ void print_usage(int /*argc*/, char ** argv, const struct train_params * params)
     fprintf(stderr, "\n");
     fprintf(stderr, "options:\n");
     fprintf(stderr, "  -h, --help                       show this help message and exit\n");
-    fprintf(stderr, "  --copy-vocab-from-model FNAME    llama2.c vocabulary or ggml model path from which to copy vocab (default '%s')\n", params->fn_vocab_model);
+    fprintf(stderr, "  --copy-vocab-from-model FNAME    path of gguf llama model or llama2.c vocabulary from which to copy vocab (default '%s')\n", params->fn_vocab_model);
     fprintf(stderr, "  --llama2c-model FNAME            [REQUIRED] model path from which to load Karpathy's llama2.c model\n");
     fprintf(stderr, "  --llama2c-output-model FNAME     model path to save the converted llama2.c model (default %s')\n", params->fn_llama2c_output_model);
     fprintf(stderr, "\n");
@@ -777,21 +901,32 @@ bool params_parse(int argc, char ** argv, struct train_params * params) {
     return true;
 }
std::string basename(const std::string &path) {
size_t pos = path.find_last_of("/\\");
if (pos == std::string::npos) {
return path;
}
return path.substr(pos + 1);
}
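// note: used in main() below to derive a short model name from the llama2.c checkpoint path,
// e.g. a path like "out/stories15M.bin" (illustrative) becomes "stories15M.bin" and is stored in model.name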
 int main(int argc, char ** argv) {
     struct train_params params = get_default_train_params();
     if (!params_parse(argc, argv, &params)) {
         return 1;
     }
     Config config;
-    TransformerWeights weights;
+    TransformerWeights weights = {};
     {
         FILE *file = fopen(params.fn_llama2c_model, "rb");
         if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; }
         // read in the config header
         if(fread(&config, sizeof(Config), 1, file) != 1) { return 1; }
+        auto shared_weights = config.vocab_size > 0;
+        config.vocab_size = abs(config.vocab_size);
         // read in the Transformer weights
-        malloc_weights(&weights, &config);
-        if(checkpoint_init_weights(&weights, &config, file)) { return 1; }
+        malloc_weights(&weights, &config, shared_weights);
+        if(checkpoint_init_weights(&weights, &config, file, shared_weights)) { return 1; }
         fclose(file);
     }
@@ -802,6 +937,7 @@ int main(int argc, char ** argv) {
     model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx);
     model.hparams.n_ctx   = params.n_ctx;
     model.hparams.n_embd  = config.dim; //params.n_embd;
+    model.hparams.n_ff    = config.hidden_dim;
     model.hparams.n_mult  = 32;//params.n_mult;
     model.hparams.n_head  = config.n_heads; //params.n_head;
     model.hparams.n_layer = config.n_layers; //params.n_layer;
@@ -815,11 +951,11 @@ int main(int argc, char ** argv) {
     model.ctx = ggml_init(lcparams);
     init_model(&model);
+    model.name = basename(params.fn_llama2c_model);
     save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);
     printf("Saving llama.c model file %s in ggml format at %s\n", params.fn_llama2c_model, params.fn_llama2c_output_model);
     ggml_free(model.ctx);
-    free_weights(&weights);
     return 0;
 }
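A note on the `shared_weights` handling above: llama2.c encodes whether the output classifier (`wcls`) shares storage with the token embeddings in the sign of `vocab_size` in the checkpoint header, which is why the converter takes `abs()` of it before use. A minimal sketch of reading just that flag (the struct layout is assumed to follow llama2.c's `Config`; verify against the checkpoint you are converting):

```cpp
#include <cstdio>
#include <cstdlib>

// Assumed llama2.c header layout (seven int32 fields, in this order).
struct llama2c_config {
    int dim, hidden_dim, n_layers, n_heads, n_kv_heads, vocab_size, seq_len;
};

int main(int argc, char ** argv) {
    if (argc < 2) { fprintf(stderr, "usage: %s checkpoint.bin\n", argv[0]); return 1; }
    FILE * f = fopen(argv[1], "rb");
    if (!f) { perror("fopen"); return 1; }
    llama2c_config cfg;
    if (fread(&cfg, sizeof(cfg), 1, f) != 1) { fclose(f); return 1; }
    fclose(f);
    // positive vocab_size -> classifier shares the token embedding matrix,
    // negative vocab_size -> a separate wcls tensor follows the other weights
    const bool shared_weights = cfg.vocab_size > 0;
    printf("vocab_size = %d, shared classifier = %s\n", abs(cfg.vocab_size), shared_weights ? "yes" : "no");
    return 0;
}
```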
@@ -167,7 +167,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
     llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
     // TODO: Apply penalties
-    // float nl_logit = logits[llama_token_nl()];
+    // float nl_logit = logits[llama_token_nl(ctx)];
     // auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
     // llama_sample_repetition_penalty(ctx, &candidates_p,
     //     last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
@@ -176,7 +176,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
     //     last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
     //     last_n_repeat, alpha_frequency, alpha_presence);
     // if (!penalize_nl) {
-    //     logits[llama_token_nl()] = nl_logit;
+    //     logits[llama_token_nl(ctx)] = nl_logit;
     // }
     if (temp <= 0) {
@@ -211,10 +211,10 @@ const char * sampling(struct MyModel * mymodel) {
     llama_context * ctx = mymodel->ctx;
     int id = sampling_id(mymodel);
     static std::string ret;
-    if (id == llama_token_eos()) {
+    if (id == llama_token_eos(ctx)) {
         ret = "</s>";
     } else {
-        ret = llama_token_to_str(ctx, id);
+        ret = llama_token_to_piece(ctx, id);
     }
     eval_id(mymodel, id);
     return ret.c_str();

examples/embd-input/embd_input.py Normal file → Executable file

@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 import ctypes
 from ctypes import cdll, c_char_p, c_void_p, POINTER, c_float, c_int
 import numpy as np

examples/embd-input/llava.py Normal file → Executable file

@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 import sys
 import os
 sys.path.insert(0, os.path.dirname(__file__))

examples/embd-input/minigpt4.py Normal file → Executable file

@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 import sys
 import os
 sys.path.insert(0, os.path.dirname(__file__))

examples/embd-input/panda_gpt.py Normal file → Executable file

@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 import sys
 import os
 sys.path.insert(0, os.path.dirname(__file__))


@@ -56,9 +56,6 @@ int main(int argc, char ** argv) {
     int n_past = 0;

-    // Add a space in front of the first character to match OG llama tokenizer behavior
-    params.prompt.insert(0, 1, ' ');
-
     // tokenize the prompt
     auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
@@ -67,28 +64,35 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
         fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
         for (int i = 0; i < (int) embd_inp.size(); i++) {
-            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
+            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
         }
         fprintf(stderr, "\n");
     }

-    if (params.embedding){
-        if (embd_inp.size() > 0) {
-            if (llama_eval(ctx, embd_inp.data(), embd_inp.size(), n_past, params.n_threads)) {
-                fprintf(stderr, "%s : failed to eval\n", __func__);
-                return 1;
-            }
-        }
-
-        const int n_embd = llama_n_embd(ctx);
-        const auto embeddings = llama_get_embeddings(ctx);
-
-        for (int i = 0; i < n_embd; i++) {
-            printf("%f ", embeddings[i]);
-        }
-        printf("\n");
+    if (embd_inp.size() > (size_t)params.n_ctx) {
+        fprintf(stderr, "%s: error: prompt is longer than the context window (%zu tokens, n_ctx = %d)\n",
+                __func__, embd_inp.size(), params.n_ctx);
+        return 1;
     }

+    while (!embd_inp.empty()) {
+        int n_tokens = std::min(params.n_batch, (int) embd_inp.size());
+        if (llama_eval(ctx, embd_inp.data(), n_tokens, n_past, params.n_threads)) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
+            return 1;
+        }
+        n_past += n_tokens;
+        embd_inp.erase(embd_inp.begin(), embd_inp.begin() + n_tokens);
+    }
+
+    const int n_embd = llama_n_embd(ctx);
+    const auto embeddings = llama_get_embeddings(ctx);
+
+    for (int i = 0; i < n_embd; i++) {
+        printf("%f ", embeddings[i]);
+    }
+    printf("\n");
+
     llama_print_timings(ctx);
     llama_free(ctx);
     llama_free_model(model);
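The example prints the raw embedding vector for the prompt. Purely as an illustration of what a caller might do with two such vectors afterwards (not part of this change), cosine similarity between them can be computed like this:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Cosine similarity between two embedding vectors of equal length.
static double cosine_similarity(const std::vector<float> & a, const std::vector<float> & b) {
    double dot = 0.0, na = 0.0, nb = 0.0;
    for (size_t i = 0; i < a.size() && i < b.size(); ++i) {
        dot += a[i] * b[i];
        na  += a[i] * a[i];
        nb  += b[i] * b[i];
    }
    return (na == 0.0 || nb == 0.0) ? 0.0 : dot / (std::sqrt(na) * std::sqrt(nb));
}

int main() {
    std::vector<float> e1 = { 0.1f, 0.7f, -0.2f };
    std::vector<float> e2 = { 0.2f, 0.6f, -0.1f };
    printf("similarity = %f\n", cosine_similarity(e1, e2)); // close to 1 for similar vectors
    return 0;
}
```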


@ -0,0 +1,5 @@
set(TARGET gguf)
add_executable(${TARGET} gguf.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

examples/gguf/gguf.cpp Normal file

@ -0,0 +1,249 @@
#include "ggml.h"
#include "llama.h"
#include <cstdio>
#include <cinttypes>
#include <string>
#include <sstream>
#include <fstream>
#include <vector>
#undef MIN
#undef MAX
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
template<typename T>
static std::string to_string(const T & val) {
std::stringstream ss;
ss << val;
return ss.str();
}
bool gguf_ex_write(const std::string & fname) {
struct gguf_context * ctx = gguf_init_empty();
gguf_set_val_u8 (ctx, "some.parameter.uint8", 0x12);
gguf_set_val_i8 (ctx, "some.parameter.int8", -0x13);
gguf_set_val_u16 (ctx, "some.parameter.uint16", 0x1234);
gguf_set_val_i16 (ctx, "some.parameter.int16", -0x1235);
gguf_set_val_u32 (ctx, "some.parameter.uint32", 0x12345678);
gguf_set_val_i32 (ctx, "some.parameter.int32", -0x12345679);
gguf_set_val_f32 (ctx, "some.parameter.float32", 0.123456789f);
gguf_set_val_u64 (ctx, "some.parameter.uint64", 0x123456789abcdef0ull);
gguf_set_val_i64 (ctx, "some.parameter.int64", -0x123456789abcdef1ll);
gguf_set_val_f64 (ctx, "some.parameter.float64", 0.1234567890123456789);
gguf_set_val_bool(ctx, "some.parameter.bool", true);
gguf_set_val_str (ctx, "some.parameter.string", "hello world");
gguf_set_arr_data(ctx, "some.parameter.arr.i16", GGUF_TYPE_INT16, std::vector<int16_t>{ 1, 2, 3, 4, }.data(), 4);
gguf_set_arr_data(ctx, "some.parameter.arr.f32", GGUF_TYPE_FLOAT32, std::vector<float>{ 3.145f, 2.718f, 1.414f, }.data(), 3);
gguf_set_arr_str (ctx, "some.parameter.arr.str", std::vector<const char *>{ "hello", "world", "!" }.data(), 3);
struct ggml_init_params params = {
/*.mem_size =*/ 128ull*1024ull*1024ull,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ false,
};
struct ggml_context * ctx_data = ggml_init(params);
const int n_tensors = 10;
// tensor infos
for (int i = 0; i < n_tensors; ++i) {
const std::string name = "tensor_" + to_string(i);
int64_t ne[GGML_MAX_DIMS] = { 1 };
int32_t n_dims = rand() % GGML_MAX_DIMS + 1;
for (int j = 0; j < n_dims; ++j) {
ne[j] = rand() % 10 + 1;
}
struct ggml_tensor * cur = ggml_new_tensor(ctx_data, GGML_TYPE_F32, n_dims, ne);
ggml_set_name(cur, name.c_str());
{
float * data = (float *) cur->data;
for (int j = 0; j < ggml_nelements(cur); ++j) {
data[j] = 100 + i;
}
}
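        // note: every element of tensor_i is set to the constant 100 + i, which is exactly what gguf_ex_read_1 verifies below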
gguf_add_tensor(ctx, cur);
}
gguf_write_to_file(ctx, fname.c_str(), false);
fprintf(stdout, "%s: wrote file '%s;\n", __func__, fname.c_str());
ggml_free(ctx_data);
gguf_free(ctx);
return true;
}
// just read tensor info
bool gguf_ex_read_0(const std::string & fname) {
struct gguf_init_params params = {
/*.no_alloc = */ false,
/*.ctx = */ NULL,
};
struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
fprintf(stdout, "%s: version: %d\n", __func__, gguf_get_version(ctx));
fprintf(stdout, "%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
fprintf(stdout, "%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
// kv
{
const int n_kv = gguf_get_n_kv(ctx);
fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);
for (int i = 0; i < n_kv; ++i) {
const char * key = gguf_get_key(ctx, i);
fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
}
}
// find kv string
{
const char * findkey = "some.parameter.string";
const int keyidx = gguf_find_key(ctx, findkey);
if (keyidx == -1) {
fprintf(stdout, "%s: find key: %s not found.\n", __func__, findkey);
} else {
const char * key_value = gguf_get_val_str(ctx, keyidx);
fprintf(stdout, "%s: find key: %s found, kv[%d] value = %s\n", __func__, findkey, keyidx, key_value);
}
}
// tensor info
{
const int n_tensors = gguf_get_n_tensors(ctx);
fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);
for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name (ctx, i);
const size_t offset = gguf_get_tensor_offset(ctx, i);
fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
}
}
gguf_free(ctx);
return true;
}
// read and create ggml_context containing the tensors and their data
bool gguf_ex_read_1(const std::string & fname) {
struct ggml_context * ctx_data = NULL;
struct gguf_init_params params = {
/*.no_alloc = */ false,
/*.ctx = */ &ctx_data,
};
struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
fprintf(stdout, "%s: version: %d\n", __func__, gguf_get_version(ctx));
fprintf(stdout, "%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
fprintf(stdout, "%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
// kv
{
const int n_kv = gguf_get_n_kv(ctx);
fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);
for (int i = 0; i < n_kv; ++i) {
const char * key = gguf_get_key(ctx, i);
fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
}
}
// tensor info
{
const int n_tensors = gguf_get_n_tensors(ctx);
fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);
for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name (ctx, i);
const size_t offset = gguf_get_tensor_offset(ctx, i);
fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
}
}
// data
{
const int n_tensors = gguf_get_n_tensors(ctx);
for (int i = 0; i < n_tensors; ++i) {
fprintf(stdout, "%s: reading tensor %d data\n", __func__, i);
const char * name = gguf_get_tensor_name(ctx, i);
struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
fprintf(stdout, "%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data);
// print first 10 elements
const float * data = (const float *) cur->data;
printf("%s data[:10] : ", name);
for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) {
printf("%f ", data[j]);
}
printf("\n\n");
// check data
{
const float * data = (const float *) cur->data;
for (int j = 0; j < ggml_nelements(cur); ++j) {
if (data[j] != 100 + i) {
fprintf(stderr, "%s: tensor[%d]: data[%d] = %f\n", __func__, i, j, data[j]);
return false;
}
}
}
}
}
fprintf(stdout, "%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data));
ggml_free(ctx_data);
gguf_free(ctx);
return true;
}
int main(int argc, char ** argv) {
if (argc < 3) {
fprintf(stdout, "usage: %s data.gguf r|w\n", argv[0]);
return -1;
}
const std::string fname(argv[1]);
const std::string mode (argv[2]);
GGML_ASSERT((mode == "r" || mode == "w") && "mode must be r or w");
if (mode == "w") {
GGML_ASSERT(gguf_ex_write(fname) && "failed to write gguf file");
} else if (mode == "r") {
GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file");
GGML_ASSERT(gguf_ex_read_1(fname) && "failed to read gguf file");
}
return 0;
}

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

examples/jeopardy/graph.py Normal file → Executable file

@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 import matplotlib.pyplot as plt
 import os
 import csv

examples/jeopardy/jeopardy.sh Normal file → Executable file

examples/json-schema-to-grammar.py Normal file → Executable file

@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 import argparse
 import json
 import re


@@ -3,6 +3,9 @@
 #include <cassert>
 #include <chrono>
 #include <cinttypes>
+#include <clocale>
+#include <cmath>
+#include <cstdio>
 #include <cstring>
 #include <ctime>
 #include <iterator>
@@ -10,7 +13,6 @@
 #include <numeric>
 #include <regex>
 #include <sstream>
-#include <stdio.h>
 #include <string>
 #include <vector>
@@ -18,9 +20,7 @@
 #include "llama.h"
 #include "common.h"
 #include "build-info.h"
-#ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
-#endif

 // utils
 static uint64_t get_time_ns() {
@@ -148,7 +148,7 @@ struct cmd_params {
 };

 static const cmd_params cmd_params_defaults = {
-    /* model */ {"models/7B/ggml-model-q4_0.bin"},
+    /* model */ {"models/7B/ggml-model-q4_0.gguf"},
     /* n_prompt */ {512},
     /* n_gen */ {128},
     /* n_batch */ {512},
@@ -179,12 +179,12 @@ static void print_usage(int /* argc */, char ** argv) {
     fprintf(stdout, "  -mg i, --main-gpu <n>           (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
     fprintf(stdout, "  -lv, --low-vram <0|1>           (default: %s)\n", join(cmd_params_defaults.low_vram, ",").c_str());
     fprintf(stdout, "  -mmq, --mul-mat-q <0|1>         (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
-    fprintf(stdout, "  -ts, --tensor_split <ts>        \n");
+    fprintf(stdout, "  -ts, --tensor_split <ts0/ts1/..> \n");
     fprintf(stdout, "  -r, --repetitions <n>           (default: %d)\n", cmd_params_defaults.reps);
-    fprintf(stdout, "  -o, --output <csv|json|md|sql>  (default: %s)\n", cmd_params_defaults.output_format == CSV ? "csv" : cmd_params_defaults.output_format == JSON ? "json" : "md");
+    fprintf(stdout, "  -o, --output <csv|json|md|sql>  (default: %s)\n", cmd_params_defaults.output_format == CSV ? "csv" : cmd_params_defaults.output_format == JSON ? "json" : cmd_params_defaults.output_format == MARKDOWN ? "md" : "sql");
     fprintf(stdout, "  -v, --verbose                   (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
     fprintf(stdout, "\n");
-    fprintf(stdout, "Multiple values can be given for each parameter by separating them with ',' or by repeating the parameter.\n");
+    fprintf(stdout, "Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
 }
@@ -443,6 +443,8 @@ struct test {
     static const std::string gpu_info;
     std::string model_filename;
     std::string model_type;
+    uint64_t model_size;
+    uint64_t model_n_params;
     int n_batch;
     int n_threads;
     bool f32_kv;
@@ -459,8 +461,10 @@ struct test {
     test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) {
         model_filename = inst.model;
         char buf[128];
-        llama_model_type(lmodel, buf, sizeof(buf));
+        llama_model_desc(lmodel, buf, sizeof(buf));
         model_type = buf;
model_size = llama_model_size(lmodel);
model_n_params = llama_model_n_params(lmodel);
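        // total size of the model's tensor data in bytes and total parameter count, as reported by the llama API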
         n_batch = inst.n_batch;
         n_threads = inst.n_threads;
         f32_kv = inst.f32_kv;
@@ -504,7 +508,7 @@ struct test {
     static std::string get_backend() {
         if (cuda) {
-            return "CUDA";
+            return GGML_CUDA_NAME;
         }
         if (opencl) {
             return "OpenCL";
@@ -526,7 +530,7 @@ struct test {
         "build_commit", "build_number",
         "cuda", "opencl", "metal", "gpu_blas", "blas",
         "cpu_info", "gpu_info",
-        "model_filename", "model_type",
+        "model_filename", "model_type", "model_size", "model_n_params",
         "n_batch", "n_threads", "f16_kv",
         "n_gpu_layers", "main_gpu", "mul_mat_q", "low_vram", "tensor_split",
         "n_prompt", "n_gen", "test_time",
@@ -540,6 +544,7 @@ struct test {
     static field_type get_field_type(const std::string & field) {
         if (field == "build_number" || field == "n_batch" || field == "n_threads" ||
+            field == "model_size" || field == "model_n_params" ||
             field == "n_gpu_layers" || field == "main_gpu" ||
             field == "n_prompt" || field == "n_gen" ||
             field == "avg_ns" || field == "stddev_ns") {
@@ -575,7 +580,7 @@ struct test {
             build_commit, std::to_string(build_number),
             std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
             cpu_info, gpu_info,
-            model_filename, model_type,
+            model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
             std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv),
             std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), std::to_string(low_vram), tensor_split_str,
             std::to_string(n_prompt), std::to_string(n_gen), test_time,
@@ -606,6 +611,8 @@ const std::string test::cpu_info = get_cpu_info();
 const std::string test::gpu_info = get_gpu_info();

 struct printer {
+    virtual ~printer() {}
+
     FILE * fout;
     virtual void print_header(const cmd_params & params) { (void) params; };
     virtual void print_test(const test & t) = 0;
@@ -709,8 +716,15 @@ struct markdown_printer : public printer {
             return -30;
         }
         if (field == "t/s") {
-            return 15;
+            return 16;
         }
+        if (field == "size" || field == "params") {
+            return 10;
+        }
+        if (field == "n_gpu_layers") {
+            return 3;
+        }
         int width = std::max((int)field.length(), 10);
         if (test::get_field_type(field) == test::STRING) {
@@ -719,14 +733,33 @@ struct markdown_printer : public printer {
             return width;
         }
static std::string get_field_display_name(const std::string & field) {
if (field == "n_gpu_layers") {
return "ngl";
}
if (field == "n_threads") {
return "threads";
}
if (field == "mul_mat_q") {
return "mmq";
}
if (field == "tensor_split") {
return "ts";
}
return field;
}
     void print_header(const cmd_params & params) override {
         // select fields to print
-        fields = { "model", "backend" };
+        fields.push_back("model");
+        fields.push_back("size");
+        fields.push_back("params");
+        fields.push_back("backend");
         bool is_cpu_backend = test::get_backend() == "CPU" || test::get_backend() == "BLAS";
         if (!is_cpu_backend) {
             fields.push_back("n_gpu_layers");
         }
-        if (params.n_batch.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
+        if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
             fields.push_back("n_threads");
         }
         if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
@@ -752,7 +785,7 @@ struct markdown_printer : public printer {
         fprintf(fout, "|");
         for (const auto & field : fields) {
-            fprintf(fout, " %*s |", get_field_width(field), field.c_str());
+            fprintf(fout, " %*s |", get_field_width(field), get_field_display_name(field).c_str());
         }
         fprintf(fout, "\n");
         fprintf(fout, "|");
@@ -769,12 +802,26 @@ struct markdown_printer : public printer {
         fprintf(fout, "|");
         for (const auto & field : fields) {
             std::string value;
+            char buf[128];
             if (field == "model") {
                 value = t.model_type;
} else if (field == "size") {
if (t.model_size < 1024*1024*1024) {
snprintf(buf, sizeof(buf), "%.2f MiB", t.model_size / 1024.0 / 1024.0);
} else {
snprintf(buf, sizeof(buf), "%.2f GiB", t.model_size / 1024.0 / 1024.0 / 1024.0);
}
value = buf;
} else if (field == "params") {
if (t.model_n_params < 1000*1000*1000) {
snprintf(buf, sizeof(buf), "%.2f M", t.model_n_params / 1e6);
} else {
snprintf(buf, sizeof(buf), "%.2f B", t.model_n_params / 1e9);
}
value = buf;
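            // note: model sizes use binary units (1 GiB = 1024^3 bytes), parameter counts use decimal units (1 B = 1e9)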
             } else if (field == "backend") {
                 value = test::get_backend();
             } else if (field == "test") {
-                char buf[128];
                 if (t.n_prompt > 0 && t.n_gen == 0) {
                     snprintf(buf, sizeof(buf), "pp %d", t.n_prompt);
                 } else if (t.n_gen > 0 && t.n_prompt == 0) {
@@ -785,7 +832,6 @@ struct markdown_printer : public printer {
                 }
                 value = buf;
             } else if (field == "t/s") {
-                char buf[128];
                 snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts());
                 value = buf;
             } else if (vmap.find(field) != vmap.end()) {
@@ -849,7 +895,7 @@ struct sql_printer : public printer {
 };

 static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
-    std::vector<llama_token> tokens(n_batch, llama_token_bos());
+    std::vector<llama_token> tokens(n_batch, llama_token_bos(ctx));
     int n_processed = 0;
     while (n_processed < n_prompt) {
         int n_tokens = std::min(n_prompt - n_processed, n_batch);
@@ -859,7 +905,7 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat
 }

 static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
-    llama_token token = llama_token_bos();
+    llama_token token = llama_token_bos(ctx);
     for (int i = 0; i < n_gen; i++) {
         llama_eval(ctx, &token, 1, n_past + i, n_threads);
     }
@@ -872,6 +918,9 @@ static void llama_null_log_callback(enum llama_log_level level, const char * tex
 }

 int main(int argc, char ** argv) {
+    // try to set locale for unicode characters in markdown
+    setlocale(LC_CTYPE, ".UTF-8");
+
 #if !defined(NDEBUG)
     fprintf(stderr, "warning: asserts enabled, performance may be affected\n");
 #endif


@@ -8,7 +8,7 @@ function! Llm()
   let buffer_content = join(getline(1, '$'), "\n")

   " Create the JSON payload
-  let json_payload = {"temp":0.72,"top_k":100,"top_p":0.73,"repeat_penalty":1.100000023841858,"n_predict":10,"stream": v:false}
+  let json_payload = {"temp":0.72,"top_k":100,"top_p":0.73,"repeat_penalty":1.100000023841858,"n_predict":256,"stop": ["\n\n\n"],"stream": v:false}
   let json_payload.prompt = buffer_content

   " Define the curl command
@@ -25,3 +25,4 @@ function! Llm()
 endfunction

 command! Llm call Llm()
+noremap <F2> :Llm<CR>


@@ -34,7 +34,7 @@ For an interactive experience, try this command:
 #### Unix-based systems (Linux, macOS, etc.):

 ```bash
-./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " \
+./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -p \
 'User: Hi
 AI: Hello. I am an AI chatbot. Would you like to talk?
 User: Sure!
@@ -45,7 +45,7 @@ User:'
 #### Windows:

 ```powershell
-main.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -e --prompt "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:"
+main.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -e -p "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:"
 ```

 The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it):
@@ -288,6 +288,10 @@ These options help improve the performance and memory usage of the LLaMA models.
 - `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs. **Note**: Restoring a cached prompt does not imply restoring the exact state of the session at the point it was saved. So even when specifying a specific seed, you are not guaranteed to get the same sequence of tokens as the original generation.

+### Grammars
+
+- `--grammar GRAMMAR`, `--grammar-file FILE`: Specify a grammar (defined inline or in a file) to constrain model output to a specific format. For example, you could force the model to output JSON or to speak only in emojis. See the [GBNF guide](../../grammars/README.md) for details on the syntax.
+
 ### Quantization

 For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-data--run).


@@ -4,6 +4,7 @@
 #endif

 #include "common.h"
 #include "console.h"
 #include "llama.h"
 #include "build-info.h"
@@ -17,6 +18,7 @@
 #include <ctime>
 #include <fstream>
 #include <iostream>
+#include <sstream>
 #include <string>
 #include <vector>
@@ -36,18 +38,67 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

 static llama_context ** g_ctx;
+static llama_model ** g_model;
+static gpt_params * g_params;
+static std::vector<llama_token> * g_input_tokens;
+static std::ostringstream * g_output_ss;
+static std::vector<llama_token> * g_output_tokens;
 static bool is_interacting = false;
void write_logfile(
const llama_context * ctx, const gpt_params & params, const llama_model * model,
const std::vector<llama_token> input_tokens, const std::string output, const std::vector<llama_token> output_tokens) {
if (params.logdir.empty()) {
return;
}
const std::string timestamp = get_sortable_timestamp();
const bool success = create_directory_with_parents(params.logdir);
if (!success) {
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
__func__, params.logdir.c_str());
return;
}
const std::string logfile_path = params.logdir + timestamp + ".yml";
FILE * logfile = fopen(logfile_path.c_str(), "w");
if (logfile == NULL) {
fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
return;
}
fprintf(logfile, "binary: main\n");
char model_desc[128];
llama_model_desc(model, model_desc, sizeof(model_desc));
dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc);
fprintf(logfile, "\n");
fprintf(logfile, "######################\n");
fprintf(logfile, "# Generation Results #\n");
fprintf(logfile, "######################\n");
fprintf(logfile, "\n");
dump_string_yaml_multiline(logfile, "output", output.c_str());
dump_vector_int_yaml(logfile, "output_tokens", output_tokens);
llama_dump_timing_info_yaml(logfile, ctx);
fclose(logfile);
}
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
 void sigint_handler(int signo) {
     if (signo == SIGINT) {
         if (!is_interacting) {
-            is_interacting=true;
+            is_interacting = true;
         } else {
             console::cleanup();
             printf("\n");
             llama_print_timings(*g_ctx);
+            write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
             _exit(130);
         }
     }
@@ -56,11 +107,21 @@ void sigint_handler(int signo) {
 int main(int argc, char ** argv) {
     gpt_params params;
+    g_params = &params;

     if (gpt_params_parse(argc, argv, params) == false) {
         return 1;
     }

+#ifndef LOG_DISABLE_LOGS
+    log_set_target(log_filename_generator("main", "log"));
+    LOG_TEE("Log start\n");
+    log_dump_cmdline(argc, argv);
+#endif // LOG_DISABLE_LOGS
+
+    // TODO: Dump params ?
+    //LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity));
+
     // save choice to use color for later
     // (note for later: this is a slightly awkward choice)
     console::init(params.simple_io, params.use_color);
@ -83,42 +144,45 @@ int main(int argc, char ** argv) {
} }
if (params.rope_freq_base != 10000.0) { if (params.rope_freq_base != 10000.0) {
fprintf(stderr, "%s: warning: changing RoPE frequency base to %g (default 10000.0)\n", __func__, params.rope_freq_base); LOG_TEE("%s: warning: changing RoPE frequency base to %g (default 10000.0)\n", __func__, params.rope_freq_base);
} }
if (params.rope_freq_scale != 1.0) { if (params.rope_freq_scale != 1.0) {
fprintf(stderr, "%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale); LOG_TEE("%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale);
} }
if (params.n_ctx > 2048) { if (params.n_ctx > 2048) {
// TODO: determine the actual max context of the model (e.g. 4096 for LLaMA v2) and use that instead of 2048 // TODO: determine the actual max context of the model (e.g. 4096 for LLaMA v2) and use that instead of 2048
fprintf(stderr, "%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified)\n", __func__, params.n_ctx); LOG_TEE("%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified)\n", __func__, params.n_ctx);
} else if (params.n_ctx < 8) { } else if (params.n_ctx < 8) {
fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__); LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8; params.n_ctx = 8;
} }
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
if (params.seed == LLAMA_DEFAULT_SEED) { if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL); params.seed = time(NULL);
} }
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed); LOG_TEE("%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed); std::mt19937 rng(params.seed);
if (params.random_prompt) { if (params.random_prompt) {
params.prompt = gpt_random_prompt(rng); params.prompt = gpt_random_prompt(rng);
} }
LOG("%s: llama backend init\n", __func__);
llama_backend_init(params.numa); llama_backend_init(params.numa);
llama_model * model; llama_model * model;
llama_context * ctx; llama_context * ctx;
llama_context * ctx_guidance = NULL; llama_context * ctx_guidance = NULL;
g_model = &model;
g_ctx = &ctx; g_ctx = &ctx;
// load the model and apply lora adapter, if any // load the model and apply lora adapter, if any
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
std::tie(model, ctx) = llama_init_from_gpt_params(params); std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (params.cfg_scale > 1.f) { if (params.cfg_scale > 1.f) {
struct llama_context_params lparams = llama_context_params_from_gpt_params(params); struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
@ -126,14 +190,14 @@ int main(int argc, char ** argv) {
} }
if (model == NULL) { if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__); LOG_TEE("%s: error: unable to load model\n", __func__);
return 1; return 1;
} }
// print system information // print system information
{ {
fprintf(stderr, "\n"); LOG_TEE("\n");
fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", LOG_TEE("system_info: n_threads = %d / %d | %s\n",
params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
} }
@ -141,9 +205,9 @@ int main(int argc, char ** argv) {
// uncomment the "used_mem" line in llama.cpp to see the results // uncomment the "used_mem" line in llama.cpp to see the results
if (params.mem_test) { if (params.mem_test) {
{ {
fprintf(stderr, "%s: testing memory usage for n_batch = %d, n_ctx = %d\n", __func__, params.n_batch, params.n_ctx); LOG_TEE("%s: testing memory usage for n_batch = %d, n_ctx = %d\n", __func__, params.n_batch, params.n_ctx);
const std::vector<llama_token> tmp(params.n_batch, llama_token_bos()); const std::vector<llama_token> tmp(params.n_batch, llama_token_bos(ctx));
llama_eval(ctx, tmp.data(), tmp.size(), params.n_ctx, params.n_threads); llama_eval(ctx, tmp.data(), tmp.size(), params.n_ctx, params.n_threads);
} }
@ -167,7 +231,7 @@ int main(int argc, char ** argv) {
std::vector<llama_token> session_tokens; std::vector<llama_token> session_tokens;
if (!path_session.empty()) { if (!path_session.empty()) {
fprintf(stderr, "%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str()); LOG_TEE("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
// fopen to check for existing session // fopen to check for existing session
FILE * fp = std::fopen(path_session.c_str(), "rb"); FILE * fp = std::fopen(path_session.c_str(), "rb");
@ -177,53 +241,70 @@ int main(int argc, char ** argv) {
session_tokens.resize(params.n_ctx); session_tokens.resize(params.n_ctx);
size_t n_token_count_out = 0; size_t n_token_count_out = 0;
if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) { if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
fprintf(stderr, "%s: error: failed to load session file '%s'\n", __func__, path_session.c_str()); LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
return 1; return 1;
} }
session_tokens.resize(n_token_count_out); session_tokens.resize(n_token_count_out);
llama_set_rng_seed(ctx, params.seed); llama_set_rng_seed(ctx, params.seed);
fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size()); LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
} else { } else {
fprintf(stderr, "%s: session file does not exist, will create\n", __func__); LOG_TEE("%s: session file does not exist, will create\n", __func__);
} }
} }
-    // tokenize the prompt
+    const bool add_bos = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
+    LOG("add_bos: %d\n", add_bos);
+
     std::vector<llama_token> embd_inp;

-    // Add a space in front of the first character to match OG llama tokenizer behavior
-    params.prompt.insert(0, 1, ' ');
-
     if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
-        embd_inp = ::llama_tokenize(ctx, params.prompt, true);
+        LOG("tokenize the prompt\n");
+        embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
     } else {
+        LOG("use session tokens\n");
         embd_inp = session_tokens;
     }

+    LOG("prompt: \"%s\"\n", log_tostr(params.prompt));
+    LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp));
+
+    // Should not run without any tokens
+    if (embd_inp.empty()) {
+        embd_inp.push_back(llama_token_bos(ctx));
+        LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp));
+    }
+
     // Tokenize negative prompt
     std::vector<llama_token> guidance_inp;
     int guidance_offset = 0;
     int original_prompt_len = 0;
     if (ctx_guidance) {
-        params.cfg_negative_prompt.insert(0, 1, ' ');
-        guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, true);
-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
+        LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(params.cfg_negative_prompt));
+
+        guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos);
+        LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));
+
+        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
+        LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp));
+
         original_prompt_len = original_inp.size();
         guidance_offset = (int)guidance_inp.size() - original_prompt_len;
+        LOG("original_prompt_len: %s", log_tostr(original_prompt_len));
+        LOG("guidance_offset:     %s", log_tostr(guidance_offset));
     }
const int n_ctx = llama_n_ctx(ctx); const int n_ctx = llama_n_ctx(ctx);
LOG("n_ctx: %d\n", n_ctx);
if ((int) embd_inp.size() > n_ctx - 4) { if ((int) embd_inp.size() > n_ctx - 4) {
fprintf(stderr, "%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
return 1; return 1;
} }
// debug message about similarity of saved session, if applicable // debug message about similarity of saved session, if applicable
size_t n_matching_session_tokens = 0; size_t n_matching_session_tokens = 0;
if (session_tokens.size()) { if (session_tokens.size() > 0) {
for (llama_token id : session_tokens) { for (llama_token id : session_tokens) {
if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) { if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) {
break; break;
@ -231,22 +312,27 @@ int main(int argc, char ** argv) {
n_matching_session_tokens++; n_matching_session_tokens++;
} }
if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) { if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) {
fprintf(stderr, "%s: using full prompt from session file\n", __func__); LOG_TEE("%s: using full prompt from session file\n", __func__);
} else if (n_matching_session_tokens >= embd_inp.size()) { } else if (n_matching_session_tokens >= embd_inp.size()) {
fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__); LOG_TEE("%s: session file has exact match for prompt!\n", __func__);
} else if (n_matching_session_tokens < (embd_inp.size() / 2)) { } else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
fprintf(stderr, "%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n", LOG_TEE("%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
__func__, n_matching_session_tokens, embd_inp.size()); __func__, n_matching_session_tokens, embd_inp.size());
} else { } else {
fprintf(stderr, "%s: session file matches %zu / %zu tokens of prompt\n", LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n",
__func__, n_matching_session_tokens, embd_inp.size()); __func__, n_matching_session_tokens, embd_inp.size());
} }
} }
LOGLN(
"recalculate the cached logits (check): embd_inp.empty() %s, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu, embd_inp.size() %zu",
log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size(), embd_inp.size());
// if we will use the cache for the full prompt without reaching the end of the cache, force // if we will use the cache for the full prompt without reaching the end of the cache, force
// reevaluation of the last token token to recalculate the cached logits // reevaluation of the last token token to recalculate the cached logits
if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) {
session_tokens.size() > embd_inp.size()) { LOGLN("recalculate the cached logits (do): session_tokens.resize( %zu )", embd_inp.size() - 1);
session_tokens.resize(embd_inp.size() - 1); session_tokens.resize(embd_inp.size() - 1);
} }
@ -256,8 +342,11 @@ int main(int argc, char ** argv) {
} }
// prefix & suffix for instruct mode // prefix & suffix for instruct mode
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true); const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos);
const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false); const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);
LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx));
LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx));
// in instruct mode, we inject a prefix and a suffix to each input by the user // in instruct mode, we inject a prefix and a suffix to each input by the user
if (params.instruct) { if (params.instruct) {
@ -270,34 +359,31 @@ int main(int argc, char ** argv) {
params.interactive = true; params.interactive = true;
} }
// determine newline token
auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
if (params.verbose_prompt) { if (params.verbose_prompt) {
fprintf(stderr, "\n"); LOG_TEE("\n");
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
for (int i = 0; i < (int) embd_inp.size(); i++) { for (int i = 0; i < (int) embd_inp.size(); i++) {
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i])); LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
} }
if (ctx_guidance) { if (ctx_guidance) {
fprintf(stderr, "\n"); LOG_TEE("\n");
fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str()); LOG_TEE("%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size()); LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
for (int i = 0; i < (int) guidance_inp.size(); i++) { for (int i = 0; i < (int) guidance_inp.size(); i++) {
fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i])); LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
} }
} }
if (params.n_keep > 0) { if (params.n_keep > 0) {
fprintf(stderr, "%s: static prompt based on n_keep: '", __func__); LOG_TEE("%s: static prompt based on n_keep: '", __func__);
for (int i = 0; i < params.n_keep; i++) { for (int i = 0; i < params.n_keep; i++) {
fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i])); LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
} }
fprintf(stderr, "'\n"); LOG_TEE("'\n");
} }
fprintf(stderr, "\n"); LOG_TEE("\n");
} }
if (params.interactive) { if (params.interactive) {
@ -311,51 +397,51 @@ int main(int argc, char ** argv) {
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false; return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
}; };
SetConsoleCtrlHandler(static_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true); SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif #endif
fprintf(stderr, "%s: interactive mode on.\n", __func__); LOG_TEE("%s: interactive mode on.\n", __func__);
if (params.antiprompt.size()) { if (params.antiprompt.size()) {
for (auto antiprompt : params.antiprompt) { for (const auto & antiprompt : params.antiprompt) {
fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str()); LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str());
} }
} }
if (params.input_prefix_bos) { if (params.input_prefix_bos) {
fprintf(stderr, "Input prefix with BOS\n"); LOG_TEE("Input prefix with BOS\n");
} }
if (!params.input_prefix.empty()) { if (!params.input_prefix.empty()) {
fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str()); LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
} }
if (!params.input_suffix.empty()) { if (!params.input_suffix.empty()) {
fprintf(stderr, "Input suffix: '%s'\n", params.input_suffix.c_str()); LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
} }
} }
fprintf(stderr, "sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n", LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau); params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau);
fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
fprintf(stderr, "\n\n"); LOG_TEE("\n\n");
struct llama_grammar * grammar = NULL;
grammar_parser::parse_state parsed_grammar; grammar_parser::parse_state parsed_grammar;
llama_grammar * grammar = NULL;
if (!params.grammar.empty()) { if (!params.grammar.empty()) {
parsed_grammar = grammar_parser::parse(params.grammar.c_str()); parsed_grammar = grammar_parser::parse(params.grammar.c_str());
// will be empty (default) if there are parse errors // will be empty (default) if there are parse errors
if (parsed_grammar.rules.empty()) { if (parsed_grammar.rules.empty()) {
return 1; return 1;
} }
fprintf(stderr, "%s: grammar:\n", __func__); LOG_TEE("%s: grammar:\n", __func__);
grammar_parser::print_grammar(stderr, parsed_grammar); grammar_parser::print_grammar(stderr, parsed_grammar);
fprintf(stderr, "\n"); LOG_TEE("\n");
{ {
auto it = params.logit_bias.find(llama_token_eos()); auto it = params.logit_bias.find(llama_token_eos(ctx));
if (it != params.logit_bias.end() && it->second == -INFINITY) { if (it != params.logit_bias.end() && it->second == -INFINITY) {
fprintf(stderr, LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
"%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
} }
} }
@ -365,8 +451,8 @@ int main(int argc, char ** argv) {
} }
// TODO: replace with ring-buffer // TODO: replace with ring-buffer
std::vector<llama_token> last_n_tokens(n_ctx); std::vector<llama_token> last_tokens(n_ctx);
std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); std::fill(last_tokens.begin(), last_tokens.end(), 0);
if (params.interactive) { if (params.interactive) {
const char *control_message; const char *control_message;
@ -378,11 +464,11 @@ int main(int argc, char ** argv) {
" - To return control without starting a new line, end your input with '/'.\n" " - To return control without starting a new line, end your input with '/'.\n"
" - If you want to submit another line, end your input with '\\'.\n"; " - If you want to submit another line, end your input with '\\'.\n";
} }
fprintf(stderr, "== Running in interactive mode. ==\n" LOG_TEE("== Running in interactive mode. ==\n");
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
" - Press Ctrl+C to interject at any time.\n" LOG_TEE( " - Press Ctrl+C to interject at any time.\n");
#endif #endif
"%s\n", control_message); LOG_TEE( "%s\n", control_message);
is_interacting = params.interactive_first; is_interacting = params.interactive_first;
} }
@ -397,33 +483,37 @@ int main(int argc, char ** argv) {
int n_session_consumed = 0; int n_session_consumed = 0;
int n_past_guidance = 0; int n_past_guidance = 0;
std::vector<int> input_tokens; g_input_tokens = &input_tokens;
std::vector<int> output_tokens; g_output_tokens = &output_tokens;
std::ostringstream output_ss; g_output_ss = &output_ss;
// the first thing we will do is to output the prompt, so set color accordingly // the first thing we will do is to output the prompt, so set color accordingly
console::set_display(console::prompt); console::set_display(console::prompt);
std::vector<llama_token> embd; std::vector<llama_token> embd;
std::vector<llama_token> embd_guidance; std::vector<llama_token> embd_guidance;
// do one empty run to warm up the model const int n_vocab = llama_n_vocab(ctx);
{
const std::vector<llama_token> tmp = { llama_token_bos(), }; std::vector<llama_token_data> candidates;
llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads); candidates.reserve(n_vocab);
llama_reset_timings(ctx);
}
while ((n_remain != 0 && !is_antiprompt) || params.interactive) { while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
// predict // predict
if (embd.size() > 0) { if (embd.size() > 0) {
// Note: n_ctx - 4 here is to match the logic for commandline prompt handling via // Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
// --prompt or --file which uses the same value. // --prompt or --file which uses the same value.
auto max_embd_size = n_ctx - 4; int max_embd_size = n_ctx - 4;
// Ensure the input doesn't exceed the context size by truncating embd if necessary. // Ensure the input doesn't exceed the context size by truncating embd if necessary.
if ((int)embd.size() > max_embd_size) { if ((int) embd.size() > max_embd_size) {
auto skipped_tokens = embd.size() - max_embd_size; const int skipped_tokens = (int) embd.size() - max_embd_size;
embd.resize(max_embd_size);
console::set_display(console::error); console::set_display(console::error);
printf("<<input too long: skipped %zu token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : ""); printf("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
console::set_display(console::reset); console::set_display(console::reset);
fflush(stdout); fflush(stdout);
embd.resize(max_embd_size);
} }
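
A minimal sketch of the truncation above, with a hypothetical `truncate_input` helper: inputs longer than `n_ctx - 4` are cut down and the number of skipped tokens is reported.

```cpp
#include <cstdio>
#include <vector>

// Sketch: keeps the same 4-token headroom used for --prompt/--file handling.
static void truncate_input(std::vector<int> & embd, int n_ctx) {
    const int max_embd_size = n_ctx - 4;
    if ((int) embd.size() > max_embd_size) {
        const int skipped = (int) embd.size() - max_embd_size;
        printf("<<input too long: skipped %d token%s>>", skipped, skipped != 1 ? "s" : "");
        embd.resize(max_embd_size);
    }
}
```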
// infinite text generation via context swapping // infinite text generation via context swapping
@ -432,28 +522,26 @@ int main(int argc, char ** argv) {
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) { if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
if (params.n_predict == -2) { if (params.n_predict == -2) {
fprintf(stderr, "\n\n%s: context full, stopping generation\n", __func__); LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
break; break;
} }
const int n_left = n_past - params.n_keep; const int n_left = n_past - params.n_keep;
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d\n", n_past, n_left, n_ctx, params.n_keep);
// always keep the first token - BOS // always keep the first token - BOS
n_past = std::max(1, params.n_keep); n_past = std::max(1, params.n_keep);
n_past_guidance = std::max(1, params.n_keep + guidance_offset); n_past_guidance = std::max(1, params.n_keep + guidance_offset);
// insert n_left/2 tokens at the start of embd from last_n_tokens LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
// stop saving session if we run out of context // insert n_left/2 tokens at the start of embd from last_tokens
embd.insert(embd.begin(), last_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_tokens.end() - embd.size());
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));
LOG("clear session path\n");
path_session.clear(); path_session.clear();
//printf("\n---\n");
//printf("resetting: '");
//for (int i = 0; i < (int) embd.size(); i++) {
// printf("%s", llama_token_to_str(ctx, embd[i]));
//}
//printf("'\n");
//printf("\n---\n");
} }
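
The context-swap arithmetic above can be read in isolation as follows. This is a sketch under the assumptions that `last_tokens` holds exactly `n_ctx` recent tokens, that `n_past > n_keep`, and that the re-fed slice plus the pending tokens fit in the history; `plan_context_swap` is a hypothetical name, not the llama.cpp API.

```cpp
#include <algorithm>
#include <vector>

// Sketch: keep the first n_keep tokens, then re-feed the last n_left/2 history tokens
// in front of the pending tokens so generation can continue past the context limit.
static std::vector<int> plan_context_swap(const std::vector<int> & last_tokens, // size n_ctx
                                          std::vector<int> embd,                // tokens queued for eval
                                          int n_keep, int & n_past) {
    const int n_left = n_past - n_keep;
    n_past = std::max(1, n_keep); // always keep at least the BOS token

    const int n_take = n_left / 2; // assumes n_take + embd.size() <= last_tokens.size()
    embd.insert(embd.begin(),
                last_tokens.end() - n_take - (int) embd.size(),
                last_tokens.end() - (int) embd.size());
    return embd;
}
```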
// try to reuse a matching prefix from the loaded session instead of re-eval (via n_past) // try to reuse a matching prefix from the loaded session instead of re-eval (via n_past)
@ -483,7 +571,7 @@ int main(int argc, char ** argv) {
if (ctx_guidance) { if (ctx_guidance) {
int input_size = 0; int input_size = 0;
llama_token* input_buf = NULL; llama_token * input_buf = NULL;
if (n_past_guidance < (int) guidance_inp.size()) { if (n_past_guidance < (int) guidance_inp.size()) {
// Guidance context should have the same data with these modifications: // Guidance context should have the same data with these modifications:
@ -499,22 +587,19 @@ int main(int argc, char ** argv) {
); );
} }
input_buf = embd_guidance.data(); input_buf = embd_guidance.data();
input_size = embd_guidance.size(); input_size = embd_guidance.size();
//fprintf(stderr, "\n---------------------\n");
//for (int i = 0; i < (int) embd_guidance.size(); i++) { LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance));
//fprintf(stderr, "%s", llama_token_to_str(ctx, embd_guidance[i]));
//}
//fprintf(stderr, "\n---------------------\n");
} else { } else {
input_buf = embd.data(); input_buf = embd.data();
input_size = embd.size(); input_size = embd.size();
} }
for (int i = 0; i < input_size; i += params.n_batch) { for (int i = 0; i < input_size; i += params.n_batch) {
int n_eval = std::min(input_size - i, params.n_batch); int n_eval = std::min(input_size - i, params.n_batch);
if (llama_eval(ctx_guidance, input_buf + i, n_eval, n_past_guidance, params.n_threads)) { if (llama_eval(ctx_guidance, input_buf + i, n_eval, n_past_guidance, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__); LOG_TEE("%s : failed to eval\n", __func__);
return 1; return 1;
} }
@ -527,11 +612,17 @@ int main(int argc, char ** argv) {
if (n_eval > params.n_batch) { if (n_eval > params.n_batch) {
n_eval = params.n_batch; n_eval = params.n_batch;
} }
LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));
if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) { if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__); LOG_TEE("%s : failed to eval\n", __func__);
return 1; return 1;
} }
n_past += n_eval; n_past += n_eval;
LOG("n_past = %d\n", n_past);
} }
if (embd.size() > 0 && !path_session.empty()) { if (embd.size() > 0 && !path_session.empty()) {
@ -544,101 +635,21 @@ int main(int argc, char ** argv) {
embd_guidance.clear(); embd_guidance.clear();
if ((int) embd_inp.size() <= n_consumed && !is_interacting) { if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
// out of user input, sample next token
const float temp = params.temp;
const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
const float top_p = params.top_p;
const float tfs_z = params.tfs_z;
const float typical_p = params.typical_p;
const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
const float repeat_penalty = params.repeat_penalty;
const float alpha_presence = params.presence_penalty;
const float alpha_frequency = params.frequency_penalty;
const int mirostat = params.mirostat;
const float mirostat_tau = params.mirostat_tau;
const float mirostat_eta = params.mirostat_eta;
const bool penalize_nl = params.penalize_nl;
// optionally save the session on first sample (for faster prompt loading next time) // optionally save the session on first sample (for faster prompt loading next time)
if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) { if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) {
need_to_save_session = false; need_to_save_session = false;
llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
LOG("saved session to %s\n", path_session.c_str());
} }
llama_token id = 0; const llama_token id = llama_sample_token(ctx, ctx_guidance, grammar, params, last_tokens, candidates);
{ last_tokens.erase(last_tokens.begin());
auto logits = llama_get_logits(ctx); last_tokens.push_back(id);
auto n_vocab = llama_n_vocab(ctx);
// Apply params.logit_bias map LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, last_tokens));
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
logits[it->first] += it->second;
}
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
if (ctx_guidance) {
llama_sample_classifier_free_guidance(ctx, &candidates_p, ctx_guidance, params.cfg_scale);
}
// Apply penalties
float nl_logit = logits[llama_token_nl()];
auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
llama_sample_repetition_penalty(ctx, &candidates_p,
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
last_n_repeat, repeat_penalty);
llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
last_n_repeat, alpha_frequency, alpha_presence);
if (!penalize_nl) {
logits[llama_token_nl()] = nl_logit;
}
if (grammar != NULL) {
llama_sample_grammar(ctx, &candidates_p, grammar);
}
if (temp <= 0) {
// Greedy sampling
id = llama_sample_token_greedy(ctx, &candidates_p);
} else {
if (mirostat == 1) {
static float mirostat_mu = 2.0f * mirostat_tau;
const int mirostat_m = 100;
llama_sample_temperature(ctx, &candidates_p, temp);
id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
} else if (mirostat == 2) {
static float mirostat_mu = 2.0f * mirostat_tau;
llama_sample_temperature(ctx, &candidates_p, temp);
id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
} else {
// Temperature sampling
llama_sample_top_k(ctx, &candidates_p, top_k, 1);
llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
llama_sample_typical(ctx, &candidates_p, typical_p, 1);
llama_sample_top_p(ctx, &candidates_p, top_p, 1);
llama_sample_temperature(ctx, &candidates_p, temp);
id = llama_sample_token(ctx, &candidates_p);
}
}
// printf("`%d`", candidates_p.size);
if (grammar != NULL) {
llama_grammar_accept_token(ctx, grammar, id);
}
last_n_tokens.erase(last_n_tokens.begin());
last_n_tokens.push_back(id);
}
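
The hunk above replaces this inline pipeline with a single `llama_sample_token()` call. As a self-contained sketch of the core idea (temperature, top-k filtering, softmax, then a random draw), operating on a plain logit vector and deliberately omitting the repetition/frequency penalties, grammar constraints, and mirostat branches shown in the original:

```cpp
#include <algorithm>
#include <cmath>
#include <numeric>
#include <random>
#include <vector>

// Sketch only; assumes top_k > 0 and a non-empty logit vector.
static int sample_top_k_temp(std::vector<float> logits, int top_k, float temp, std::mt19937 & rng) {
    if (temp <= 0.0f) {
        // greedy sampling
        return (int) (std::max_element(logits.begin(), logits.end()) - logits.begin());
    }
    for (auto & l : logits) l /= temp;

    // keep the indices of the top_k largest logits
    std::vector<int> idx(logits.size());
    std::iota(idx.begin(), idx.end(), 0);
    const size_t k = std::min<size_t>(top_k, idx.size());
    std::partial_sort(idx.begin(), idx.begin() + k, idx.end(),
                      [&](int a, int b) { return logits[a] > logits[b]; });
    idx.resize(k);

    // softmax over the surviving candidates
    const float max_l = logits[idx[0]];
    std::vector<double> probs;
    double sum = 0.0;
    for (int i : idx) { probs.push_back(std::exp(logits[i] - max_l)); sum += probs.back(); }
    for (auto & p : probs) p /= sum;

    std::discrete_distribution<int> dist(probs.begin(), probs.end());
    return idx[dist(rng)];
}
```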
// add it to the context
embd.push_back(id); embd.push_back(id);
// echo this to console // echo this to console
@ -646,12 +657,15 @@ int main(int argc, char ** argv) {
// decrement remaining sampling budget // decrement remaining sampling budget
--n_remain; --n_remain;
LOG("n_remain: %d\n", n_remain);
} else { } else {
// some user input remains from prompt or interaction, forward it to processing // some user input remains from prompt or interaction, forward it to processing
LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
while ((int) embd_inp.size() > n_consumed) { while ((int) embd_inp.size() > n_consumed) {
embd.push_back(embd_inp[n_consumed]); embd.push_back(embd_inp[n_consumed]);
last_n_tokens.erase(last_n_tokens.begin()); last_tokens.erase(last_tokens.begin());
last_n_tokens.push_back(embd_inp[n_consumed]); last_tokens.push_back(embd_inp[n_consumed]);
++n_consumed; ++n_consumed;
if ((int) embd.size() >= params.n_batch) { if ((int) embd.size() >= params.n_batch) {
break; break;
@ -662,23 +676,30 @@ int main(int argc, char ** argv) {
// display text // display text
if (input_echo) { if (input_echo) {
for (auto id : embd) { for (auto id : embd) {
printf("%s", llama_token_to_str(ctx, id)); const std::string token_str = llama_token_to_piece(ctx, id);
printf("%s", token_str.c_str());
if (embd.size() > 1) {
input_tokens.push_back(id);
} else {
output_tokens.push_back(id);
output_ss << token_str;
}
} }
fflush(stdout); fflush(stdout);
} }
// reset color to default if there is no pending user input
if (input_echo && (int)embd_inp.size() == n_consumed) { if (input_echo && (int) embd_inp.size() == n_consumed) {
console::set_display(console::reset); console::set_display(console::reset);
} }
// if not currently processing queued inputs; // if not currently processing queued inputs;
if ((int) embd_inp.size() <= n_consumed) { if ((int) embd_inp.size() <= n_consumed) {
// check for reverse prompt // check for reverse prompt
if (params.antiprompt.size()) { if (params.antiprompt.size()) {
std::string last_output; std::string last_output;
for (auto id : last_n_tokens) { for (auto id : last_tokens) {
last_output += llama_token_to_str(ctx, id); last_output += llama_token_to_piece(ctx, id);
} }
is_antiprompt = false; is_antiprompt = false;
@ -691,7 +712,7 @@ int main(int argc, char ** argv) {
? last_output.length() - static_cast<size_t>(antiprompt.length() + extra_padding) ? last_output.length() - static_cast<size_t>(antiprompt.length() + extra_padding)
: 0; : 0;
if (last_output.find(antiprompt.c_str(), search_start_pos) != std::string::npos) { if (last_output.find(antiprompt, search_start_pos) != std::string::npos) {
if (params.interactive) { if (params.interactive) {
is_interacting = true; is_interacting = true;
console::set_display(console::user_input); console::set_display(console::user_input);
@ -701,10 +722,16 @@ int main(int argc, char ** argv) {
break; break;
} }
} }
if (is_antiprompt) {
LOG("found antiprompt: %s\n", last_output.c_str());
}
} }
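
A sketch of the reverse-prompt check above: only the tail of the accumulated output is searched, with a small padding allowance because token boundaries rarely line up exactly with the antiprompt string. The fixed padding of 2 here is an assumption chosen for illustration.

```cpp
#include <string>

static bool ends_with_antiprompt(const std::string & last_output, const std::string & antiprompt) {
    const size_t extra_padding = 2; // slack for tokenization differences (assumed value)
    const size_t search_start = last_output.length() > antiprompt.length() + extra_padding
        ? last_output.length() - (antiprompt.length() + extra_padding)
        : 0;
    return last_output.find(antiprompt, search_start) != std::string::npos;
}
```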
// deal with end of text token in interactive mode // deal with end of text token in interactive mode
if (last_n_tokens.back() == llama_token_eos()) { if (last_tokens.back() == llama_token_eos(ctx)) {
LOG("found EOS token\n");
if (params.interactive) { if (params.interactive) {
if (params.antiprompt.size() != 0) { if (params.antiprompt.size() != 0) {
// tokenize and inject first reverse prompt // tokenize and inject first reverse prompt
@ -723,16 +750,20 @@ int main(int argc, char ** argv) {
} }
if (n_past > 0 && is_interacting) { if (n_past > 0 && is_interacting) {
LOG("waiting for user input\n");
if (params.instruct) { if (params.instruct) {
printf("\n> "); printf("\n> ");
} }
if (params.input_prefix_bos) { if (params.input_prefix_bos) {
embd_inp.push_back(llama_token_bos()); LOG("adding input prefix BOS token\n");
embd_inp.push_back(llama_token_bos(ctx));
} }
std::string buffer; std::string buffer;
if (!params.input_prefix.empty()) { if (!params.input_prefix.empty()) {
LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
buffer += params.input_prefix; buffer += params.input_prefix;
printf("%s", buffer.c_str()); printf("%s", buffer.c_str());
} }
@ -752,25 +783,43 @@ int main(int argc, char ** argv) {
if (buffer.length() > 1) { if (buffer.length() > 1) {
// append input suffix if any // append input suffix if any
if (!params.input_suffix.empty()) { if (!params.input_suffix.empty()) {
LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
buffer += params.input_suffix; buffer += params.input_suffix;
printf("%s", params.input_suffix.c_str()); printf("%s", params.input_suffix.c_str());
} }
LOG("buffer: '%s'\n", buffer.c_str());
const size_t original_size = embd_inp.size();
// instruct mode: insert instruction prefix // instruct mode: insert instruction prefix
if (params.instruct && !is_antiprompt) { if (params.instruct && !is_antiprompt) {
LOG("inserting instruction prefix\n");
n_consumed = embd_inp.size(); n_consumed = embd_inp.size();
embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end()); embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
} }
auto line_inp = ::llama_tokenize(ctx, buffer, false); const auto line_inp = ::llama_tokenize(ctx, buffer, false);
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp));
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
// instruct mode: insert response suffix // instruct mode: insert response suffix
if (params.instruct) { if (params.instruct) {
LOG("inserting instruction suffix\n");
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
} }
for (size_t i = original_size; i < embd_inp.size(); ++i) {
const llama_token token = embd_inp[i];
output_tokens.push_back(token);
output_ss << llama_token_to_piece(ctx, token);
}
n_remain -= line_inp.size(); n_remain -= line_inp.size();
LOG("n_remain: %d\n", n_remain);
} else {
LOG("empty line, passing control back\n");
} }
input_echo = false; // do not echo this again input_echo = false; // do not echo this again
@ -782,8 +831,7 @@ int main(int argc, char ** argv) {
if (grammar != NULL) { if (grammar != NULL) {
llama_grammar_free(grammar); llama_grammar_free(grammar);
std::vector<const llama_grammar_element *> grammar_rules( std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
parsed_grammar.c_rules());
grammar = llama_grammar_init( grammar = llama_grammar_init(
grammar_rules.data(), grammar_rules.size(), grammar_rules.data(), grammar_rules.size(),
parsed_grammar.symbol_ids.at("root")); parsed_grammar.symbol_ids.at("root"));
@ -794,24 +842,27 @@ int main(int argc, char ** argv) {
} }
// end of text token // end of text token
if (!embd.empty() && embd.back() == llama_token_eos() && !(params.instruct || params.interactive)) { if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !(params.instruct || params.interactive)) {
fprintf(stderr, " [end of text]\n"); LOG_TEE(" [end of text]\n");
break; break;
} }
// In interactive mode, respect the maximum number of tokens and drop back to user input when reached. // In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
if (params.interactive && n_remain <= 0 && params.n_predict != -1) { // We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size).
if (params.interactive && n_remain <= 0 && params.n_predict >= 0) {
n_remain = params.n_predict; n_remain = params.n_predict;
is_interacting = true; is_interacting = true;
} }
} }
if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) { if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
fprintf(stderr, "\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str()); LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
} }
llama_print_timings(ctx); llama_print_timings(ctx);
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
if (ctx_guidance) { llama_free(ctx_guidance); } if (ctx_guidance) { llama_free(ctx_guidance); }
llama_free(ctx); llama_free(ctx);
llama_free_model(model); llama_free_model(model);
@ -821,5 +872,9 @@ int main(int argc, char ** argv) {
} }
llama_backend_free(); llama_backend_free();
#ifndef LOG_DISABLE_LOGS
LOG_TEE("Log end\n")
#endif // LOG_DISABLE_LOGS
return 0; return 0;
} }

examples/make-ggml.py Normal file → Executable file

@ -1,3 +1,4 @@
#!/usr/bin/env python3
""" """
This script converts Hugging Face llama models to GGML and quantizes them. This script converts Hugging Face llama models to GGML and quantizes them.


@ -2,7 +2,7 @@
// //
// - First, export a LLaMA graph: // - First, export a LLaMA graph:
// //
// $ ./bin/main -m ../models/7B/ggml-model-q4_0.bin --export // $ ./bin/main -m ../models/7B/ggml-model-q4_0.gguf --export
// //
// - Run this tool to evaluate the exported graph: // - Run this tool to evaluate the exported graph:
// //


@ -3,14 +3,79 @@
#include "build-info.h" #include "build-info.h"
#include <cmath> #include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime> #include <ctime>
#include <sstream> #include <sstream>
#include <cstring> #include <thread>
#include <mutex>
#include <vector>
#if defined(_MSC_VER) #if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data #pragma warning(disable: 4244 4267) // possible loss of data
#endif #endif
struct results_perplexity {
std::vector<llama_token> tokens;
double ppl_value;
std::vector<float> logits;
std::vector<float> probs;
};
struct results_log_softmax {
double log_softmax;
float logit;
float prob;
};
void write_logfile(const llama_context * ctx, const gpt_params & params,
const llama_model * model, const struct results_perplexity & results) {
if (params.logdir.empty()) {
return;
}
if (params.hellaswag) {
fprintf(stderr, "%s: warning: logging results is not implemented for HellaSwag. No files will be written.\n", __func__);
return;
}
const std::string timestamp = get_sortable_timestamp();
const bool success = create_directory_with_parents(params.logdir);
if (!success) {
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
__func__, params.logdir.c_str());
return;
}
const std::string logfile_path = params.logdir + timestamp + ".yml";
FILE * logfile = fopen(logfile_path.c_str(), "w");
if (logfile == NULL) {
fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
return;
}
fprintf(logfile, "binary: main\n");
char model_desc[128];
llama_model_desc(model, model_desc, sizeof(model_desc));
dump_non_result_info_yaml(logfile, params, ctx, timestamp, results.tokens, model_desc);
fprintf(logfile, "\n");
fprintf(logfile, "######################\n");
fprintf(logfile, "# Perplexity Results #\n");
fprintf(logfile, "######################\n");
fprintf(logfile, "\n");
dump_vector_float_yaml(logfile, "logits", results.logits);
fprintf(logfile, "ppl_value: %f\n", results.ppl_value);
dump_vector_float_yaml(logfile, "probs", results.probs);
llama_dump_timing_info_yaml(logfile, ctx);
fclose(logfile);
}
std::vector<float> softmax(const std::vector<float>& logits) { std::vector<float> softmax(const std::vector<float>& logits) {
std::vector<float> probs(logits.size()); std::vector<float> probs(logits.size());
float max_logit = logits[0]; float max_logit = logits[0];
@ -27,14 +92,86 @@ std::vector<float> softmax(const std::vector<float>& logits) {
return probs; return probs;
} }
void perplexity(llama_context * ctx, const gpt_params & params) { results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
float max_logit = logits[0];
for (int i = 1; i < n_vocab; ++i) max_logit = std::max(max_logit, logits[i]);
double sum_exp = 0.0;
for (int i = 0; i < n_vocab; ++i) sum_exp += expf(logits[i] - max_logit);
return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp};
}
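
The `log_softmax` helper above uses the numerically stable identity log p(tok) = logit[tok] - max - log(sum_i exp(logit[i] - max)). A tiny standalone check of that identity on a hand-picked vector:

```cpp
#include <algorithm>
#include <cassert>
#include <cmath>
#include <vector>

int main() {
    const std::vector<double> logits = {1.0, 2.0, 3.0};
    const int tok = 2;

    // stable form
    const double max_logit = *std::max_element(logits.begin(), logits.end());
    double sum_exp = 0.0;
    for (double l : logits) sum_exp += std::exp(l - max_logit);
    const double log_p = logits[tok] - max_logit - std::log(sum_exp);

    // direct (overflow-prone) reference, fine for this tiny example
    double denom = 0.0;
    for (double l : logits) denom += std::exp(l);
    assert(std::fabs(log_p - std::log(std::exp(logits[tok]) / denom)) < 1e-9);
    return 0;
}
```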
void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
double & nll, double & nll2, float * logit_history, float * prob_history) {
std::mutex mutex;
int counter = 0;
auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
double local_nll = 0, local_nll2 = 0;
while (true) {
std::unique_lock<std::mutex> lock(mutex);
int i = counter++;
if (i >= n_token) {
nll += local_nll; nll2 += local_nll2;
break;
}
lock.unlock();
const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]);
const double v = -results.log_softmax;
local_nll += v;
local_nll2 += v*v;
logit_history[i] = results.logit;
prob_history[i] = results.prob;
}
};
for (auto & w : workers) w = std::thread(compute);
compute();
for (auto & w : workers) w.join();
}
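
`process_logits` above distributes work with a shared counter guarded by a mutex, each worker keeping local sums that are folded into the shared totals when the indices run out. The same pattern in a stripped-down, self-contained form (`parallel_sum` is a hypothetical name, not the real function):

```cpp
#include <algorithm>
#include <mutex>
#include <thread>
#include <vector>

static double parallel_sum(const std::vector<double> & values, int n_threads) {
    std::mutex mutex;
    int counter = 0;
    double total = 0.0;

    auto worker = [&]() {
        double local = 0.0;
        while (true) {
            std::unique_lock<std::mutex> lock(mutex);
            const int i = counter++;
            if (i >= (int) values.size()) {
                total += local; // fold the local sum into the shared total while still holding the lock
                break;
            }
            lock.unlock();
            local += values[i];
        }
    };

    std::vector<std::thread> workers(std::max(0, n_threads - 1));
    for (auto & w : workers) w = std::thread(worker);
    worker(); // the calling thread participates as well
    for (auto & w : workers) w.join();
    return total;
}
```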
results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw` // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
// Output: `perplexity: 13.5106 [114/114]` // Output: `perplexity: 13.5106 [114/114]`
// BOS tokens will be added for each chunk before eval // BOS tokens will be added for each chunk before eval
auto tokens = ::llama_tokenize(ctx, params.prompt, true);
const int n_chunk_max = tokens.size() / params.n_ctx; const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
const bool add_bos = is_spm;
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
if (int(tokens.size()) < 2*params.n_ctx) {
fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*params.n_ctx,
params.n_ctx);
fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
return {std::move(tokens), 0., {}, {}};
}
std::vector<float> logit_history;
std::vector<float> prob_history;
logit_history.resize(tokens.size());
prob_history.resize(tokens.size());
if (params.ppl_stride <= 0) {
fprintf(stderr, "%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
return {tokens, -1, logit_history, prob_history};
}
const int calc_chunk = params.n_ctx;
fprintf(stderr, "%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);
if (int(tokens.size()) <= calc_chunk) {
fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
tokens.size(), params.n_ctx, params.ppl_stride);
return {tokens, -1, logit_history, prob_history};
}
const int n_chunk_max = (tokens.size() - calc_chunk + params.ppl_stride - 1) / params.ppl_stride;
const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max); const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
const int n_vocab = llama_n_vocab(ctx); const int n_vocab = llama_n_vocab(ctx);
@ -45,6 +182,133 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch); fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
for (int i = 0; i < n_chunk; ++i) {
const int start = i * params.ppl_stride;
const int end = start + calc_chunk;
const int num_batches = (calc_chunk + n_batch - 1) / n_batch;
//fprintf(stderr, "%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches);
std::vector<float> logits;
const auto t_start = std::chrono::high_resolution_clock::now();
for (int j = 0; j < num_batches; ++j) {
const int batch_start = start + j * n_batch;
const int batch_size = std::min(end - batch_start, n_batch);
//fprintf(stderr, " Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) {
//fprintf(stderr, "%s : failed to eval\n", __func__);
return {tokens, -1, logit_history, prob_history};
}
// save original token and restore it after eval
const auto token_org = tokens[batch_start];
// add BOS token for the first batch of each chunk
if (add_bos && j == 0) {
tokens[batch_start] = llama_token_bos(ctx);
}
const auto batch_logits = llama_get_logits(ctx);
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
if (j == 0) {
tokens[batch_start] = token_org;
}
}
const auto t_end = std::chrono::high_resolution_clock::now();
if (i == 0) {
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
int total_seconds = (int)(t_total * n_chunk);
if (total_seconds >= 60*60) {
fprintf(stderr, "%d hours ", total_seconds / (60*60));
total_seconds = total_seconds % (60*60);
}
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
}
//fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
for (int j = params.n_ctx - params.ppl_stride - 1; j < params.n_ctx - 1; ++j) {
// Calculate probability of next token, given the previous ones.
const std::vector<float> tok_logits(
logits.begin() + (j + 0) * n_vocab,
logits.begin() + (j + 1) * n_vocab);
const float prob = softmax(tok_logits)[tokens[start + j + 1]];
logit_history[start + j + 1] = tok_logits[tokens[start + j + 1]];
prob_history[start + j + 1] = prob;
nll += -std::log(prob);
++count;
}
// perplexity is e^(average negative log-likelihood)
if (params.ppl_output_type == 0) {
printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
} else {
printf("%8d %.4lf\n", i*params.ppl_stride, std::exp(nll / count));
}
fflush(stdout);
}
printf("\n");
return {tokens, std::exp(nll / count), logit_history, prob_history};
}
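
As in `perplexity_v2` above, the final number is exp of the average negative log-likelihood over the scored tokens. A tiny worked example with made-up per-token probabilities:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const std::vector<double> token_probs = {0.25, 0.5, 0.125}; // hypothetical per-token probabilities
    double nll = 0.0;
    for (double p : token_probs) nll += -std::log(p);
    printf("perplexity = %.4f\n", std::exp(nll / token_probs.size())); // prints 4.0000 for these values
    return 0;
}
```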
results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
if (params.ppl_stride > 0) {
return perplexity_v2(ctx, params);
}
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
// Output: `perplexity: 13.5106 [114/114]`
// BOS tokens will be added for each chunk before eval
const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
const bool add_bos = is_spm;
auto tim1 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
auto tim2 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
if (int(tokens.size()) < 2*params.n_ctx) {
fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*params.n_ctx,
params.n_ctx);
fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
return {std::move(tokens), 0., {}, {}};
}
std::vector<float> logit_history;
logit_history.resize(tokens.size());
std::vector<float> prob_history;
prob_history.resize(tokens.size());
const int n_chunk_max = tokens.size() / params.n_ctx;
const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
const int n_vocab = llama_n_vocab(ctx);
const int n_batch = params.n_batch;
int count = 0;
double nll = 0.0;
double nll2 = 0.0;
fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
for (int i = 0; i < n_chunk; ++i) { for (int i = 0; i < n_chunk; ++i) {
const int start = i * params.n_ctx; const int start = i * params.n_ctx;
const int end = start + params.n_ctx; const int end = start + params.n_ctx;
@ -63,13 +327,13 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
const auto token_org = tokens[batch_start]; const auto token_org = tokens[batch_start];
// add BOS token for the first batch of each chunk // add BOS token for the first batch of each chunk
if (j == 0) { if (add_bos && j == 0) {
tokens[batch_start] = llama_token_bos(); tokens[batch_start] = llama_token_bos(ctx);
} }
if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) { if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__); fprintf(stderr, "%s : failed to eval\n", __func__);
return; return {tokens, -1, logit_history, prob_history};
} }
// restore the original token in case it was set to BOS // restore the original token in case it was set to BOS
@ -104,22 +368,36 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
// Example, we have a context window of 512, we will compute perplexity for each of the // Example, we have a context window of 512, we will compute perplexity for each of the
// last 256 tokens. Then, we split the input up into context window size chunks to // last 256 tokens. Then, we split the input up into context window size chunks to
// process the entire prompt. // process the entire prompt.
for (int j = std::min(512, params.n_ctx / 2); j < params.n_ctx - 1; ++j) { const int first = std::min(512, params.n_ctx/2);
// Calculate probability of next token, given the previous ones. process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, params.n_ctx - 1 - first,
const std::vector<float> tok_logits( workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
logits.begin() + (j + 0) * n_vocab, count += params.n_ctx - first - 1;
logits.begin() + (j + 1) * n_vocab);
const float prob = softmax(tok_logits)[tokens[start + j + 1]];
nll += -std::log(prob);
++count;
}
// perplexity is e^(average negative log-likelihood) // perplexity is e^(average negative log-likelihood)
printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); if (params.ppl_output_type == 0) {
printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
} else {
double av = nll/count;
double av2 = nll2/count - av*av;
if (av2 > 0) av2 = sqrt(av2/(count-1));
printf("%8d %.4lf %4lf %4lf\n", i*params.n_ctx, std::exp(nll / count), av, av2);
}
fflush(stdout); fflush(stdout);
} }
printf("\n"); printf("\n");
nll2 /= count;
nll /= count;
const double ppl = exp(nll);
nll2 -= nll * nll;
if (nll2 > 0) {
nll2 = sqrt(nll2/(count-1));
printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
} else {
printf("Unexpected negative standard deviation of log(prob)\n");
}
return {tokens, ppl, logit_history, prob_history};
} }
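
The `+/-` term printed above is the standard error of the mean negative log-likelihood propagated through `exp()`: if se is the standard error of the mean NLL, the uncertainty on the perplexity is roughly se * ppl. A self-contained sketch with made-up per-token NLL values:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const std::vector<double> nlls = {1.2, 1.4, 1.3, 1.5}; // hypothetical per-token -log p values
    const size_t count = nlls.size();

    double nll = 0.0, nll2 = 0.0;
    for (double v : nlls) { nll += v; nll2 += v * v; }
    nll  /= count;
    nll2  = nll2 / count - nll * nll; // variance of the per-token NLL

    const double ppl = std::exp(nll);
    const double se  = std::sqrt(nll2 / (count - 1)); // standard error of the mean NLL
    printf("Final estimate: PPL = %.4f +/- %.5f\n", ppl, se * ppl);
    return 0;
}
```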
std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch, std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch,
@ -177,8 +455,11 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
size_t hs_task_count = prompt_lines.size()/6; size_t hs_task_count = prompt_lines.size()/6;
fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count); fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
fprintf(stderr, "================================= is_spm = %d\n", is_spm);
// This is needed as usual for LLaMA models // This is needed as usual for LLaMA models
bool prepend_bos = true; const bool add_bos = is_spm;
// Number of tasks to use when computing the score // Number of tasks to use when computing the score
if ( params.hellaswag_tasks < hs_task_count ) { if ( params.hellaswag_tasks < hs_task_count ) {
@ -216,7 +497,7 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
hs_data[i].context = prompt_lines[idx*6]; hs_data[i].context = prompt_lines[idx*6];
hs_data[i].gold_ending_idx = std::stoi( prompt_lines[idx*6+1] ); hs_data[i].gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
for (size_t j=0; j < 4; j++) { for (size_t j=0; j < 4; j++) {
hs_data[i].ending[j] = " " + prompt_lines[idx*6+2+j]; hs_data[i].ending[j] = prompt_lines[idx*6+2+j];
} }
// Delete the selected random example from the prompt // Delete the selected random example from the prompt
@ -231,19 +512,30 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
double acc = 0.0f; double acc = 0.0f;
const int n_vocab = llama_n_vocab(ctx); const int n_vocab = llama_n_vocab(ctx);
std::vector<std::vector<int>> ending_tokens(4);
std::vector<float> tok_logits(n_vocab); std::vector<float> tok_logits(n_vocab);
for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) { for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) {
// Tokenize the context to count tokens // Tokenize the context to count tokens
std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, prepend_bos); std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, add_bos);
size_t context_size = context_embd.size(); size_t context_size = context_embd.size();
for (int i = 0; i < 4; ++i) {
ending_tokens[i] = ::llama_tokenize(ctx, hs_data[task_idx].context + " " + hs_data[task_idx].ending[i], add_bos);
for (int k = 0; k < int(context_size); ++k) {
if (ending_tokens[i][k] != context_embd[k]) {
fprintf(stderr, "Oops: ending %d of task %d differs from context at position %d\n",i,int(task_idx),k);
break;
}
}
}
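
The check above relies on each "context + ending" tokenization starting with the tokens of the context alone, so only the ending suffix needs a separate evaluation. A standalone sketch of that prefix check and suffix extraction, using plain int vectors rather than the llama.cpp tokenizer (`ending_suffix` is a hypothetical helper):

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

static std::vector<int> ending_suffix(const std::vector<int> & context_tokens,
                                      const std::vector<int> & full_tokens) {
    const size_t n = std::min(context_tokens.size(), full_tokens.size());
    for (size_t k = 0; k < n; ++k) {
        if (full_tokens[k] != context_tokens[k]) {
            fprintf(stderr, "warning: ending diverges from context at position %zu\n", k);
            break;
        }
    }
    // tokens after the shared context are the part that still needs to be evaluated
    return std::vector<int>(full_tokens.begin() + n, full_tokens.end());
}
```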
// Do the 1st ending // Do the 1st ending
// In this case we include the context when evaluating // In this case we include the context when evaluating
auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], prepend_bos); //auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], add_bos);
auto query_embd = ending_tokens[0];
auto query_size = query_embd.size(); auto query_size = query_embd.size();
//printf("First query: %d\n",(int)query_size);
// Stop if query wont fit the ctx window // Stop if query wont fit the ctx window
if (query_size > (size_t)params.n_ctx) { if (query_size > (size_t)params.n_ctx) {
@ -288,7 +580,8 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
for (size_t ending_idx = 1; ending_idx < 4; ending_idx++) { for (size_t ending_idx = 1; ending_idx < 4; ending_idx++) {
// Tokenize the query // Tokenize the query
query_embd = ::llama_tokenize(ctx, hs_data[task_idx].ending[ending_idx], false); query_embd.resize(ending_tokens[ending_idx].size() - context_size);
std::memcpy(query_embd.data(), ending_tokens[ending_idx].data() + context_size, query_embd.size()*sizeof(int));
query_size = query_embd.size(); query_size = query_embd.size();
// Stop if query wont fit the ctx window // Stop if query wont fit the ctx window
@ -369,6 +662,12 @@ int main(int argc, char ** argv) {
params.perplexity = true; params.perplexity = true;
params.n_batch = std::min(params.n_batch, params.n_ctx); params.n_batch = std::min(params.n_batch, params.n_ctx);
if (params.ppl_stride > 0) {
fprintf(stderr, "Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
params.n_ctx, params.n_ctx + params.ppl_stride/2);
params.n_ctx += params.ppl_stride/2;
}
if (params.n_ctx > 2048) { if (params.n_ctx > 2048) {
fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);" fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
"expect poor results\n", __func__, params.n_ctx); "expect poor results\n", __func__, params.n_ctx);
@ -406,13 +705,16 @@ int main(int argc, char ** argv) {
params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
} }
struct results_perplexity results;
if (params.hellaswag) { if (params.hellaswag) {
hellaswag_score(ctx, params); hellaswag_score(ctx, params);
} else { } else {
perplexity(ctx, params); results = perplexity(ctx, params);
} }
llama_print_timings(ctx); llama_print_timings(ctx);
write_logfile(ctx, params, model, results);
llama_free(ctx); llama_free(ctx);
llama_free_model(model); llama_free_model(model);


@ -24,7 +24,7 @@
#endif #endif
struct quantize_stats_params { struct quantize_stats_params {
std::string model = "models/7B/ggml-model-f16.bin"; std::string model = "models/7B/ggml-model-f16.gguf";
bool verbose = false; bool verbose = false;
bool per_layer_stats = false; bool per_layer_stats = false;
bool print_histogram = false; bool print_histogram = false;


@ -14,27 +14,29 @@ struct quant_option {
}; };
static const std::vector<struct quant_option> QUANT_OPTIONS = { static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 3.50G, +0.2499 ppl @ 7B", }, { "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 3.56G, +0.2166 ppl @ LLaMA-v1-7B", },
{ "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1846 ppl @ 7B", }, { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", },
{ "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.30G, +0.0796 ppl @ 7B", }, { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
{ "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0415 ppl @ 7B", }, { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
#ifdef GGML_USE_K_QUANTS #ifdef GGML_USE_K_QUANTS
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.67G, +0.8698 ppl @ 7B", }, { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" }, { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
{ "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5505 ppl @ 7B", }, { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
{ "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.06G, +0.2437 ppl @ 7B", }, { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
{ "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1803 ppl @ 7B", }, { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },
{ "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", }, { "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
{ "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.56G, +0.1149 ppl @ 7B", }, { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.59G, +0.0992 ppl @ LLaMA-v1-7B", },
{ "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0535 ppl @ 7B", }, { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0532 ppl @ LLaMA-v1-7B", },
{ "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", }, { "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", },
{ "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0353 ppl @ 7B", }, { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", },
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0142 ppl @ 7B", }, { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, +0.0044 ppl @ 7B", }, { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, -0.0008 ppl @ LLaMA-v1-7B", },
#endif #endif
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ 7B", }, { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", }, { "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", },
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", }, { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
// Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
{ "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },
}; };
@ -68,15 +70,20 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std:
} }
// usage: // usage:
// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads] // ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
// //
void usage(const char * executable) { void usage(const char * executable) {
fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.bin [model-quant.bin] type [nthreads]\n\n", executable); printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
fprintf(stderr, " --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
fprintf(stderr, " --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
fprintf(stderr, "\nAllowed quantization types:\n"); printf("\nAllowed quantization types:\n");
for (auto & it : QUANT_OPTIONS) { for (auto & it : QUANT_OPTIONS) {
printf(" %2d or %-6s : %s\n", it.ftype, it.name.c_str(), it.desc.c_str()); if (it.name != "COPY") {
printf(" %2d or ", it.ftype);
} else {
printf(" ");
}
printf("%-6s : %s\n", it.name.c_str(), it.desc.c_str());
} }
exit(1); exit(1);
} }
@ -100,7 +107,7 @@ int main(int argc, char ** argv) {
} }
} }
if (argc - arg_idx < 3) { if (argc - arg_idx < 2) {
usage(argv[0]); usage(argv[0]);
} }
@ -114,13 +121,16 @@ int main(int argc, char ** argv) {
std::string ftype_str; std::string ftype_str;
if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) { if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
std::string fpath; std::string fpath;
const size_t pos = fname_inp.find_last_of('/'); const size_t pos = fname_inp.find_last_of("/\\");
if (pos != std::string::npos) { if (pos != std::string::npos) {
fpath = fname_inp.substr(0, pos + 1); fpath = fname_inp.substr(0, pos + 1);
} }
// export as [inp path]/ggml-model-[ftype].bin // export as [inp path]/ggml-model-[ftype].gguf
fname_out = fpath + "ggml-model-" + ftype_str + ".bin"; fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
arg_idx++; arg_idx++;
if (ftype_str == "COPY") {
params.only_copy = true;
}
} }
else { else {
fname_out = argv[arg_idx]; fname_out = argv[arg_idx];
@ -133,6 +143,10 @@ int main(int argc, char ** argv) {
if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) { if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]); fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
return 1; return 1;
} else {
if (ftype_str == "COPY") {
params.only_copy = true;
}
} }
arg_idx++; arg_idx++;
} }


@ -1,4 +1,3 @@
#!/bin/bash #!/bin/bash
cd `dirname $0` cd `dirname $0`


@ -26,7 +26,6 @@ int main(int argc, char ** argv) {
auto lparams = llama_context_default_params(); auto lparams = llama_context_default_params();
lparams.n_ctx = params.n_ctx; lparams.n_ctx = params.n_ctx;
lparams.n_gqa = params.n_gqa;
lparams.seed = params.seed; lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16; lparams.f16_kv = params.memory_f16;
lparams.use_mmap = params.use_mmap; lparams.use_mmap = params.use_mmap;
@ -45,9 +44,8 @@ int main(int argc, char ** argv) {
llama_free_model(model); llama_free_model(model);
return 1; return 1;
} }
auto tokens = std::vector<llama_token>(params.n_ctx); auto tokens = llama_tokenize(ctx, params.prompt.c_str(), true);
auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), int(tokens.size()), true); auto n_prompt_tokens = tokens.size();
if (n_prompt_tokens < 1) { if (n_prompt_tokens < 1) {
fprintf(stderr, "%s : failed to tokenize prompt\n", __func__); fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
llama_free(ctx); llama_free(ctx);
@ -89,10 +87,10 @@ int main(int argc, char ** argv) {
} }
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
auto next_token = llama_sample_token(ctx, &candidates_p); auto next_token = llama_sample_token(ctx, &candidates_p);
auto next_token_str = llama_token_to_str(ctx, next_token); auto next_token_str = llama_token_to_piece(ctx, next_token);
last_n_tokens_data.push_back(next_token); last_n_tokens_data.push_back(next_token);
printf("%s", next_token_str); printf("%s", next_token_str.c_str());
if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) { if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
fprintf(stderr, "\n%s : failed to evaluate\n", __func__); fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
llama_free(ctx); llama_free(ctx);
@ -149,10 +147,10 @@ int main(int argc, char ** argv) {
} }
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
auto next_token = llama_sample_token(ctx2, &candidates_p); auto next_token = llama_sample_token(ctx2, &candidates_p);
auto next_token_str = llama_token_to_str(ctx2, next_token); auto next_token_str = llama_token_to_piece(ctx2, next_token);
last_n_tokens_data.push_back(next_token); last_n_tokens_data.push_back(next_token);
printf("%s", next_token_str); printf("%s", next_token_str.c_str());
if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) { if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
fprintf(stderr, "\n%s : failed to evaluate\n", __func__); fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
llama_free(ctx2); llama_free(ctx2);

examples/server-llama2-13B.sh Normal file → Executable file


@ -5,7 +5,7 @@ This example demonstrates a simple HTTP API server and a simple web front end to
Command line options: Command line options:
- `--threads N`, `-t N`: Set the number of threads to use during computation. - `--threads N`, `-t N`: Set the number of threads to use during computation.
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`). - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
- `-m ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses. - `-m ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were built with a context of 4096.
- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance. - `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
@ -48,15 +48,14 @@ To get started right away, run the following command, making sure to use the cor
### Unix-based systems (Linux, macOS, etc.): ### Unix-based systems (Linux, macOS, etc.):
```bash ```bash
./server -m models/7B/ggml-model.bin -c 2048 ./server -m models/7B/ggml-model.gguf -c 2048
``` ```
### Windows: ### Windows:
```powershell ```powershell
server.exe -m models\7B\ggml-model.bin -c 2048 server.exe -m models\7B\ggml-model.gguf -c 2048
``` ```
The above command will start a server that by default listens on `127.0.0.1:8080`. The above command will start a server that by default listens on `127.0.0.1:8080`.
You can consume the endpoints with Postman or Node.js with the axios library. You can visit the web front end at the same URL. You can consume the endpoints with Postman or Node.js with the axios library. You can visit the web front end at the same URL.
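For a quick check from the shell, a request along these lines should work (an illustrative sketch; adjust the host, port and prompt to your setup):

```bash
curl --request POST \
    --url http://127.0.0.1:8080/completion \
    --header "Content-Type: application/json" \
    --data '{"prompt": "Building a website can be done in 10 simple steps:", "n_predict": 128}'
```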
@ -78,34 +77,31 @@ You need to have [Node.js](https://nodejs.org/en) installed.
```bash ```bash
mkdir llama-client mkdir llama-client
cd llama-client cd llama-client
npm init
npm install axios
``` ```
Create an index.js file and put the following inside it: Create an index.js file and put the following inside it:
```javascript ```javascript
const axios = require("axios");
const prompt = `Building a website can be done in 10 simple steps:`; const prompt = `Building a website can be done in 10 simple steps:`;
async function Test() { async function Test() {
let result = await axios.post("http://127.0.0.1:8080/completion", { let response = await fetch("http://127.0.0.1:8080/completion", {
prompt, method: 'POST',
n_predict: 512, body: JSON.stringify({
}); prompt,
n_predict: 512,
// the response is received until completion finish })
console.log(result.data.content); })
console.log((await response.json()).content)
} }
Test(); Test()
``` ```
And run it: And run it:
```bash ```bash
node . node index.js
``` ```
## API Endpoints ## API Endpoints
@ -127,7 +123,7 @@ node .
`stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`. `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
`prompt`: Provide a prompt. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluated. A space is inserted in the front like main.cpp does. `prompt`: Provide a prompt as a string, or as an array of strings and numbers representing tokens. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluated. If the prompt is a string, or an array with the first element given as a string, a space is inserted in the front like main.cpp does.
`stop`: Specify a JSON array of stopping strings. `stop`: Specify a JSON array of stopping strings.
These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []). These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []).
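For illustration, a request that combines these options might look like the following sketch (the numeric entries in `prompt` are placeholder token IDs, not values taken from any particular model):

```bash
curl --request POST \
    --url http://127.0.0.1:8080/completion \
    --header "Content-Type: application/json" \
    --data '{
        "prompt": ["### Instruction:\n", 1234, 5678, "\n### Response:\n"],
        "n_predict": 64,
        "stream": false,
        "stop": ["### Instruction:"]
    }'
```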
@ -168,6 +164,12 @@ node .
Note that the special `BOS` token is not added in front of the text and also a space character is not inserted automatically as it is for `/completion`. Note that the special `BOS` token is not added in front of the text and also a space character is not inserted automatically as it is for `/completion`.
- **POST** `/detokenize`: Convert tokens to text.
*Options:*
`tokens`: Set the tokens to detokenize.
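As a rough illustration, the two endpoints can be used to round-trip text and tokens as sketched below (the token IDs shown are placeholders; real IDs depend on the loaded model). `/detokenize` responds with a JSON object whose `content` field holds the reconstructed text:

```bash
# tokenize some text ...
curl --request POST \
    --url http://127.0.0.1:8080/tokenize \
    --header "Content-Type: application/json" \
    --data '{"content": "Hello, world!"}'

# ... then turn token IDs back into text (placeholder IDs shown)
curl --request POST \
    --url http://127.0.0.1:8080/detokenize \
    --header "Content-Type: application/json" \
    --data '{"tokens": [10994, 29892, 3186, 29991]}'
```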
- **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does. - **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does.
*Options:* *Options:*
@ -1,3 +1,4 @@
#!/usr/bin/env python3
import argparse import argparse
from flask import Flask, jsonify, request, Response from flask import Flask, jsonify, request, Response
import urllib.parse import urllib.parse
examples/server/chat-llama2.sh Normal file → Executable file
examples/server/chat.sh Normal file → Executable file
File diff suppressed because it is too large
@ -102,6 +102,17 @@
padding: 0.5em; padding: 0.5em;
} }
.prob-set {
padding: 0.3em;
border-bottom: 1px solid #ccc;
}
.popover-content {
position: absolute;
background-color: white;
padding: 0.2em;
box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
}
textarea { textarea {
padding: 5px; padding: 5px;
@ -133,11 +144,39 @@
font-size: 80%; font-size: 80%;
color: #888; color: #888;
} }
@keyframes loading-bg-wipe {
0% {
background-position: 0%;
}
100% {
background-position: 100%;
}
}
.loading {
--loading-color-1: #eeeeee00;
--loading-color-2: #eeeeeeff;
background-size: 50% 100%;
background-image: linear-gradient(90deg, var(--loading-color-1), var(--loading-color-2), var(--loading-color-1));
animation: loading-bg-wipe 2s linear infinite;
}
@media (prefers-color-scheme: dark) {
.loading {
--loading-color-1: #22222200;
--loading-color-2: #222222ff;
}
.popover-content {
background-color: black;
}
}
</style> </style>
<script type="module"> <script type="module">
import { import {
html, h, signal, effect, computed, render, useSignal, useEffect, useRef html, h, signal, effect, computed, render, useSignal, useEffect, useRef, Component
} from '/index.js'; } from '/index.js';
import { llama } from '/completion.js'; import { llama } from '/completion.js';
@ -168,6 +207,7 @@
mirostat_tau: 5, // target entropy mirostat_tau: 5, // target entropy
mirostat_eta: 0.1, // learning rate mirostat_eta: 0.1, // learning rate
grammar: '', grammar: '',
n_probs: 0, // no completion_probabilities
}) })
/* START: Support for storing prompt templates and parameters in browser LocalStorage */ /* START: Support for storing prompt templates and parameters in browser LocalStorage */
@ -303,7 +343,10 @@
const llamaStats = signal(null) const llamaStats = signal(null)
const controller = signal(null) const controller = signal(null)
const generating = computed(() => controller.value == null ) // currently generating a completion?
const generating = computed(() => controller.value != null)
// has the user started a chat?
const chatStarted = computed(() => session.value.transcript.length > 0) const chatStarted = computed(() => session.value.transcript.length > 0)
const transcriptUpdate = (transcript) => { const transcriptUpdate = (transcript) => {
@ -334,10 +377,21 @@
const prompt = template(session.value.template, { const prompt = template(session.value.template, {
message: msg, message: msg,
history: session.value.transcript.flatMap(([name, message]) => template(session.value.historyTemplate, {name, message})).join("\n"), history: session.value.transcript.flatMap(
([name, data]) =>
template(
session.value.historyTemplate,
{
name,
message: Array.isArray(data) ?
data.map(msg => msg.content).join('').replace(/^\s/, '') :
data,
}
)
).join("\n"),
}); });
let currentMessage = ''; const currentMessages = [];
const history = session.value.transcript const history = session.value.transcript
const llamaParams = { const llamaParams = {
@ -347,15 +401,19 @@
for await (const chunk of llama(prompt, llamaParams, { controller: controller.value })) { for await (const chunk of llama(prompt, llamaParams, { controller: controller.value })) {
const data = chunk.data; const data = chunk.data;
currentMessage += data.content;
// remove leading whitespace
currentMessage = currentMessage.replace(/^\s+/, "")
transcriptUpdate([...history, ["{{char}}", currentMessage]])
if (data.stop) { if (data.stop) {
console.log("Completion finished: '", currentMessage, "', summary: ", data); while (
currentMessages.length > 0 &&
currentMessages[currentMessages.length - 1].content.match(/\n$/) != null
) {
currentMessages.pop();
}
transcriptUpdate([...history, ["{{char}}", currentMessages]])
console.log("Completion finished: '", currentMessages.map(msg => msg.content).join(''), "', summary: ", data);
} else {
currentMessages.push(data);
transcriptUpdate([...history, ["{{char}}", currentMessages]])
} }
if (data.timings) { if (data.timings) {
@ -397,11 +455,19 @@
return html` return html`
<form onsubmit=${submit}> <form onsubmit=${submit}>
<div> <div>
<textarea type="text" rows=2 onkeypress=${enterSubmits} value="${message}" oninput=${(e) => message.value = e.target.value} placeholder="Say something..."/> <textarea
className=${generating.value ? "loading" : null}
oninput=${(e) => message.value = e.target.value}
onkeypress=${enterSubmits}
placeholder="Say something..."
rows=2
type="text"
value="${message}"
/>
</div> </div>
<div class="right"> <div class="right">
<button type="submit" disabled=${!generating.value} >Send</button> <button type="submit" disabled=${generating.value}>Send</button>
<button onclick=${stop} disabled=${generating}>Stop</button> <button onclick=${stop} disabled=${!generating.value}>Stop</button>
<button onclick=${reset}>Reset</button> <button onclick=${reset}>Reset</button>
</div> </div>
</form> </form>
@ -420,8 +486,18 @@
} }
}, [messages]) }, [messages])
const chatLine = ([user, msg]) => { const chatLine = ([user, data], index) => {
return html`<p key=${msg}><strong>${template(user)}:</strong> <${Markdownish} text=${template(msg)} /></p>` let message
const isArrayMessage = Array.isArray(data)
if (params.value.n_probs > 0 && isArrayMessage) {
message = html`<${Probabilities} data=${data} />`
} else {
const text = isArrayMessage ?
data.map(msg => msg.content).join('').replace(/^\s+/, '') :
data;
message = html`<${Markdownish} text=${template(text)} />`
}
return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>`
}; };
return html` return html`
@ -568,10 +644,71 @@
${FloatField({label: "Mirostat tau", max: 10.0, min: 0.0, name: "mirostat_tau", step: 0.01, value: params.value.mirostat_tau})} ${FloatField({label: "Mirostat tau", max: 10.0, min: 0.0, name: "mirostat_tau", step: 0.01, value: params.value.mirostat_tau})}
${FloatField({label: "Mirostat eta", max: 1.0, min: 0.0, name: "mirostat_eta", step: 0.01, value: params.value.mirostat_eta})} ${FloatField({label: "Mirostat eta", max: 1.0, min: 0.0, name: "mirostat_eta", step: 0.01, value: params.value.mirostat_eta})}
</fieldset> </fieldset>
<fieldset>
${IntField({label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs})}
</fieldset>
</details> </details>
</form> </form>
` `
} }
const probColor = (p) => {
const r = Math.floor(192 * (1 - p));
const g = Math.floor(192 * p);
return `rgba(${r},${g},0,0.3)`;
}
const Probabilities = (params) => {
return params.data.map(msg => {
const { completion_probabilities } = msg;
if (
!completion_probabilities ||
completion_probabilities.length === 0
) return msg.content
if (completion_probabilities.length > 1) {
// Not for byte pair
if (completion_probabilities[0].content.startsWith('byte: \\')) return msg.content
const splitData = completion_probabilities.map(prob => ({
content: prob.content,
completion_probabilities: [prob]
}))
return html`<${Probabilities} data=${splitData} />`
}
const { probs, content } = completion_probabilities[0]
const found = probs.find(p => p.tok_str === msg.content)
const pColor = found ? probColor(found.prob) : 'transparent'
const popoverChildren = html`
<div class="prob-set">
${probs.map((p, index) => {
return html`
<div
key=${index}
title=${`prob: ${p.prob}`}
style=${{
padding: '0.3em',
backgroundColor: p.tok_str === content ? probColor(p.prob) : 'transparent'
}}
>
<span>${p.tok_str}: </span>
<span>${Math.floor(p.prob * 100)}%</span>
</div>
`
})}
</div>
`
return html`
<${Popover} style=${{ backgroundColor: pColor }} popoverChildren=${popoverChildren}>
${msg.content.match(/\n/gim) ? html`<br />` : msg.content}
</>
`
});
}
// poor man's markdown replacement // poor man's markdown replacement
const Markdownish = (params) => { const Markdownish = (params) => {
const md = params.text const md = params.text
@ -600,10 +737,121 @@
` `
} }
// simple popover impl
const Popover = (props) => {
const isOpen = useSignal(false);
const position = useSignal({ top: '0px', left: '0px' });
const buttonRef = useRef(null);
const popoverRef = useRef(null);
const togglePopover = () => {
if (buttonRef.current) {
const rect = buttonRef.current.getBoundingClientRect();
position.value = {
top: `${rect.bottom + window.scrollY}px`,
left: `${rect.left + window.scrollX}px`,
};
}
isOpen.value = !isOpen.value;
};
const handleClickOutside = (event) => {
if (popoverRef.current && !popoverRef.current.contains(event.target) && !buttonRef.current.contains(event.target)) {
isOpen.value = false;
}
};
useEffect(() => {
document.addEventListener('mousedown', handleClickOutside);
return () => {
document.removeEventListener('mousedown', handleClickOutside);
};
}, []);
return html`
<span style=${props.style} ref=${buttonRef} onClick=${togglePopover}>${props.children}</span>
${isOpen.value && html`
<${Portal} into="#portal">
<div
ref=${popoverRef}
class="popover-content"
style=${{
top: position.value.top,
left: position.value.left,
}}
>
${props.popoverChildren}
</div>
</${Portal}>
`}
`;
};
// Source: preact-portal (https://github.com/developit/preact-portal/blob/master/src/preact-portal.js)
/** Redirect rendering of descendants into the given CSS selector */
class Portal extends Component {
componentDidUpdate(props) {
for (let i in props) {
if (props[i] !== this.props[i]) {
return setTimeout(this.renderLayer);
}
}
}
componentDidMount() {
this.isMounted = true;
this.renderLayer = this.renderLayer.bind(this);
this.renderLayer();
}
componentWillUnmount() {
this.renderLayer(false);
this.isMounted = false;
if (this.remote && this.remote.parentNode) this.remote.parentNode.removeChild(this.remote);
}
findNode(node) {
return typeof node === 'string' ? document.querySelector(node) : node;
}
renderLayer(show = true) {
if (!this.isMounted) return;
// clean up old node if moving bases:
if (this.props.into !== this.intoPointer) {
this.intoPointer = this.props.into;
if (this.into && this.remote) {
this.remote = render(html`<${PortalProxy} />`, this.into, this.remote);
}
this.into = this.findNode(this.props.into);
}
this.remote = render(html`
<${PortalProxy} context=${this.context}>
${show && this.props.children || null}
</${PortalProxy}>
`, this.into, this.remote);
}
render() {
return null;
}
}
// higher-order component that renders its first child if it exists.
// used as a conditional rendering proxy.
class PortalProxy extends Component {
getChildContext() {
return this.props.context;
}
render({ children }) {
return children || null;
}
}
function App(props) { function App(props) {
return html` return html`
<div id="container"> <div>
<header> <header>
<h1>llama.cpp</h1> <h1>llama.cpp</h1>
</header> </header>
@ -624,11 +872,13 @@
`; `;
} }
render(h(App), document.body); render(h(App), document.querySelector('#container'));
</script> </script>
</head> </head>
<body> <body>
<div id="container"></div>
<div id="portal"></div>
</body> </body>
</html> </html>
@ -17,6 +17,8 @@
#include "completion.js.hpp" #include "completion.js.hpp"
#include "json-schema-to-grammar.mjs.hpp" #include "json-schema-to-grammar.mjs.hpp"
#include <cstddef>
#ifndef SERVER_VERBOSE #ifndef SERVER_VERBOSE
#define SERVER_VERBOSE 1 #define SERVER_VERBOSE 1
#endif #endif
@ -94,7 +96,7 @@ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
std::string ret; std::string ret;
for (; begin != end; ++begin) for (; begin != end; ++begin)
{ {
ret += llama_token_to_str(ctx, *begin); ret += llama_token_to_piece(ctx, *begin);
} }
return ret; return ret;
} }
@ -123,9 +125,10 @@ static void server_log(const char *level, const char *function, int line,
// format incomplete utf-8 multibyte character for output // format incomplete utf-8 multibyte character for output
static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token) static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
{ {
std::string out = token == -1 ? "" : llama_token_to_str(ctx, token); std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
// if first bit is 1, meaning it's a partial character // if the size is 1 and first bit is 1, meaning it's a partial character
if (out.size() > 0 && (out[0] & 0x80) == 0x80) // (size > 1 meaning it's already a known token)
if (out.size() == 1 && (out[0] & 0x80) == 0x80)
{ {
std::stringstream ss; std::stringstream ss;
ss << std::hex << (out[0] & 0xff); ss << std::hex << (out[0] & 0xff);
@ -190,6 +193,7 @@ struct llama_server_context
size_t n_past = 0; size_t n_past = 0;
size_t n_remain = 0; size_t n_remain = 0;
json prompt;
std::vector<llama_token> embd; std::vector<llama_token> embd;
std::vector<llama_token> last_n_tokens; std::vector<llama_token> last_n_tokens;
@ -267,6 +271,51 @@ struct llama_server_context
return true; return true;
} }
std::vector<llama_token> tokenize(json json_prompt, bool add_bos)
{
// If `add_bos` is true, we only add BOS when json_prompt is a string,
// or the first element of the json_prompt array is a string.
std::vector<llama_token> prompt_tokens;
if (json_prompt.is_array())
{
bool first = true;
for (const auto& p : json_prompt)
{
if (p.is_string())
{
auto s = p.template get<std::string>();
std::vector<llama_token> p;
if (first)
{
p = ::llama_tokenize(ctx, s, add_bos);
first = false;
}
else
{
p = ::llama_tokenize(ctx, s, false);
}
prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
}
else
{
if (first)
{
first = false;
}
prompt_tokens.push_back(p.template get<llama_token>());
}
}
}
else
{
auto s = json_prompt.template get<std::string>();
prompt_tokens = ::llama_tokenize(ctx, s, add_bos);
}
return prompt_tokens;
}
bool loadGrammar() bool loadGrammar()
{ {
if (!params.grammar.empty()) { if (!params.grammar.empty()) {
@ -279,7 +328,7 @@ struct llama_server_context
grammar_parser::print_grammar(stderr, parsed_grammar); grammar_parser::print_grammar(stderr, parsed_grammar);
{ {
auto it = params.logit_bias.find(llama_token_eos()); auto it = params.logit_bias.find(llama_token_eos(ctx));
if (it != params.logit_bias.end() && it->second == -INFINITY) { if (it != params.logit_bias.end() && it->second == -INFINITY) {
LOG_WARNING("EOS token is disabled, which will cause most grammars to fail", {}); LOG_WARNING("EOS token is disabled, which will cause most grammars to fail", {});
} }
@ -294,8 +343,8 @@ struct llama_server_context
void loadPrompt() void loadPrompt()
{ {
params.prompt.insert(0, 1, ' '); // always add a first space auto prompt_tokens = tokenize(prompt, true); // always add BOS
std::vector<llama_token> prompt_tokens = ::llama_tokenize(ctx, params.prompt, true);
num_prompt_tokens = prompt_tokens.size(); num_prompt_tokens = prompt_tokens.size();
if (params.n_keep < 0) if (params.n_keep < 0)
@ -402,7 +451,7 @@ struct llama_server_context
if (params.n_predict == 0) if (params.n_predict == 0)
{ {
has_next_token = false; has_next_token = false;
result.tok = llama_token_eos(); result.tok = llama_token_eos(ctx);
return result; return result;
} }
@ -442,7 +491,7 @@ struct llama_server_context
llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false}; llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false};
// Apply penalties // Apply penalties
float nl_logit = logits[llama_token_nl()]; float nl_logit = logits[llama_token_nl(ctx)];
auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), params.n_ctx); auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), params.n_ctx);
llama_sample_repetition_penalty(ctx, &candidates_p, llama_sample_repetition_penalty(ctx, &candidates_p,
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
@ -452,7 +501,7 @@ struct llama_server_context
last_n_repeat, alpha_frequency, alpha_presence); last_n_repeat, alpha_frequency, alpha_presence);
if (!penalize_nl) if (!penalize_nl)
{ {
logits[llama_token_nl()] = nl_logit; logits[llama_token_nl(ctx)] = nl_logit;
} }
if (grammar != nullptr) { if (grammar != nullptr) {
@ -515,9 +564,9 @@ struct llama_server_context
// decrement remaining sampling budget // decrement remaining sampling budget
--n_remain; --n_remain;
if (!embd.empty() && embd.back() == llama_token_eos()) if (!embd.empty() && embd.back() == llama_token_eos(ctx))
{ {
// stopping_word = llama_token_to_str(ctx, embd.back()); // stopping_word = llama_token_to_piece(ctx, embd.back());
has_next_token = false; has_next_token = false;
stopped_eos = true; stopped_eos = true;
LOG_VERBOSE("eos token found", {}); LOG_VERBOSE("eos token found", {});
@ -564,7 +613,7 @@ struct llama_server_context
{ {
const completion_token_output token_with_probs = nextToken(); const completion_token_output token_with_probs = nextToken();
const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(ctx, token_with_probs.tok); const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
generated_text += token_text; generated_text += token_text;
if (params.n_probs > 0) if (params.n_probs > 0)
@ -652,8 +701,6 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
fprintf(stdout, " -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled"); fprintf(stdout, " -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
fprintf(stdout, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); fprintf(stdout, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
fprintf(stdout, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); fprintf(stdout, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
fprintf(stdout, " -gqa N, --gqa N grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
fprintf(stdout, " -eps N, --rms-norm-eps N rms norm eps (TEMP!!! use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps);
fprintf(stdout, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base); fprintf(stdout, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
fprintf(stdout, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale); fprintf(stdout, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
@ -673,12 +720,11 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
fprintf(stdout, " number of layers to store in VRAM\n"); fprintf(stdout, " number of layers to store in VRAM\n");
fprintf(stdout, " -ts SPLIT --tensor-split SPLIT\n"); fprintf(stdout, " -ts SPLIT --tensor-split SPLIT\n");
fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n"); fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n");
fprintf(stdout, " -mmq, --mul-mat-q use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n" ); fprintf(stdout, " -nommq, --no-mul-mat-q\n");
fprintf(stdout, " Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n" ); fprintf(stdout, " use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
fprintf(stdout, " is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n" ); fprintf(stdout, " Not recommended since this is both slower and uses more VRAM.\n");
#endif #endif
fprintf(stdout, " -m FNAME, --model FNAME\n"); fprintf(stdout, " -m FNAME, --model FNAME\n");
fprintf(stdout, " model path (default: %s)\n", params.model.c_str()); fprintf(stdout, " model path (default: %s)\n", params.model.c_str());
@ -774,23 +820,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
} }
params.n_ctx = std::stoi(argv[i]); params.n_ctx = std::stoi(argv[i]);
} }
else if (arg == "-gqa" || arg == "--gqa")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
params.n_gqa = std::stoi(argv[i]);
}
else if (arg == "-eps" || arg == "--rms-norm-eps") {
if (++i >= argc)
{
invalid_param = true;
break;
}
params.rms_norm_eps = std::stof(argv[i]);
}
else if (arg == "--rope-freq-base") else if (arg == "--rope-freq-base")
{ {
if (++i >= argc) if (++i >= argc)
@ -886,12 +915,12 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n", {}); LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n", {});
#endif // GGML_USE_CUBLAS #endif // GGML_USE_CUBLAS
} }
else if (arg == "--mul-mat-q" || arg == "-mmq") else if (arg == "--no-mul-mat-q" || arg == "-nommq")
{ {
#ifdef GGML_USE_CUBLAS #ifdef GGML_USE_CUBLAS
params.mul_mat_q = true; params.mul_mat_q = false;
#else #else
LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to use mul_mat_q kernels.\n", {}); LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n", {});
#endif // GGML_USE_CUBLAS #endif // GGML_USE_CUBLAS
} }
else if (arg == "--main-gpu" || arg == "-mg") else if (arg == "--main-gpu" || arg == "-mg")
@ -968,7 +997,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
static json format_generation_settings(llama_server_context &llama) static json format_generation_settings(llama_server_context &llama)
{ {
const auto eos_bias = llama.params.logit_bias.find(llama_token_eos()); const auto eos_bias = llama.params.logit_bias.find(llama_token_eos(llama.ctx));
const bool ignore_eos = eos_bias != llama.params.logit_bias.end() && const bool ignore_eos = eos_bias != llama.params.logit_bias.end() &&
eos_bias->second < 0.0f && std::isinf(eos_bias->second); eos_bias->second < 0.0f && std::isinf(eos_bias->second);
@ -1011,7 +1040,7 @@ static json format_timings(llama_server_context &llama)
{ {
const auto timings = llama_get_timings(llama.ctx); const auto timings = llama_get_timings(llama.ctx);
assert(timings.n_eval == llama.num_tokens_predicted); assert(timings.n_eval == ptrdiff_t(llama.num_tokens_predicted));
return json{ return json{
{"prompt_n", timings.n_p_eval}, {"prompt_n", timings.n_p_eval},
@ -1036,7 +1065,7 @@ static json format_final_response(llama_server_context &llama, const std::string
{"tokens_predicted", llama.num_tokens_predicted}, {"tokens_predicted", llama.num_tokens_predicted},
{"tokens_evaluated", llama.num_prompt_tokens}, {"tokens_evaluated", llama.num_prompt_tokens},
{"generation_settings", format_generation_settings(llama)}, {"generation_settings", format_generation_settings(llama)},
{"prompt", llama.params.prompt}, {"prompt", llama.prompt},
{"truncated", llama.truncated}, {"truncated", llama.truncated},
{"stopped_eos", llama.stopped_eos}, {"stopped_eos", llama.stopped_eos},
{"stopped_word", llama.stopped_word}, {"stopped_word", llama.stopped_word},
@ -1075,35 +1104,58 @@ static json format_tokenizer_response(const std::vector<llama_token> &tokens)
{"tokens", tokens}}; {"tokens", tokens}};
} }
static json format_detokenized_response(std::string content)
{
return json{
{"content", content}};
}
template <typename T>
static T json_value(const json &body, const std::string &key, const T &default_value)
{
// Fallback null to default value
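// e.g. json_value(body, "temperature", default_params.temp) returns default_params.temp when the key is absent or null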
return body.contains(key) && !body.at(key).is_null()
? body.value(key, default_value)
: default_value;
}
static void parse_options_completion(const json &body, llama_server_context &llama) static void parse_options_completion(const json &body, llama_server_context &llama)
{ {
gpt_params default_params; gpt_params default_params;
llama.stream = body.value("stream", false); llama.stream = json_value(body, "stream", false);
llama.params.n_predict = body.value("n_predict", default_params.n_predict); llama.params.n_predict = json_value(body, "n_predict", default_params.n_predict);
llama.params.top_k = body.value("top_k", default_params.top_k); llama.params.top_k = json_value(body, "top_k", default_params.top_k);
llama.params.top_p = body.value("top_p", default_params.top_p); llama.params.top_p = json_value(body, "top_p", default_params.top_p);
llama.params.tfs_z = body.value("tfs_z", default_params.tfs_z); llama.params.tfs_z = json_value(body, "tfs_z", default_params.tfs_z);
llama.params.typical_p = body.value("typical_p", default_params.typical_p); llama.params.typical_p = json_value(body, "typical_p", default_params.typical_p);
llama.params.repeat_last_n = body.value("repeat_last_n", default_params.repeat_last_n); llama.params.repeat_last_n = json_value(body, "repeat_last_n", default_params.repeat_last_n);
llama.params.temp = body.value("temperature", default_params.temp); llama.params.temp = json_value(body, "temperature", default_params.temp);
llama.params.repeat_penalty = body.value("repeat_penalty", default_params.repeat_penalty); llama.params.repeat_penalty = json_value(body, "repeat_penalty", default_params.repeat_penalty);
llama.params.presence_penalty = body.value("presence_penalty", default_params.presence_penalty); llama.params.presence_penalty = json_value(body, "presence_penalty", default_params.presence_penalty);
llama.params.frequency_penalty = body.value("frequency_penalty", default_params.frequency_penalty); llama.params.frequency_penalty = json_value(body, "frequency_penalty", default_params.frequency_penalty);
llama.params.mirostat = body.value("mirostat", default_params.mirostat); llama.params.mirostat = json_value(body, "mirostat", default_params.mirostat);
llama.params.mirostat_tau = body.value("mirostat_tau", default_params.mirostat_tau); llama.params.mirostat_tau = json_value(body, "mirostat_tau", default_params.mirostat_tau);
llama.params.mirostat_eta = body.value("mirostat_eta", default_params.mirostat_eta); llama.params.mirostat_eta = json_value(body, "mirostat_eta", default_params.mirostat_eta);
llama.params.penalize_nl = body.value("penalize_nl", default_params.penalize_nl); llama.params.penalize_nl = json_value(body, "penalize_nl", default_params.penalize_nl);
llama.params.n_keep = body.value("n_keep", default_params.n_keep); llama.params.n_keep = json_value(body, "n_keep", default_params.n_keep);
llama.params.seed = body.value("seed", default_params.seed); llama.params.seed = json_value(body, "seed", default_params.seed);
llama.params.prompt = body.value("prompt", default_params.prompt); llama.params.grammar = json_value(body, "grammar", default_params.grammar);
llama.params.grammar = body.value("grammar", default_params.grammar); llama.params.n_probs = json_value(body, "n_probs", default_params.n_probs);
llama.params.n_probs = body.value("n_probs", default_params.n_probs);
if (body.count("prompt") != 0)
{
llama.prompt = body["prompt"];
}
else
{
llama.prompt = "";
}
llama.params.logit_bias.clear(); llama.params.logit_bias.clear();
if (body.value("ignore_eos", false)) if (json_value(body, "ignore_eos", false))
{ {
llama.params.logit_bias[llama_token_eos()] = -INFINITY; llama.params.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY;
} }
const auto &logit_bias = body.find("logit_bias"); const auto &logit_bias = body.find("logit_bias");
@ -1163,6 +1215,62 @@ static void log_server_request(const Request &req, const Response &res)
}); });
} }
bool is_at_eob(llama_server_context & server_context, const llama_token * tokens, const size_t n_tokens) {
return n_tokens && tokens[n_tokens-1] == llama_token_eos(server_context.ctx);
}
// Function matching type llama_beam_search_callback_fn_t.
// This custom callback example is called each time the beam lengths increase:
// * Show progress by printing ',' followed by the number of convergent beam tokens, if any.
// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
// This is also called when the stop condition is met.
// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
void beam_search_callback(void * callback_data, llama_beams_state beams_state) {
auto & llama = *static_cast<llama_server_context*>(callback_data);
// Mark beams as EOS as needed.
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
llama_beam_view& beam_view = beams_state.beam_views[i];
if (!beam_view.eob && is_at_eob(llama, beam_view.tokens, beam_view.n_tokens)) {
beam_view.eob = true;
}
}
printf(","); // Show progress
if (const size_t n = beams_state.common_prefix_length) {
llama.generated_token_probs.resize(llama.generated_token_probs.size() + n);
assert(0u < beams_state.n_beams);
const llama_token * tokens = beams_state.beam_views[0].tokens;
const auto map = [](llama_token tok) { return completion_token_output{{},tok}; };
std::transform(tokens, tokens + n, llama.generated_token_probs.end() - n, map);
printf("%zu", n);
}
fflush(stdout);
#if 0 // DEBUG: print current beams for this iteration
std::cout << "\n\nCurrent beams:\n";
for (size_t i=0 ; i < beams_state.n_beams ; ++i) {
std::cout << "beams["<<i<<"]: " << ostream_beam_view{state.ctx,beams_state.beam_views[i]} << std::endl;
}
#endif
}
struct token_translator {
llama_context * ctx;
std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); }
std::string operator()(completion_token_output cto) const { return (*this)(cto.tok); }
};
void append_to_generated_text_from_generated_token_probs(llama_server_context & llama) {
auto & gtps = llama.generated_token_probs;
auto translator = token_translator{llama.ctx};
auto add_strlen = [=](size_t sum, const completion_token_output & cto) { return sum + translator(cto).size(); };
const size_t len = std::accumulate(gtps.begin(), gtps.end(), size_t(0), add_strlen);
if (llama.generated_text.capacity() < llama.generated_text.size() + len) {
llama.generated_text.reserve(llama.generated_text.size() + len);
}
for (const completion_token_output & cto : gtps) {
llama.generated_text += translator(cto);
}
}
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
// own arguments required by this example // own arguments required by this example
@ -1245,25 +1353,39 @@ int main(int argc, char **argv)
llama.beginCompletion(); llama.beginCompletion();
if (!llama.stream) { if (!llama.stream) {
size_t stop_pos = std::string::npos; if (llama.params.n_beams) {
// Fill llama.generated_token_probs vector with final beam.
llama_beam_search(llama.ctx, beam_search_callback, &llama, llama.params.n_beams,
llama.n_past, llama.n_remain, llama.params.n_threads);
// Translate llama.generated_token_probs to llama.generated_text.
append_to_generated_text_from_generated_token_probs(llama);
} else {
size_t stop_pos = std::string::npos;
while (llama.has_next_token) { while (llama.has_next_token) {
const completion_token_output token_with_probs = llama.doCompletion(); const completion_token_output token_with_probs = llama.doCompletion();
const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(llama.ctx, token_with_probs.tok); const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(llama.ctx, token_with_probs.tok);
stop_pos = llama.findStoppingStrings(llama.generated_text, stop_pos = llama.findStoppingStrings(llama.generated_text,
token_text.size(), STOP_FULL); token_text.size(), STOP_FULL);
}
if (stop_pos == std::string::npos) {
stop_pos = llama.findStoppingStrings(llama.generated_text, 0, STOP_PARTIAL);
}
if (stop_pos != std::string::npos) {
llama.generated_text.erase(llama.generated_text.begin() + stop_pos,
llama.generated_text.end());
}
} }
if (stop_pos == std::string::npos) { auto probs = llama.generated_token_probs;
stop_pos = llama.findStoppingStrings(llama.generated_text, 0, STOP_PARTIAL); if (llama.params.n_probs > 0 && llama.stopped_word) {
} const std::vector<llama_token> stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false);
if (stop_pos != std::string::npos) { probs = std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - stop_word_toks.size());
llama.generated_text.erase(llama.generated_text.begin() + stop_pos,
llama.generated_text.end());
} }
const json data = format_final_response(llama, llama.generated_text, llama.generated_token_probs); const json data = format_final_response(llama, llama.generated_text, probs);
llama_print_timings(llama.ctx); llama_print_timings(llama.ctx);
@ -1276,59 +1398,90 @@ int main(int argc, char **argv)
while (llama.has_next_token) { while (llama.has_next_token) {
const completion_token_output token_with_probs = llama.doCompletion(); const completion_token_output token_with_probs = llama.doCompletion();
const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(llama.ctx, token_with_probs.tok); if (token_with_probs.tok == -1 || llama.multibyte_pending > 0) {
if (llama.multibyte_pending > 0) {
continue; continue;
} }
const std::string token_text = llama_token_to_piece(llama.ctx, token_with_probs.tok);
size_t pos = std::min(sent_count, llama.generated_text.size()); size_t pos = std::min(sent_count, llama.generated_text.size());
const std::string str_test = llama.generated_text.substr(pos); const std::string str_test = llama.generated_text.substr(pos);
bool is_stop_full = false;
size_t stop_pos = size_t stop_pos =
llama.findStoppingStrings(str_test, token_text.size(), STOP_FULL); llama.findStoppingStrings(str_test, token_text.size(), STOP_FULL);
if (stop_pos != std::string::npos) { if (stop_pos != std::string::npos) {
is_stop_full = true;
llama.generated_text.erase( llama.generated_text.erase(
llama.generated_text.begin() + pos + stop_pos, llama.generated_text.begin() + pos + stop_pos,
llama.generated_text.end()); llama.generated_text.end());
pos = std::min(sent_count, llama.generated_text.size()); pos = std::min(sent_count, llama.generated_text.size());
} else { } else {
is_stop_full = false;
stop_pos = llama.findStoppingStrings(str_test, token_text.size(), stop_pos = llama.findStoppingStrings(str_test, token_text.size(),
STOP_PARTIAL); STOP_PARTIAL);
} }
const std::string to_send = llama.generated_text.substr(pos, stop_pos); if (
sent_count += to_send.size(); stop_pos == std::string::npos ||
// Send rest of the text if we are at the end of the generation
(!llama.has_next_token && !is_stop_full && stop_pos > 0)
) {
const std::string to_send = llama.generated_text.substr(pos, std::string::npos);
std::vector<completion_token_output> probs_output = {}; sent_count += to_send.size();
if (llama.params.n_probs > 0) { std::vector<completion_token_output> probs_output = {};
const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size()); if (llama.params.n_probs > 0) {
size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size()); const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
if (probs_pos < probs_stop_pos) { size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
probs_output = std::vector<completion_token_output>(llama.generated_token_probs.begin() + probs_pos, llama.generated_token_probs.begin() + probs_stop_pos); size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
if (probs_pos < probs_stop_pos) {
probs_output = std::vector<completion_token_output>(llama.generated_token_probs.begin() + probs_pos, llama.generated_token_probs.begin() + probs_stop_pos);
}
sent_token_probs_index = probs_stop_pos;
}
const json data = format_partial_response(llama, to_send, probs_output);
const std::string str =
"data: " +
data.dump(-1, ' ', false, json::error_handler_t::replace) +
"\n\n";
LOG_VERBOSE("data stream", {
{ "to_send", str }
});
if (!sink.write(str.data(), str.size())) {
LOG_VERBOSE("stream closed", {});
llama_print_timings(llama.ctx);
return false;
} }
sent_token_probs_index = probs_stop_pos;
} }
const json data = llama.has_next_token if (!llama.has_next_token) {
? format_partial_response(llama, to_send, probs_output) // Generation is done, send extra information.
// Generation is done, send extra information. const json data = format_final_response(
: format_final_response(llama, to_send, llama.generated_token_probs); llama,
"",
std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.begin() + sent_token_probs_index)
);
const std::string str = const std::string str =
"data: " + "data: " +
data.dump(-1, ' ', false, json::error_handler_t::replace) + data.dump(-1, ' ', false, json::error_handler_t::replace) +
"\n\n"; "\n\n";
LOG_VERBOSE("data stream", { LOG_VERBOSE("data stream", {
{ "to_send", str } { "to_send", str }
}); });
if (!sink.write(str.data(), str.size())) { if (!sink.write(str.data(), str.size())) {
LOG_VERBOSE("stream closed", {}); LOG_VERBOSE("stream closed", {});
llama_print_timings(llama.ctx); llama_print_timings(llama.ctx);
return false; return false;
}
} }
} }
@ -1356,11 +1509,29 @@ int main(int argc, char **argv)
auto lock = llama.lock(); auto lock = llama.lock();
const json body = json::parse(req.body); const json body = json::parse(req.body);
const std::string content = body.value("content", ""); std::vector<llama_token> tokens;
const std::vector<llama_token> tokens = llama_tokenize(llama.ctx, content, false); if (body.count("content") != 0)
{
tokens = llama.tokenize(body["content"], false);
}
const json data = format_tokenizer_response(tokens); const json data = format_tokenizer_response(tokens);
return res.set_content(data.dump(), "application/json"); }); return res.set_content(data.dump(), "application/json"); });
svr.Post("/detokenize", [&llama](const Request &req, Response &res)
{
auto lock = llama.lock();
const json body = json::parse(req.body);
std::string content;
if (body.count("tokens") != 0)
{
const std::vector<llama_token> tokens = body["tokens"];
content = tokens_to_str(llama.ctx, tokens.cbegin(), tokens.cend());
}
const json data = format_detokenized_response(content);
return res.set_content(data.dump(), "application/json"); });
svr.Post("/embedding", [&llama](const Request &req, Response &res) svr.Post("/embedding", [&llama](const Request &req, Response &res)
{ {
auto lock = llama.lock(); auto lock = llama.lock();
@ -1369,7 +1540,14 @@ int main(int argc, char **argv)
llama.rewind(); llama.rewind();
llama_reset_timings(llama.ctx); llama_reset_timings(llama.ctx);
llama.params.prompt = body.value("content", ""); if (body.count("content") != 0)
{
llama.prompt = body["content"];
}
else
{
llama.prompt = "";
}
llama.params.n_predict = 0; llama.params.n_predict = 0;
llama.loadPrompt(); llama.loadPrompt();
llama.beginCompletion(); llama.beginCompletion();
@ -1382,7 +1560,7 @@ int main(int argc, char **argv)
svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep) svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep)
{ {
const auto * fmt = "500 Internal Server Error\n%s"; const char fmt[] = "500 Internal Server Error\n%s";
char buf[BUFSIZ]; char buf[BUFSIZ];
try { try {
std::rethrow_exception(std::move(ep)); std::rethrow_exception(std::move(ep));
@ -1398,7 +1576,7 @@ int main(int argc, char **argv)
{ {
if (res.status == 400) { if (res.status == 400) {
res.set_content("Invalid request", "text/plain"); res.set_content("Invalid request", "text/plain");
} else { } else if (res.status != 500) {
res.set_content("File Not Found", "text/plain"); res.set_content("File Not Found", "text/plain");
res.status = 404; res.status = 404;
} }); } });
@ -2,180 +2,129 @@
#define _GNU_SOURCE #define _GNU_SOURCE
#endif #endif
#include "common.h"
#include "llama.h"
#include "build-info.h" #include "build-info.h"
#include <cassert> #include "common.h"
#include <cinttypes> #include "llama.h"
#include <cmath> #include <cmath>
#include <cstdio> #include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string> #include <string>
#include <vector> #include <vector>
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) int main(int argc, char ** argv) {
#include <signal.h>
#include <unistd.h>
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
#include <signal.h>
#endif
int main(int argc, char ** argv)
{
gpt_params params; gpt_params params;
//--------------------------------- if (argc == 1 || argv[1][0] == '-') {
// Print help : printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]);
//---------------------------------
if ( argc == 1 || argv[1][0] == '-' )
{
printf( "usage: %s MODEL_PATH [PROMPT]\n" , argv[0] );
return 1 ; return 1 ;
} }
//--------------------------------- if (argc >= 2) {
// Load parameters :
//---------------------------------
if ( argc >= 2 )
{
params.model = argv[1]; params.model = argv[1];
} }
if ( argc >= 3 ) if (argc >= 3) {
{
params.prompt = argv[2]; params.prompt = argv[2];
} }
if ( params.prompt.empty() ) if (params.prompt.empty()) {
{
params.prompt = "Hello my name is"; params.prompt = "Hello my name is";
} }
//--------------------------------- // init LLM
// Init LLM :
//---------------------------------
llama_backend_init(params.numa); llama_backend_init(params.numa);
llama_model * model; llama_context_params ctx_params = llama_context_default_params();
llama_context * ctx;
std::tie(model, ctx) = llama_init_from_gpt_params( params ); llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params);
if ( model == NULL ) if (model == NULL) {
{ fprintf(stderr , "%s: error: unable to load model\n" , __func__);
fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
return 1; return 1;
} }
//--------------------------------- llama_context * ctx = llama_new_context_with_model(model, ctx_params);
// Tokenize the prompt :
//--------------------------------- // tokenize the prompt
std::vector<llama_token> tokens_list; std::vector<llama_token> tokens_list;
tokens_list = ::llama_tokenize( ctx , params.prompt , true ); tokens_list = ::llama_tokenize(ctx, params.prompt, true);
const int max_context_size = llama_n_ctx( ctx ); const int max_context_size = llama_n_ctx(ctx);
const int max_tokens_list_size = max_context_size - 4 ; const int max_tokens_list_size = max_context_size - 4;
if ( (int)tokens_list.size() > max_tokens_list_size ) if ((int) tokens_list.size() > max_tokens_list_size) {
{ fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) tokens_list.size(), max_tokens_list_size);
fprintf( stderr , "%s: error: prompt too long (%d tokens, max %d)\n" ,
__func__ , (int)tokens_list.size() , max_tokens_list_size );
return 1; return 1;
} }
fprintf( stderr, "\n\n" ); fprintf(stderr, "\n\n");
// Print the tokens from the prompt : for (auto id : tokens_list) {
fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
for( auto id : tokens_list )
{
printf( "%s" , llama_token_to_str( ctx , id ) );
} }
fflush(stdout); fflush(stderr);
// main loop
//---------------------------------
// Main prediction loop :
//---------------------------------
// The LLM keeps a contextual cache memory of previous token evaluation. // The LLM keeps a contextual cache memory of previous token evaluation.
// Usually, once this cache is full, it is required to recompute a compressed context based on previous // Usually, once this cache is full, it is required to recompute a compressed context based on previous
// tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
// example, we will just stop the loop once this cache is full or once an end of stream is detected. // example, we will just stop the loop once this cache is full or once an end of stream is detected.
while ( llama_get_kv_cache_token_count( ctx ) < max_context_size ) const int n_gen = std::min(32, max_context_size);
{
//---------------------------------
// Evaluate the tokens :
//---------------------------------
if ( llama_eval( ctx , tokens_list.data() , int(tokens_list.size()) , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) ) while (llama_get_kv_cache_token_count(ctx) < n_gen) {
{ // evaluate the transformer
fprintf( stderr, "%s : failed to eval\n" , __func__ );
if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return 1; return 1;
} }
tokens_list.clear(); tokens_list.clear();
//--------------------------------- // sample the next token
// Select the best prediction :
//---------------------------------
llama_token new_token_id = 0; llama_token new_token_id = 0;
auto logits = llama_get_logits( ctx ); auto logits = llama_get_logits(ctx);
auto n_vocab = llama_n_vocab( ctx ); // the size of the LLM vocabulary (in tokens) auto n_vocab = llama_n_vocab(ctx);
std::vector<llama_token_data> candidates; std::vector<llama_token_data> candidates;
candidates.reserve( n_vocab ); candidates.reserve(n_vocab);
for( llama_token token_id = 0 ; token_id < n_vocab ; token_id++ ) for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
{ candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
candidates.emplace_back( llama_token_data{ token_id , logits[ token_id ] , 0.0f } );
} }
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
// Select it using the "Greedy sampling" method : new_token_id = llama_sample_token_greedy(ctx , &candidates_p);
new_token_id = llama_sample_token_greedy( ctx , &candidates_p );
// is it an end of stream ? // is it an end of stream ?
if ( new_token_id == llama_token_eos() ) if (new_token_id == llama_token_eos(ctx)) {
{
fprintf(stderr, " [end of text]\n"); fprintf(stderr, " [end of text]\n");
break; break;
} }
// Print the new token : // print the new token :
printf( "%s" , llama_token_to_str( ctx , new_token_id ) ); printf("%s", llama_token_to_piece(ctx, new_token_id).c_str());
fflush( stdout ); fflush(stdout);
// Push this new token for next evaluation : // push this new token for next evaluation
tokens_list.push_back( new_token_id ); tokens_list.push_back(new_token_id);
}
} // wend of main loop llama_free(ctx);
llama_free_model(model);
llama_free( ctx );
llama_free_model( model );
llama_backend_free(); llama_backend_free();
fprintf(stderr, "\n\n");
return 0; return 0;
} }
// EOF
@ -0,0 +1,8 @@
set(TARGET speculative)
add_executable(${TARGET} speculative.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()
@ -0,0 +1,234 @@
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include "build-info.h"
#include "common.h"
#include "llama.h"
#include <cmath>
#include <cstdio>
#include <string>
#include <vector>
int main(int argc, char ** argv) {
gpt_params params;
if (gpt_params_parse(argc, argv, params) == false) {
return 1;
}
if (params.model_draft.empty()) {
fprintf(stderr, "%s: error: --model-draft is required\n", __func__);
return 1;
}
#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("speculative", "log"));
LOG_TEE("Log start\n");
log_dump_cmdline(argc, argv);
#endif // LOG_DISABLE_LOGS
// init llama.cpp
llama_backend_init(params.numa);
llama_model * model_tgt = NULL;
llama_model * model_dft = NULL;
llama_context * ctx_tgt = NULL;
llama_context * ctx_dft = NULL;
// load the target model
params.perplexity = true; // HACK: enable logits_all = true
std::tie(model_tgt, ctx_tgt) = llama_init_from_gpt_params(params);
// load the draft model
params.model = params.model_draft;
std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);
// tokenize the prompt
std::vector<llama_token> inp;
inp = ::llama_tokenize(ctx_tgt, params.prompt, true);
const int max_context_size = llama_n_ctx(ctx_tgt);
const int max_tokens_list_size = max_context_size - 4;
if ((int) inp.size() > max_tokens_list_size) {
fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
return 1;
}
fprintf(stderr, "\n\n");
for (auto id : inp) {
fprintf(stderr, "%s", llama_token_to_piece(ctx_tgt, id).c_str());
}
fflush(stderr);
const int n_input = inp.size();
const auto t_enc_start = ggml_time_us();
// eval the prompt with both models
llama_eval(ctx_tgt, inp.data(), int(inp.size() - 1), 0, params.n_threads);
llama_eval(ctx_tgt, &inp.back(), 1, inp.size() - 1, params.n_threads);
llama_eval(ctx_dft, inp.data(), int(inp.size()), 0, params.n_threads);
const auto t_enc_end = ggml_time_us();
// the 2 models should have the same vocab
const int n_ctx = llama_n_ctx(ctx_tgt);
const int n_vocab = llama_n_vocab(ctx_tgt);
//GGML_ASSERT(n_vocab == llama_n_vocab(ctx_dft));
// how many tokens to draft each time
const int n_draft = params.n_draft;
int n_predict = 0;
int n_drafted = 0;
int n_accept = 0;
int n_past_tgt = inp.size();
int n_past_dft = inp.size();
std::vector<llama_token> drafted;
std::vector<llama_token> last_tokens(n_ctx);
std::fill(last_tokens.begin(), last_tokens.end(), 0);
for (auto & id : inp) {
last_tokens.erase(last_tokens.begin());
last_tokens.push_back(id);
}
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
// used to determine end of generation
bool has_eos = false;
const auto t_dec_start = ggml_time_us();
while (true) {
LOG("drafted: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_dft, drafted));
// sample from the drafted tokens if any
int i_dft = 0;
while (true) {
const llama_token id = llama_sample_token(ctx_tgt, NULL, NULL, params, last_tokens, candidates, i_dft);
last_tokens.erase(last_tokens.begin());
last_tokens.push_back(id);
//LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, last_tokens));
const std::string token_str = llama_token_to_piece(ctx_tgt, id);
printf("%s", token_str.c_str());
fflush(stdout);
if (id == llama_token_eos(ctx_tgt)) {
has_eos = true;
}
++n_predict;
if (i_dft < (int) drafted.size() && id == drafted[i_dft]) {
LOG("drafted token %d accepted\n", id);
++n_accept;
++n_past_tgt;
++n_past_dft;
++i_dft;
continue;
}
// the drafted token was rejected or we are out of drafted tokens
llama_eval(ctx_dft, &id, 1, n_past_dft, params.n_threads);
++n_past_dft;
drafted.clear();
drafted.push_back(id);
break;
}
if (n_predict > params.n_predict || has_eos) {
break;
}
// sample n_draft tokens from the draft model picking the best token
int n_past_cur = n_past_dft;
for (int i = 0; i < n_draft; ++i) {
float * logits = llama_get_logits(ctx_dft);
candidates.clear();
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
// computes softmax and sorts the candidates
llama_sample_softmax(ctx_dft, &cur_p);
for (int i = 0; i < 3; ++i) {
LOG(" - draft candidate %d: %d (%.3f)\n", i, cur_p.data[i].id, cur_p.data[i].p);
}
// too low probability, stop drafting
if (cur_p.data[0].p < 2*cur_p.data[1].p) {
break;
}
drafted.push_back(cur_p.data[0].id);
++n_drafted;
if (i < n_draft - 1) {
// evaluate the drafted token on the draft model
llama_eval(ctx_dft, &drafted.back(), 1, n_past_cur, params.n_threads);
++n_past_cur;
}
}
// evaluate the target model on the drafted tokens
llama_eval(ctx_tgt, drafted.data(), drafted.size(), n_past_tgt, params.n_threads);
++n_past_tgt;
drafted.erase(drafted.begin());
}
auto t_dec_end = ggml_time_us();
LOG_TEE("\n\n");
LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
// TODO: make sure these numbers are computed correctly
LOG_TEE("\n");
LOG_TEE("n_draft = %d\n", n_draft);
LOG_TEE("n_predict = %d\n", n_predict);
LOG_TEE("n_drafted = %d\n", n_drafted);
LOG_TEE("n_accept = %d\n", n_accept);
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
LOG_TEE("\ndraft:\n");
llama_print_timings(ctx_dft);
LOG_TEE("\ntarget:\n");
llama_print_timings(ctx_tgt);
llama_free(ctx_tgt);
llama_free_model(model_tgt);
llama_free(ctx_dft);
llama_free_model(model_dft);
llama_backend_free();
fprintf(stderr, "\n\n");
return 0;
}
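
The loop above interleaves drafting and verification: the draft model proposes up to n_draft tokens, and the target model then accepts them one at a time until the first mismatch, at which point the remaining draft is discarded. A compact, illustrative sketch of that control flow follows; draft_next_token() and target_sample() are hypothetical toy stand-ins for the draft and target model calls, not llama.cpp API functions.

```python
# Minimal sketch of the speculative decoding control flow used above.
# draft_next_token() and target_sample() are toy stand-ins, NOT llama.cpp APIs.
import random

def draft_next_token(prefix):
    # toy "draft model": cheap and deterministic per prefix
    random.seed(hash(tuple(prefix)) & 0xFFFF)
    return random.randrange(8)

def target_sample(prefix):
    # toy "target model": usually agrees with the draft, sometimes diverges
    tok = draft_next_token(prefix)
    return tok if random.random() < 0.8 else (tok + 1) % 8

def speculative_generate(prompt, n_predict=16, n_draft=4):
    out = list(prompt)
    n_accept = n_drafted = 0
    while len(out) - len(prompt) < n_predict:
        # 1) draft a short continuation with the cheap model
        drafted = []
        for _ in range(n_draft):
            drafted.append(draft_next_token(out + drafted))
        n_drafted += len(drafted)
        # 2) verify with the target model: accept while the tokens match
        for d in drafted:
            t = target_sample(out)
            out.append(t)          # the target's token is always kept
            if t == d:
                n_accept += 1      # drafted token accepted, check the next one
            else:
                break              # mismatch: drop the rest of the draft
    return out, n_accept, n_drafted

tokens, n_accept, n_drafted = speculative_generate([1, 2, 3])
print(f"accepted {n_accept}/{n_drafted} drafted tokens")
```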


@ -8,15 +8,15 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
# train # train
./bin/train-text-from-scratch \ ./bin/train-text-from-scratch \
--vocab-model ../models/ggml-vocab.bin \ --vocab-model ../models/ggml-vocab-llama.gguf \
--ctx 64 --embd 256 --head 8 --layer 16 \ --ctx 64 --embd 256 --head 8 --layer 16 \
--checkpoint-in chk-shakespeare-256x16.bin \ --checkpoint-in chk-shakespeare-256x16.gguf \
--checkpoint-out chk-shakespeare-256x16.bin \ --checkpoint-out chk-shakespeare-256x16.gguf \
--model-out ggml-shakespeare-256x16-f32.bin \ --model-out ggml-shakespeare-256x16-f32.gguf \
--train-data "shakespeare.txt" \ --train-data "shakespeare.txt" \
-t 6 -b 16 -n 32 --seed 1 --adam-iter 16 \ -t 6 -b 16 --seed 1 --adam-iter 256 \
--print-details-interval 0 --predict 16 --use-flash --no-checkpointing
# predict # predict
./bin/main -m ggml-shakespeare-256x16-f32.bin ./bin/main -m ggml-shakespeare-256x16-f32.gguf
``` ```
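
The updated commands reference .gguf checkpoints; checkpoints written in the older .bin format can be migrated with the checkpoint conversion script added later in this commit. A hedged example of driving that script from Python; the script path and checkpoint file names are assumptions for illustration, only the --input/--output options are taken from the script itself.

```python
# Convert an old train-text-from-scratch checkpoint (.bin) to GGUF.
# The script path and the checkpoint file names are illustrative assumptions.
import subprocess
import sys

def convert_checkpoint(old_bin: str, new_gguf: str) -> None:
    subprocess.run(
        [
            sys.executable,
            "examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py",
            "--input", old_bin,
            "--output", new_gguf,
        ],
        check=True,
    )

convert_checkpoint("chk-shakespeare-256x16.bin", "chk-shakespeare-256x16.gguf")
```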


@ -0,0 +1,495 @@
#!/usr/bin/env python3
# train-text-from-scratch checkpoint --> gguf conversion
import argparse
import os
import struct
import sys
import numpy as np
from pathlib import Path
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / '..' / '..' / 'gguf-py' / 'gguf'))
import gguf
# gguf constants
LLM_KV_OPTIMIZER_TYPE = "optimizer.type"
LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"
LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"
LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version"
LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"
LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count"
LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"
LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"
LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"
LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"
LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"
LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"
LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end"
LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"
LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments"
LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments"
LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"
LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters"
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"
LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients"
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients"
LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction"
LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"
LLM_KV_TRAINING_FILE_VERSION = "training.file_version"
LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"
LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"
LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"
class Tensor:
def __init__(self, dtype='f', ne=None):
if ne is None:
ne = []
self.dtype = dtype
self.ne = ne
self.nbytes = 0
if self.dtype == 'f':
if len(self.ne) == 0:
self.nbytes = 0
else:
self.nbytes = int(np.product(self.ne)) * 4
else:
raise ValueError(f"Unhandled data type '{self.dtype}'")
def load(self, data, offset):
nd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
namelen = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
dtype = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
assert(nd == len(self.ne))
ne = []
for d in range(nd):
n = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
ne.append(n)
assert(tuple(ne) == tuple(self.ne))
if self.dtype == 'f':
assert(dtype == 0)
else:
raise ValueError(f"Unhandled data type '{self.dtype}'")
self.name = bytes(data[offset:offset+namelen]); offset += namelen
# 32-byte alignment
offset += (0 - offset) & 31
self.data = data[offset:offset+self.nbytes]
offset += self.nbytes
return offset
def max_storage_size(self):
result = 0
result += 4 # nd
result += 4 # namelen
result += 4 # dtype
result += len(self.ne)*8 # ne
result += 48 # name (maximum as of commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9)
result += 31 # 32-byte alignment
result += self.nbytes
return result
def save_gguf(self, gguf_writer, name):
gguf_writer.add_tensor(
name=name,
tensor=self.data,
raw_shape=np.array(list(reversed(self.ne))),
raw_dtype=gguf.GGMLQuantizationType.F32)
class OptimizationParamsV0:
def __init__(self):
pass
def load(self, data, offset):
self.type = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_threads = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.past = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.delta = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.print_forward_graph = struct.unpack('<?', bytes(data[offset:offset + 1]))[0]; offset += 4 # 32bit-aligned
self.print_backward_graph = struct.unpack('<?', bytes(data[offset:offset + 1]))[0]; offset += 4 # 32bit-aligned
self.adam_n_iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_sched = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_decay = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_alpha = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_beta1 = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_beta2 = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_eps = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_eps_f = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_eps_g = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_n_iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_max_linesearch = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_eps = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_ftol = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_wolfe = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_min_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_max_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_linesearch = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
return offset
class OptimizationContext:
def __init__(self):
pass
def load(self, data, offset):
self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]
offset += 4
if self.version == 0:
params = OptimizationParamsV0()
offset = params.load(data, offset)
self.past = params.past
self.lbfgs_m = params.lbfgs_m
self.nx = struct.unpack('N', bytes(data[offset:offset + 8]))[0]; offset += 8
self.iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]); offset += 4
self.type = params.type
self.adam_m = Tensor('f', [self.nx])
self.adam_v = Tensor('f', [self.nx])
self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])
self.lbfgs_x = Tensor('f', [self.nx])
self.lbfgs_xp = Tensor('f', [self.nx])
self.lbfgs_g = Tensor('f', [self.nx])
self.lbfgs_gp = Tensor('f', [self.nx])
self.lbfgs_d = Tensor('f', [self.nx])
self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else [])
self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m])
self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m])
if self.type == 0:
# these tensors are stored, but we don't need their data
x = Tensor('f', [self.nx])
g = Tensor('f', [self.nx])
g2 = Tensor('f', [self.nx])
mh = Tensor('f', [self.nx])
vh = Tensor('f', [self.nx])
offset = x.load(data, offset)
offset = g.load(data, offset)
offset = g2.load(data, offset)
offset = self.adam_m.load(data, offset)
offset = self.adam_v.load(data, offset)
offset = mh.load(data, offset)
offset = vh.load(data, offset)
offset = self.adam_pf.load(data, offset)
self.adam_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_fx_prev = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
elif self.type == 1:
offset = self.lbfgs_x.load(data, offset)
offset = self.lbfgs_xp.load(data, offset)
offset = self.lbfgs_g.load(data, offset)
offset = self.lbfgs_gp.load(data, offset)
offset = self.lbfgs_d.load(data, offset)
offset = self.lbfgs_pf.load(data, offset)
offset = self.lbfgs_lmal.load(data, offset)
offset = self.lbfgs_lmys.load(data, offset)
offset = self.lbfgs_lms.load(data, offset)
offset = self.lbfgs_lmy.load(data, offset)
self.lbfgs_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_j = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_k = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_end = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
else:
raise ValueError('Unknown optimizer type')
elif self.version == 1:
self.past = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.nx = struct.unpack('N', bytes(data[offset:offset + 8]))[0]; offset += 8
self.iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]); offset += 4
self.adam_m = Tensor('f', [self.nx])
self.adam_v = Tensor('f', [self.nx])
self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])
self.lbfgs_x = Tensor('f', [self.nx])
self.lbfgs_xp = Tensor('f', [self.nx])
self.lbfgs_g = Tensor('f', [self.nx])
self.lbfgs_gp = Tensor('f', [self.nx])
self.lbfgs_d = Tensor('f', [self.nx])
self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else [])
self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m])
self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m])
# forgot to save type in version 1:
# guess self.type from number of remaining bytes
size_type_0 = 12 + sum([t.max_storage_size() for t in
[self.adam_m, self.adam_v]
+([self.adam_pf] if (self.past > 0) else [])])
size_type_1 = 24 + sum([t.max_storage_size() for t in
[self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g,
self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf,
self.lbfgs_lmal, self.lbfgs_lmys,
self.lbfgs_lms, self.lbfgs_lmy]
+([self.lbfgs_pf] if (self.past > 0) else [])])
# due to alignment padding the size might not be exact,
# but the difference in size between the two types is significant,
# so we can just use whichever is closest
remaining = len(data) - offset
if abs(remaining - size_type_0) < abs(remaining - size_type_1):
self.type = 0
else:
self.type = 1
if self.type == 0:
offset = self.adam_m.load(data, offset)
offset = self.adam_v.load(data, offset)
offset = self.adam_pf.load(data,offset)
self.adam_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_fx_prev = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
elif self.type == 1:
offset = self.lbfgs_x.load(data, offset)
offset = self.lbfgs_xp.load(data, offset)
offset = self.lbfgs_g.load(data, offset)
offset = self.lbfgs_gp.load(data, offset)
offset = self.lbfgs_d.load(data, offset)
offset = self.lbfgs_pf.load(data, offset)
offset = self.lbfgs_lmal.load(data, offset)
offset = self.lbfgs_lmys.load(data, offset)
offset = self.lbfgs_lms.load(data, offset)
offset = self.lbfgs_lmy.load(data, offset)
self.lbfgs_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_j = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_k = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_end = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
else:
raise ValueError('Invalid version of checkpoint file')
return offset
def save_gguf(self, gguf_writer):
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_FILE_VERSION, 0)
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, self.past)
gguf_writer.add_uint64(LLM_KV_OPTIMIZER_PARAMETER_COUNT, self.nx)
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ITERATION_COUNT, self.iter)
gguf_writer.add_bool(LLM_KV_OPTIMIZER_JUST_INITIALIZED, self.just_initialized)
if self.type == 0:
gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM)
gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, self.adam_fx_best)
gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, self.adam_fx_prev)
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, self.adam_n_no_improvement)
self.adam_m.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS)
self.adam_v.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS)
if self.past > 0:
self.adam_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES)
elif self.type == 1:
gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS)
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, self.lbfgs_m)
gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, self.lbfgs_fx_best)
gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, self.lbfgs_step)
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, self.lbfgs_j)
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, self.lbfgs_k)
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, self.lbfgs_end)
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, self.lbfgs_n_no_improvement)
self.lbfgs_x.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS)
self.lbfgs_xp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS)
self.lbfgs_g.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS)
self.lbfgs_gp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS)
self.lbfgs_d.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION)
if self.past > 0:
self.lbfgs_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES)
self.lbfgs_lmal.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA)
self.lbfgs_lmys.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS)
self.lbfgs_lms.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S)
self.lbfgs_lmy.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y)
else:
raise ValueError('Unknown optimizer type')
class ModelParams:
def __init__(self):
pass
def load(self, data, offset):
self.n_vocab = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_embd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_mult = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_head = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_layer = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rot = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
return offset
def get_n_ff(self):
# struct my_llama_model::get_n_ff in train-text-from-scratch.cpp commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9
return ((2*(4*self.n_embd)//3 + self.n_mult - 1)//self.n_mult)*self.n_mult
def save_gguf(self, gguf_writer):
# self.n_vocab not saved
gguf_writer.add_embedding_length(self.n_embd)
gguf_writer.add_head_count(self.n_head)
gguf_writer.add_block_count(self.n_layer)
gguf_writer.add_rope_dimension_count(self.n_rot)
gguf_writer.add_feed_forward_length(self.get_n_ff())
def tensor_name(key, bid=None):
return gguf.MODEL_TENSOR_NAMES[gguf.MODEL_ARCH.LLAMA][key].format(bid=bid) + ".weight"
class Layer:
def __init__(self, params, bid):
self.bid = bid
self.att_norm = Tensor('f', [params.n_embd])
self.wq = Tensor('f', [params.n_embd, params.n_embd])
self.wk = Tensor('f', [params.n_embd, params.n_embd])
self.wv = Tensor('f', [params.n_embd, params.n_embd])
self.wo = Tensor('f', [params.n_embd, params.n_embd])
self.ffn_norm = Tensor('f', [params.n_embd])
self.w1 = Tensor('f', [params.n_embd, params.get_n_ff()])
self.w2 = Tensor('f', [params.get_n_ff(), params.n_embd])
self.w3 = Tensor('f', [params.n_embd, params.get_n_ff()])
def load(self, data, offset):
offset = self.att_norm.load(data, offset)
offset = self.wq.load(data, offset)
offset = self.wk.load(data, offset)
offset = self.wv.load(data, offset)
offset = self.wo.load(data, offset)
offset = self.ffn_norm.load(data, offset)
offset = self.w1.load(data, offset)
offset = self.w2.load(data, offset)
offset = self.w3.load(data, offset)
return offset
def save_gguf(self, gguf_writer):
self.att_norm.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid))
self.wq.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q, self.bid))
self.wk.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K, self.bid))
self.wv.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V, self.bid))
self.wo.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, self.bid))
self.ffn_norm.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM, self.bid))
self.w1.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE, self.bid))
self.w2.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, self.bid))
self.w3.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP, self.bid))
class Model:
def __init__(self):
self.params = ModelParams()
self.layers = []
def load(self, data, offset):
offset = self.params.load(data, offset)
self.tok_embd = Tensor('f', [self.params.n_embd, self.params.n_vocab])
self.norm = Tensor('f', [self.params.n_embd])
self.output = Tensor('f', [self.params.n_embd, self.params.n_vocab])
offset = self.tok_embd.load(data, offset)
offset = self.norm.load(data, offset)
offset = self.output.load(data, offset)
self.layers.clear()
for bid in range(self.params.n_layer):
layer = Layer(self.params, bid)
offset = layer.load(data, offset)
self.layers.append(layer)
return offset
def save_gguf(self, gguf_writer):
self.params.save_gguf(gguf_writer)
self.tok_embd.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD))
self.norm.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM))
self.output.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT))
for layer in self.layers:
layer.save_gguf(gguf_writer)
class Checkpoint:
def __init__(self):
self.model = Model()
self.opt_ctx = OptimizationContext()
def load(self, data, offset):
magic = bytes(reversed(data[offset:offset + 4])); offset += 4
if magic != b'ggcp':
raise ValueError(f"File header magic indicates that this is not a checkpoint file. Expected 'ggcp', got '{str(magic)}'")
self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
if self.version != 0:
raise ValueError('Invalid version of checkpoint file')
self.train_its = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.train_samples = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.train_tokens = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
offset = self.model.load(data, offset)
offset = self.opt_ctx.load(data, offset)
return offset
def save_gguf(self, gguf_writer):
gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32)
gguf_writer.add_layer_norm_rms_eps(1e-5)
gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION, 0)
gguf_writer.add_uint32(LLM_KV_TRAINING_ITERATION_COUNT, self.train_its)
gguf_writer.add_uint32(LLM_KV_TRAINING_SAMPLE_COUNT, self.train_samples)
gguf_writer.add_uint32(LLM_KV_TRAINING_TOKEN_COUNT, self.train_tokens)
self.model.save_gguf(gguf_writer)
self.opt_ctx.save_gguf(gguf_writer)
def handle_args():
parser = argparse.ArgumentParser(description = 'Convert train-text-from-scratch checkpoints to GGUF')
parser.add_argument('--input', '-i', type = Path, help = 'Input train checkpoint filename', required=True)
parser.add_argument('--output', '-o', type = Path, help ='Output GGUF filename', required=True)
return parser.parse_args()
def main():
cfg = handle_args()
data = np.memmap(cfg.input, mode = 'r')
chk = Checkpoint()
offset = 0
offset = chk.load(data, offset)
# we should have read all available data
assert(offset == len(data))
gguf_writer = gguf.GGUFWriter(cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
chk.save_gguf(gguf_writer)
print(" gguf: write header")
gguf_writer.write_header_to_file()
print(" gguf: write metadata")
gguf_writer.write_kv_data_to_file()
print(" gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
if __name__ == '__main__':
main()
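
Two details of the format handled by this script are easy to misread, so the short sketch below reproduces them with concrete numbers: the feed-forward width formula from ModelParams.get_n_ff() and the 32-byte alignment step used in Tensor.load(). The numbers are only illustrative.

```python
# Worked examples for two details of the checkpoint format handled above.

def n_ff(n_embd, n_mult):
    # same formula as ModelParams.get_n_ff(): take 2/3 of 4*n_embd and round
    # up to the next multiple of n_mult
    return ((2 * (4 * n_embd) // 3 + n_mult - 1) // n_mult) * n_mult

assert n_ff(n_embd=256, n_mult=256) == 768      # 682.67 rounded up to 768
assert n_ff(n_embd=4096, n_mult=256) == 11008   # the LLaMA-7B value

def align32(offset):
    # same trick as Tensor.load(): advance to the next 32-byte boundary
    return offset + ((0 - offset) & 31)

assert align32(0) == 0
assert align32(1) == 32
assert align32(33) == 64
```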

File diff suppressed because it is too large.

flake.lock (generated, 12 changed lines)

@ -5,11 +5,11 @@
"systems": "systems" "systems": "systems"
}, },
"locked": { "locked": {
"lastModified": 1685518550, "lastModified": 1692799911,
"narHash": "sha256-o2d0KcvaXzTrPRIo0kOLV0/QXHhDQ5DTi+OxcjO8xqY=", "narHash": "sha256-3eihraek4qL744EvQXsK1Ha6C3CR7nnT8X2qWap4RNk=",
"owner": "numtide", "owner": "numtide",
"repo": "flake-utils", "repo": "flake-utils",
"rev": "a1720a10a6cfe8234c0e93907ffe81be440f4cef", "rev": "f9e7cf818399d17d347f847525c5a5a8032e4e44",
"type": "github" "type": "github"
}, },
"original": { "original": {
@ -20,11 +20,11 @@
}, },
"nixpkgs": { "nixpkgs": {
"locked": { "locked": {
"lastModified": 1685931219, "lastModified": 1692913444,
"narHash": "sha256-8EWeOZ6LKQfgAjB/USffUSELPRjw88A+xTcXnOUvO5M=", "narHash": "sha256-1SvMQm2DwofNxXVtNWWtIcTh7GctEVrS/Xel/mdc6iY=",
"owner": "NixOS", "owner": "NixOS",
"repo": "nixpkgs", "repo": "nixpkgs",
"rev": "7409480d5c8584a1a83c422530419efe4afb0d19", "rev": "18324978d632ffc55ef1d928e81630c620f4f447",
"type": "github" "type": "github"
}, },
"original": { "original": {


@ -6,6 +6,9 @@
outputs = { self, nixpkgs, flake-utils }: outputs = { self, nixpkgs, flake-utils }:
flake-utils.lib.eachDefaultSystem (system: flake-utils.lib.eachDefaultSystem (system:
let let
name = "llama.cpp";
src = ./.;
meta.mainProgram = "llama";
inherit (pkgs.stdenv) isAarch32 isAarch64 isDarwin; inherit (pkgs.stdenv) isAarch32 isAarch64 isDarwin;
buildInputs = with pkgs; [ openmpi ]; buildInputs = with pkgs; [ openmpi ];
osSpecific = with pkgs; buildInputs ++ osSpecific = with pkgs; buildInputs ++
@ -21,11 +24,17 @@
CoreGraphics CoreGraphics
CoreVideo CoreVideo
] ]
else if isDarwin then
with pkgs.darwin.apple_sdk.frameworks; [
Accelerate
CoreGraphics
CoreVideo
]
else else
with pkgs; [ openblas ] with pkgs; [ openblas ]
); );
pkgs = import nixpkgs { inherit system; }; pkgs = import nixpkgs { inherit system; };
nativeBuildInputs = with pkgs; [ cmake pkgconfig ]; nativeBuildInputs = with pkgs; [ cmake ninja pkgconfig ];
llama-python = llama-python =
pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]); pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]);
postPatch = '' postPatch = ''
@ -38,35 +47,35 @@
mv $out/bin/server $out/bin/llama-server mv $out/bin/server $out/bin/llama-server
''; '';
cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ]; cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ];
in { in
{
packages.default = pkgs.stdenv.mkDerivation { packages.default = pkgs.stdenv.mkDerivation {
name = "llama.cpp"; inherit name src meta postPatch nativeBuildInputs buildInputs postInstall;
src = ./.;
postPatch = postPatch;
nativeBuildInputs = nativeBuildInputs;
buildInputs = osSpecific;
cmakeFlags = cmakeFlags cmakeFlags = cmakeFlags
++ (if isAarch64 && isDarwin then [ ++ (if isAarch64 && isDarwin then [
"-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1" "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
"-DLLAMA_METAL=ON" "-DLLAMA_METAL=ON"
] else [ ] else [
"-DLLAMA_BLAS=ON" "-DLLAMA_BLAS=ON"
"-DLLAMA_BLAS_VENDOR=OpenBLAS" "-DLLAMA_BLAS_VENDOR=OpenBLAS"
]); ]);
postInstall = postInstall;
meta.mainProgram = "llama";
}; };
packages.opencl = pkgs.stdenv.mkDerivation { packages.opencl = pkgs.stdenv.mkDerivation {
name = "llama.cpp"; inherit name src meta postPatch nativeBuildInputs postInstall;
src = ./.;
postPatch = postPatch;
nativeBuildInputs = nativeBuildInputs;
buildInputs = with pkgs; buildInputs ++ [ clblast ]; buildInputs = with pkgs; buildInputs ++ [ clblast ];
cmakeFlags = cmakeFlags ++ [ cmakeFlags = cmakeFlags ++ [
"-DLLAMA_CLBLAST=ON" "-DLLAMA_CLBLAST=ON"
]; ];
postInstall = postInstall; };
meta.mainProgram = "llama"; packages.rocm = pkgs.stdenv.mkDerivation {
inherit name src meta postPatch nativeBuildInputs postInstall;
buildInputs = with pkgs; buildInputs ++ [ hip hipblas rocblas ];
cmakeFlags = cmakeFlags ++ [
"-DLLAMA_HIPBLAS=1"
"-DCMAKE_C_COMPILER=hipcc"
"-DCMAKE_CXX_COMPILER=hipcc"
"-DCMAKE_POSITION_INDEPENDENT_CODE=ON"
];
}; };
apps.llama-server = { apps.llama-server = {
type = "app"; type = "app";
@ -80,8 +89,13 @@
type = "app"; type = "app";
program = "${self.packages.${system}.default}/bin/llama"; program = "${self.packages.${system}.default}/bin/llama";
}; };
apps.quantize = {
type = "app";
program = "${self.packages.${system}.default}/bin/quantize";
};
apps.default = self.apps.${system}.llama; apps.default = self.apps.${system}.llama;
devShells.default = pkgs.mkShell { devShells.default = pkgs.mkShell {
buildInputs = [ llama-python ];
packages = nativeBuildInputs ++ osSpecific; packages = nativeBuildInputs ++ osSpecific;
}; };
}); });


@ -1,3 +1,8 @@
// defines MAP_ANONYMOUS
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include "ggml-alloc.h" #include "ggml-alloc.h"
#include "ggml.h" #include "ggml.h"
#include <assert.h> #include <assert.h>
@ -6,8 +11,29 @@
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#ifdef __has_include
#if __has_include(<unistd.h>)
#include <unistd.h>
#if defined(_POSIX_MAPPED_FILES)
#include <sys/types.h>
#include <sys/mman.h>
#endif
#endif
#endif
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#include <memoryapi.h>
#endif
#define UNUSED(x) (void)(x) #define UNUSED(x) (void)(x)
#define MAX(a, b) ((a) > (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b))
#define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
//#define GGML_ALLOCATOR_DEBUG //#define GGML_ALLOCATOR_DEBUG
@ -67,8 +93,8 @@ struct ggml_allocr {
struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE]; struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
size_t max_size; size_t max_size;
bool measure; bool measure;
int parse_seq[GGML_MAX_NODES]; int parse_seq[GGML_MAX_CONCUR];
bool has_parse_seq; int parse_seq_len;
#ifdef GGML_ALLOCATOR_DEBUG #ifdef GGML_ALLOCATOR_DEBUG
struct ggml_tensor * allocated_tensors[1024]; struct ggml_tensor * allocated_tensors[1024];
@ -76,7 +102,7 @@ struct ggml_allocr {
}; };
#ifdef GGML_ALLOCATOR_DEBUG #ifdef GGML_ALLOCATOR_DEBUG
static void add_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) { static void add_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
for (int i = 0; i < 1024; i++) { for (int i = 0; i < 1024; i++) {
if (alloc->allocated_tensors[i] == NULL) { if (alloc->allocated_tensors[i] == NULL) {
alloc->allocated_tensors[i] = tensor; alloc->allocated_tensors[i] = tensor;
@ -85,7 +111,7 @@ static void add_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tens
} }
GGML_ASSERT(!"out of allocated_tensors"); GGML_ASSERT(!"out of allocated_tensors");
} }
static void remove_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) { static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
for (int i = 0; i < 1024; i++) { for (int i = 0; i < 1024; i++) {
if (alloc->allocated_tensors[i] == tensor || if (alloc->allocated_tensors[i] == tensor ||
(alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) { (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) {
@ -98,15 +124,24 @@ static void remove_allocated_tensor(struct ggml_allocator * alloc, struct ggml_t
} }
#endif #endif
static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
return ggml_nbytes(tensor); return ggml_nbytes(tensor);
UNUSED(alloc); UNUSED(alloc);
} }
// check if a tensor is allocated by this buffer
static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
void * ptr = tensor->data;
return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
}
void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
size_t size = ggml_allocator_get_alloc_size(alloc, tensor); #ifdef GGML_ALLOCATOR_DEBUG
GGML_ASSERT(ggml_is_view(tensor) == false); // views generally get data pointer from one of their sources
GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
#endif
size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
size = aligned_offset(NULL, size, alloc->alignment); size = aligned_offset(NULL, size, alloc->alignment);
AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size); AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
@ -172,17 +207,17 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
} }
// this is a very naive implementation, but for our case the number of free blocks should be very small // this is a very naive implementation, but for our case the number of free blocks should be very small
static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
void * ptr = tensor->data; void * ptr = tensor->data;
if (ptr < alloc->data || (char*)ptr >= (char*)alloc->data + alloc->max_size) { if (ggml_allocr_is_own(alloc, tensor) == false) {
// the tensor was not allocated in this buffer // the tensor was not allocated in this buffer
// this can happen because the graph allocator will try to free weights and other tensors from different buffers // this can happen because the graph allocator will try to free weights and other tensors from different buffers
// the easiest way to deal with this is just to ignore it // the easiest way to deal with this is just to ignore it
return; return;
} }
size_t size = ggml_allocator_get_alloc_size(alloc, tensor); size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
size = aligned_offset(NULL, size, alloc->alignment); size = aligned_offset(NULL, size, alloc->alignment);
AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks); AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks);
@ -238,15 +273,11 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_t
alloc->n_free_blocks++; alloc->n_free_blocks++;
} }
void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n) { void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n) {
int pos = 0;
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
if (list[i] != -1) { alloc->parse_seq[i] = list[i];
alloc->parse_seq[pos] = list[i];
pos++;
}
} }
alloc->has_parse_seq = true; alloc->parse_seq_len = n;
} }
void ggml_allocr_reset(struct ggml_allocr * alloc) { void ggml_allocr_reset(struct ggml_allocr * alloc) {
@ -269,9 +300,9 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
/*.max_size = */ 0, /*.max_size = */ 0,
/*.measure = */ false, /*.measure = */ false,
/*.parse_seq = */ {0}, /*.parse_seq = */ {0},
/*.has_parse_seq = */ false, /*.parse_seq_len = */ 0,
#ifdef GGML_ALLOCATOR_DEBUG #ifdef GGML_ALLOCATOR_DEBUG
/*.allocated_tensors = */ = {0}, /*.allocated_tensors = */ {0},
#endif #endif
}; };
@ -280,17 +311,64 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
return alloc; return alloc;
} }
// address and size of the buffer when measuring // OS specific functions to allocate and free uncommitted virtual memory
// it needs to be large enough to fit all the tensors, but it cannot overlap with other existing buffers static void * alloc_vmem(size_t size) {
static void * const MEASURE_BASE_ADDR = (void *) 0x1000; #if defined(_WIN32)
static const size_t MEASURE_MAX_SIZE = 1ULL<<40; // 1 TB return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
#elif defined(_POSIX_MAPPED_FILES)
return mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
#else
// use a fixed address for other platforms
uintptr_t base_addr = (uintptr_t)-size - 0x100;
return (void *)base_addr;
#endif
}
static void free_vmem(void * base_addr, size_t size) {
#if defined(_WIN32)
VirtualFree(base_addr, 0, MEM_RELEASE);
UNUSED(size);
#elif defined(_POSIX_MAPPED_FILES)
munmap(base_addr, size);
#else
// nothing to do
UNUSED(base_addr);
UNUSED(size);
#endif
}
// allocate uncommitted virtual memory to measure the size of the graph
static void alloc_measure_vmem(void ** base_addr, size_t * size) {
// 1TB for 64-bit, 1GB for 32-bit
*size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<40;
do {
*base_addr = alloc_vmem(*size);
if (*base_addr != NULL) {
AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
return;
}
// try again with half the size
*size /= 2;
} while (*size > 0);
GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
}
static void free_measure_vmem(void * base_addr, size_t size) {
free_vmem(base_addr, size);
}
struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) { struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */); struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
void * base_addr;
size_t size;
alloc_measure_vmem(&base_addr, &size);
*alloc = (struct ggml_allocr){ *alloc = (struct ggml_allocr){
/*.data = */ MEASURE_BASE_ADDR, /*.data = */ base_addr,
/*.size = */ MEASURE_MAX_SIZE, /*.size = */ size,
/*.alignment = */ alignment, /*.alignment = */ alignment,
/*.n_free_blocks = */ 0, /*.n_free_blocks = */ 0,
/*.free_blocks = */ {{0}}, /*.free_blocks = */ {{0}},
@ -298,9 +376,9 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
/*.max_size = */ 0, /*.max_size = */ 0,
/*.measure = */ true, /*.measure = */ true,
/*.parse_seq = */ {0}, /*.parse_seq = */ {0},
/*.has_parse_seq = */ false, /*.parse_seq_len = */ 0,
#ifdef GGML_ALLOCATOR_DEBUG #ifdef GGML_ALLOCATOR_DEBUG
/*.allocated_tensors = */ = {0}, /*.allocated_tensors = */ {0},
#endif #endif
}; };
@ -310,6 +388,9 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
} }
void ggml_allocr_free(struct ggml_allocr * alloc) { void ggml_allocr_free(struct ggml_allocr * alloc) {
if (alloc->measure) {
free_measure_vmem(alloc->data, alloc->size);
}
free(alloc); free(alloc);
} }
@ -320,8 +401,7 @@ bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
//////////// compute graph allocator //////////// compute graph allocator
static bool ggml_is_view(struct ggml_tensor * t) { static bool ggml_is_view(struct ggml_tensor * t) {
return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE || return t->view_src != NULL;
t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY;
} }
static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) { static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
@ -339,28 +419,6 @@ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml
return true; return true;
} }
static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) {
switch (t->op) {
case GGML_OP_PERMUTE:
case GGML_OP_RESHAPE:
case GGML_OP_TRANSPOSE:
case GGML_OP_VIEW:
return t->src[0];
case GGML_OP_CPY:
return t->src[1];
default:
return NULL;
}
}
static struct ggml_tensor * get_view_source(struct ggml_tensor * t) {
struct ggml_tensor * parent = t;
do {
parent = get_view_parent(parent);
} while (ggml_is_view(parent));
return parent;
}
static bool ggml_op_can_inplace(enum ggml_op op) { static bool ggml_op_can_inplace(enum ggml_op op) {
switch (op) { switch (op) {
case GGML_OP_SCALE: case GGML_OP_SCALE:
@ -368,7 +426,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
case GGML_OP_DIAG_MASK_INF: case GGML_OP_DIAG_MASK_INF:
case GGML_OP_ADD: case GGML_OP_ADD:
case GGML_OP_ADD1: case GGML_OP_ADD1:
case GGML_OP_ACC:
case GGML_OP_SUB: case GGML_OP_SUB:
case GGML_OP_MUL: case GGML_OP_MUL:
case GGML_OP_DIV: case GGML_OP_DIV:
@ -378,7 +435,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
case GGML_OP_UNARY: case GGML_OP_UNARY:
case GGML_OP_ROPE: case GGML_OP_ROPE:
case GGML_OP_RMS_NORM: case GGML_OP_RMS_NORM:
case GGML_OP_SET:
case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX:
case GGML_OP_CONT: case GGML_OP_CONT:
return true; return true;
@ -392,24 +448,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
struct hash_node * ht = alloc->hash_table; struct hash_node * ht = alloc->hash_table;
if (node->data == NULL) { if (node->data == NULL) {
if (ggml_is_view(node)) { if (ggml_is_view(node)) {
size_t offset; assert(node->view_src->data != NULL);
switch(node->op) { node->data = (char *)node->view_src->data + node->view_offs;
case GGML_OP_VIEW:
memcpy(&offset, node->op_params, sizeof(size_t));
node->data = (char *) node->src[0]->data + offset;
break;
case GGML_OP_PERMUTE:
case GGML_OP_RESHAPE:
case GGML_OP_TRANSPOSE:
node->data = node->src[0]->data;
break;
case GGML_OP_CPY:
node->data = node->src[1]->data;
break;
default:
GGML_ASSERT(!"unknown view op");
break;
}
} else { } else {
// see if we can reuse a parent's buffer (inplace) // see if we can reuse a parent's buffer (inplace)
if (ggml_op_can_inplace(node->op)) { if (ggml_op_can_inplace(node->op)) {
@ -420,8 +460,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
} }
// if the node's data is external, then we cannot re-use it // if the node's data is external, then we cannot re-use it
if ((char *) parent->data < (char *) alloc->data || if (ggml_allocr_is_own(alloc, parent) == false) {
(char *) parent->data >= ((char *) alloc->data + alloc->size)) {
AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data); AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
continue; continue;
} }
@ -429,7 +468,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
struct hash_node * p_hn = hash_get(ht, parent); struct hash_node * p_hn = hash_get(ht, parent);
if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) { if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
if (ggml_is_view(parent)) { if (ggml_is_view(parent)) {
struct ggml_tensor * view_src = get_view_source(parent); struct ggml_tensor * view_src = parent->view_src;
struct hash_node * view_src_hn = hash_get(ht, view_src); struct hash_node * view_src_hn = hash_get(ht, view_src);
if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) { if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
// TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
@ -445,8 +484,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
else { else {
AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name); AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
node->data = parent->data; node->data = parent->data;
return;
} }
return;
} }
} }
} }
@ -455,7 +494,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
} }
} }
static size_t ggml_allocator_alloc_graph_tensors_n( static size_t ggml_allocr_alloc_graph_tensors_n(
struct ggml_allocr * alloc, struct ggml_allocr * alloc,
struct ggml_cgraph ** graphs, int n_graphs, struct ggml_cgraph ** graphs, int n_graphs,
struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) { struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
@ -471,7 +510,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
struct ggml_tensor * node = gf->nodes[i]; struct ggml_tensor * node = gf->nodes[i];
if (ggml_is_view(node)) { if (ggml_is_view(node)) {
struct ggml_tensor * view_src = get_view_source(node); struct ggml_tensor * view_src = node->view_src;
hash_get(ht, view_src)->n_views += 1; hash_get(ht, view_src)->n_views += 1;
} }
@ -497,76 +536,92 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
allocate_node(alloc, input); allocate_node(alloc, input);
} }
} }
for (int ind = 0; ind < gf->n_nodes; ind++) { // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers
int i; int last_barrier_pos = 0;
if (alloc->has_parse_seq) { int n_nodes = alloc->parse_seq_len ? alloc->parse_seq_len : gf->n_nodes;
i = alloc->parse_seq[ind];
} else {
i = ind;
}
struct ggml_tensor * node = gf->nodes[i];
// allocate parents (leafs) for (int ind = 0; ind < n_nodes; ind++) {
for (int j = 0; j < GGML_MAX_SRC; j++) { // allocate a node if there is no parse_seq or this is not a barrier
struct ggml_tensor * parent = node->src[j]; if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] != -1) {
if (parent == NULL) { int i = alloc->parse_seq_len ? alloc->parse_seq[ind] : ind;
break; struct ggml_tensor * node = gf->nodes[i];
}
allocate_node(alloc, parent);
}
// allocate node // allocate parents (leafs)
allocate_node(alloc, node); for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * parent = node->src[j];
if (parent == NULL) {
break;
}
allocate_node(alloc, parent);
}
AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name); // allocate node
for (int j = 0; j < GGML_MAX_SRC; j++) { allocate_node(alloc, node);
struct ggml_tensor * parent = node->src[j];
if (parent == NULL) { AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
break; for (int j = 0; j < GGML_MAX_SRC; j++) {
} struct ggml_tensor * parent = node->src[j];
AT_PRINTF("%s", parent->name); if (parent == NULL) {
if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) { break;
AT_PRINTF(", "); }
AT_PRINTF("%s", parent->name);
if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
AT_PRINTF(", ");
}
} }
AT_PRINTF("\n");
} }
AT_PRINTF("\n");
// update parents // update parents
for (int j = 0; j < GGML_MAX_SRC; j++) { // update immediately if there is no parse_seq
struct ggml_tensor * parent = node->src[j]; // update only at barriers if there is parse_seq
if (parent == NULL) { if ((alloc->parse_seq_len == 0) || alloc->parse_seq[ind] == -1) {
break; int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
int update_end = alloc->parse_seq_len ? ind : ind + 1;
for (int i = update_start; i < update_end; i++) {
int node_i = alloc->parse_seq_len ? alloc->parse_seq[i] : i;
struct ggml_tensor * node = gf->nodes[node_i];
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * parent = node->src[j];
if (parent == NULL) {
break;
}
struct hash_node * p_hn = hash_get(ht, parent);
p_hn->n_children -= 1;
//AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
if (p_hn->n_children == 0 && p_hn->n_views == 0) {
if (ggml_is_view(parent)) {
struct ggml_tensor * view_src = parent->view_src;
struct hash_node * view_src_hn = hash_get(ht, view_src);
view_src_hn->n_views -= 1;
AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
ggml_allocr_free_tensor(alloc, view_src);
}
}
else {
if (parent->data != node->data) {
ggml_allocr_free_tensor(alloc, parent);
}
}
}
}
} }
struct hash_node * p_hn = hash_get(ht, parent); AT_PRINTF("\n");
p_hn->n_children -= 1; if (alloc->parse_seq_len) {
last_barrier_pos = ind + 1;
//AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
if (p_hn->n_children == 0 && p_hn->n_views == 0) {
if (ggml_is_view(parent)) {
struct ggml_tensor * view_src = get_view_source(parent);
struct hash_node * view_src_hn = hash_get(ht, view_src);
view_src_hn->n_views -= 1;
AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src->n_children, view_src->n_views);
if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
ggml_allocator_free_tensor(alloc, view_src);
}
}
else {
if (parent->data != node->data) {
ggml_allocator_free_tensor(alloc, parent);
}
}
} }
} }
AT_PRINTF("\n");
} }
// free graph outputs here that wouldn't be freed otherwise because they have no children // free graph outputs here that wouldn't be freed otherwise because they have no children
if (outputs != NULL && outputs[g] != NULL) { if (outputs != NULL && outputs[g] != NULL) {
for (int i = 0; outputs[g][i] != NULL; i++) { for (int i = 0; outputs[g][i] != NULL; i++) {
struct ggml_tensor * output = outputs[g][i]; struct ggml_tensor * output = outputs[g][i];
AT_PRINTF("output: %s\n", output->name); AT_PRINTF("output: %s\n", output->name);
ggml_allocator_free_tensor(alloc, output); ggml_allocr_free_tensor(alloc, output);
} }
} }
} }
@ -575,5 +630,5 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
} }
size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) { size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
return ggml_allocator_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL); return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
} }
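
Among the ggml-alloc.c changes above, parse_seq now stores barrier entries (-1) directly instead of filtering them out: nodes are allocated in the listed order, but parents are only released once a barrier is reached. A loose sketch of that ordering rule, assuming the semantics described in the diff; the node names are made up and the real allocation/free bookkeeping is omitted.

```python
# Toy model of the parse_seq barrier behaviour added to ggml-alloc.c above.
# It only mirrors the ordering rule; real allocation/free logic is omitted.

def walk_parse_seq(nodes, parse_seq):
    """Visit nodes in parse_seq order; release finished nodes only at barriers (-1)."""
    order = parse_seq if parse_seq else list(range(len(nodes)))
    visited, released, pending = [], [], []
    for entry in order:
        if entry == -1:              # barrier: now it is safe to release parents
            released.extend(pending)
            pending.clear()
            continue
        name = nodes[entry]
        visited.append(name)         # "allocate" the node in listed order
        pending.append(name)         # defer its release until the next barrier
    released.extend(pending)         # anything left is released at the end
    return visited, released

nodes = ["mul0", "add0", "mul1", "add1"]
visited, released = walk_parse_seq(nodes, [2, 3, -1, 0, 1, -1])
print(visited)    # ['mul1', 'add1', 'mul0', 'add0']
print(released)   # ['mul1', 'add1', 'mul0', 'add0']
```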


@ -12,7 +12,7 @@ GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
// tell the allocator to parse nodes following the order described in the list // tell the allocator to parse nodes following the order described in the list
// you should call this if your graph are optimized to execute out-of-order // you should call this if your graph are optimized to execute out-of-order
GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n); GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);
GGML_API void ggml_allocr_free(struct ggml_allocr * alloc); GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc); GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);


@ -6,15 +6,133 @@
#include <atomic> #include <atomic>
#include <assert.h> #include <assert.h>
#if defined(GGML_USE_HIPBLAS)
#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>
#include <hip/hip_fp16.h>
#ifdef __HIP_PLATFORM_AMD__
// for rocblas_initialize()
#include "rocblas/rocblas.h"
#endif
#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
#define CUBLAS_OP_N HIPBLAS_OP_N
#define CUBLAS_OP_T HIPBLAS_OP_T
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
#define CUBLAS_TF32_TENSOR_OP_MATH 0
#define CUDA_R_16F HIPBLAS_R_16F
#define CUDA_R_32F HIPBLAS_R_32F
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
#define cublasCreate hipblasCreate
#define cublasGemmEx hipblasGemmEx
#define cublasHandle_t hipblasHandle_t
#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
#define cublasSetStream hipblasSetStream
#define cublasSgemm hipblasSgemm
#define cublasStatus_t hipblasStatus_t
#define cudaDeviceProp hipDeviceProp_t
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaError_t hipError_t
#define cudaEventCreateWithFlags hipEventCreateWithFlags
#define cudaEventDisableTiming hipEventDisableTiming
#define cudaEventRecord hipEventRecord
#define cudaEvent_t hipEvent_t
#define cudaEventDestroy hipEventDestroy
#define cudaFree hipFree
#define cudaFreeHost hipHostFree
#define cudaGetDevice hipGetDevice
#define cudaGetDeviceCount hipGetDeviceCount
#define cudaGetDeviceProperties hipGetDeviceProperties
#define cudaGetErrorString hipGetErrorString
#define cudaGetLastError hipGetLastError
#define cudaMalloc hipMalloc
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
#define cudaMemcpy hipMemcpy
#define cudaMemcpy2DAsync hipMemcpy2DAsync
#define cudaMemcpyAsync hipMemcpyAsync
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaMemcpyKind hipMemcpyKind
#define cudaMemset hipMemset
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
#define cudaSetDevice hipSetDevice
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
#define cudaStreamNonBlocking hipStreamNonBlocking
#define cudaStreamSynchronize hipStreamSynchronize
#define cudaStreamWaitEvent(stream, event) hipStreamWaitEvent(stream, event, 0)
#define cudaStream_t hipStream_t
#define cudaSuccess hipSuccess
#else
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <cublas_v2.h> #include <cublas_v2.h>
#include <cuda_fp16.h> #include <cuda_fp16.h>
#endif
#include "ggml-cuda.h" #include "ggml-cuda.h"
#include "ggml.h" #include "ggml.h"
#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
#ifndef CC_TURING
#define CC_TURING 700 #define CC_TURING 700
#endif
#if defined(GGML_USE_HIPBLAS)
#define __CUDA_ARCH__ 1300
#ifndef __has_builtin
#define __has_builtin(x) 0
#endif
typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
#if __has_builtin(__builtin_elementwise_sub_sat)
const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
return reinterpret_cast<const int&>(c);
#else
int8x4_t c;
int16_t tmp;
#pragma unroll
for (int i = 0; i < 4; i++) {
tmp = va[i] - vb[i];
if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
c[i] = tmp;
}
return reinterpret_cast<int&>(c);
#endif // __has_builtin(__builtin_elementwise_sub_sat)
}
static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
c = __builtin_amdgcn_sdot4(a, b, c, false);
#elif defined(__gfx1100__)
c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
#elif defined(__gfx1010__) || defined(__gfx900__)
int tmp1;
int tmp2;
asm("\n \
v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
v_add3_u32 %0, %1, %2, %0 \n \
v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
v_add3_u32 %0, %1, %2, %0 \n \
"
: "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
: "v"(a), "v"(b)
);
#else
const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
#endif
return c;
}
#endif
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
@@ -205,11 +323,11 @@ typedef struct {
#define QI4_K (QK_K / (4*QR4_K))
#ifdef GGML_QKK_64
typedef struct {
- half d[2];              // super-block scales/mins
+ half dm[2];             // super-block scales/mins
uint8_t scales[2];      // 4-bit block scales/mins
uint8_t qs[QK_K/2];     // 4--bit quants
} block_q4_K;
- static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
+ static_assert(sizeof(block_q4_K) == sizeof(half2) + QK_K/2 + 2, "wrong q4_K block size/padding");
#else
typedef struct {
half2 dm;               // super-block scale for quantized scales/mins
@@ -259,6 +377,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
#define CUDA_CPY_BLOCK_SIZE 32
#define CUDA_SCALE_BLOCK_SIZE 256
#define CUDA_ROPE_BLOCK_SIZE 256
#define CUDA_ALIBI_BLOCK_SIZE 32
#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
#define CUDA_QUANTIZE_BLOCK_SIZE 256
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
@@ -286,7 +405,7 @@ static int g_device_count = -1;
static int g_main_device = 0;
static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
- static bool g_mul_mat_q = false;
+ static bool g_mul_mat_q = true;
static void * g_scratch_buffer = nullptr;
static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
@@ -345,58 +464,91 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
dst[i] = x[i] / (1.0f + expf(-x[i]));
}
static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
}
return a;
}
template <int block_size>
static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
const int row = blockIdx.x*blockDim.y + threadIdx.y;
const int tid = threadIdx.x;
const float eps = 1e-5f;
- float mean = 0.0f;
- float var = 0.0f;
+ float2 mean_var = make_float2(0.f, 0.f);
- for (int col = tid; col < ncols; col += WARP_SIZE) {
+ for (int col = tid; col < ncols; col += block_size) {
const float xi = x[row*ncols + col];
- mean += xi;
- var += xi * xi;
+ mean_var.x += xi;
+ mean_var.y += xi * xi;
}
// sum up partial sums
- #pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
-     mean += __shfl_xor_sync(0xffffffff, mean, mask, 32);
-     var += __shfl_xor_sync(0xffffffff, var, mask, 32);
- }
+ mean_var = warp_reduce_sum(mean_var);
+ if (block_size > WARP_SIZE) {
+     __shared__ float2 s_sum[32];
+     int warp_id = threadIdx.x / WARP_SIZE;
+     int lane_id = threadIdx.x % WARP_SIZE;
+     if (lane_id == 0) {
+         s_sum[warp_id] = mean_var;
+     }
+     __syncthreads();
+     mean_var = s_sum[lane_id];
+     mean_var = warp_reduce_sum(mean_var);
+ }
- mean /= ncols;
- var = var / ncols - mean * mean;
- const float inv_var = rsqrtf(var + eps);
+ const float mean = mean_var.x / ncols;
+ const float var  = mean_var.y / ncols - mean * mean;
+ const float inv_std = rsqrtf(var + eps);
- for (int col = tid; col < ncols; col += WARP_SIZE) {
-     dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_var;
+ for (int col = tid; col < ncols; col += block_size) {
+     dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
}
}
static __device__ __forceinline__ float warp_reduce_sum(float x) {
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
x += __shfl_xor_sync(0xffffffff, x, mask, 32);
}
return x;
}
template <int block_size>
static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
const int row = blockIdx.x*blockDim.y + threadIdx.y;
const int tid = threadIdx.x;
float tmp = 0.0f; // partial sum for thread in warp
- for (int col = tid; col < ncols; col += WARP_SIZE) {
+ for (int col = tid; col < ncols; col += block_size) {
const float xi = x[row*ncols + col];
tmp += xi * xi;
}
// sum up partial sums
- #pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
-     tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
- }
+ tmp = warp_reduce_sum(tmp);
+ if (block_size > WARP_SIZE) {
+     __shared__ float s_sum[32];
+     int warp_id = threadIdx.x / WARP_SIZE;
+     int lane_id = threadIdx.x % WARP_SIZE;
+     if (lane_id == 0) {
+         s_sum[warp_id] = tmp;
+     }
+     __syncthreads();
+     tmp = s_sum[lane_id];
+     tmp = warp_reduce_sum(tmp);
+ }
const float mean = tmp / ncols;
const float scale = rsqrtf(mean + eps);
- for (int col = tid; col < ncols; col += WARP_SIZE) {
+ for (int col = tid; col < ncols; col += block_size) {
dst[row*ncols + col] = scale * x[row*ncols + col];
}
}
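The reworked kernels above split the reduction into two levels: a warp-local __shfl_xor_sync butterfly, followed (only when block_size exceeds WARP_SIZE) by a shared-memory exchange of per-warp partials. What each row ends up computing is ordinary mean/variance normalization; a hedged host reference of that end result (names illustrative, not part of the tree):

#include <cmath>

static void norm_ref(const float * x, float * dst, int nrows, int ncols, float eps = 1e-5f) {
    for (int r = 0; r < nrows; ++r) {
        double sum = 0.0, sumsq = 0.0;
        for (int c = 0; c < ncols; ++c) {
            const double xi = x[r*ncols + c];
            sum   += xi;
            sumsq += xi * xi;
        }
        const float mean    = (float)(sum / ncols);
        const float var     = (float)(sumsq / ncols) - mean*mean;
        const float inv_std = 1.0f / std::sqrt(var + eps);     // same eps as the kernel
        for (int c = 0; c < ncols; ++c) {
            dst[r*ncols + c] = (x[r*ncols + c] - mean) * inv_std;
        }
    }
}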
@@ -423,8 +575,8 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in
static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
const block_q4_1 * x = (const block_q4_1 *) vx;
- const dfloat d = x[ib].dm.x;
- const dfloat m = x[ib].dm.y;
+ const dfloat d = __low2half(x[ib].dm);
+ const dfloat m = __high2half(x[ib].dm);
const int vui = x[ib].qs[iqs];
@@ -466,8 +618,8 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in
static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
const block_q5_1 * x = (const block_q5_1 *) vx;
- const dfloat d = x[ib].dm.x;
- const dfloat m = x[ib].dm.y;
+ const dfloat d = __low2half(x[ib].dm);
+ const dfloat m = __high2half(x[ib].dm);
uint32_t qh;
memcpy(&qh, x[ib].qh, sizeof(qh));
@@ -519,8 +671,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
const uint8_t q = x[i].qs[32*n + l];
float * y = yy + i*QK_K + 128*n;
- float dall = x[i].dm.x;
- float dmin = x[i].dm.y;
+ float dall = __low2half(x[i].dm);
+ float dmin = __high2half(x[i].dm);
y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
@@ -530,8 +682,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
const int il = tid%16;  // 0...15
const uint8_t q = x[i].qs[il] >> (2*is);
float * y = yy + i*QK_K + 16*is + il;
- float dall = x[i].dm.x;
- float dmin = x[i].dm.y;
+ float dall = __low2half(x[i].dm);
+ float dmin = __high2half(x[i].dm);
y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
#endif
@@ -617,8 +769,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
float * y = yy + i*QK_K + 64*il + n*ir;
- const float dall = x[i].dm.x;
- const float dmin = x[i].dm.y;
+ const float dall = __low2half(x[i].dm);
+ const float dmin = __high2half(x[i].dm);
const uint8_t * q = x[i].qs + 32*il + n*ir;
@@ -635,8 +787,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
const int tid = threadIdx.x;
const uint8_t * q = x[i].qs;
float * y = yy + i*QK_K;
- const float d = (float)x[i].d[0];
- const float m = (float)x[i].d[1];
+ const float d = (float)x[i].dm[0];
+ const float m = (float)x[i].dm[1];
y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4);
#endif
@@ -656,8 +808,8 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float
float * y = yy + i*QK_K + 64*il + 2*ir;
- const float dall = x[i].dm.x;
- const float dmin = x[i].dm.y;
+ const float dall = __low2half(x[i].dm);
+ const float dmin = __high2half(x[i].dm);
const uint8_t * ql = x[i].qs + 32*il + 2*ir;
const uint8_t * qh = x[i].qh + 2*ir;
@@ -769,8 +921,8 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
const float * y = yy + i * QK_K + y_offset;
const uint8_t * q = x[i].qs + q_offset;
- const float dall = x[i].dm.x;
- const float dmin = x[i].dm.y;
+ const float dall = __low2half(x[i].dm);
+ const float dmin = __high2half(x[i].dm);
const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
aux[0] = a[0] & 0x0f0f0f0f;
@@ -990,8 +1142,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
const float * y1 = yy + i*QK_K + y_offset;
const float * y2 = y1 + 128;
- const float dall = x[i].dm.x;
- const float dmin = x[i].dm.y;
+ const float dall = __low2half(x[i].dm);
+ const float dmin = __high2half(x[i].dm);
const uint16_t * a = (const uint16_t *)x[i].scales;
aux[0] = a[im+0] & kmask1;
@@ -1053,8 +1205,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
const uint16_t * a = (const uint16_t *)x[i].scales;
aux16[0] = a[0] & 0x0f0f;
aux16[1] = (a[0] >> 4) & 0x0f0f;
- const float d = (float)x[i].d[0];
- const float m = (float)x[i].d[1];
+ const float d = (float)x[i].dm[0];
+ const float m = (float)x[i].dm[1];
float sum = 0.f;
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
@@ -1123,8 +1275,8 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
const float * y1 = yy + i*QK_K + y_offset;
const float * y2 = y1 + 128;
- const float dall = x[i].dm.x;
- const float dmin = x[i].dm.y;
+ const float dall = __low2half(x[i].dm);
+ const float dmin = __high2half(x[i].dm);
const uint16_t * a = (const uint16_t *)x[i].scales;
aux[0] = a[im+0] & kmask1;
@@ -1347,8 +1499,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
return;
}
- y[ib].ds.x = d;
- y[ib].ds.y = sum;
+ reinterpret_cast<half&>(y[ib].ds.x) = d;
+ reinterpret_cast<half&>(y[ib].ds.y) = sum;
}
template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
@@ -2345,7 +2497,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
}
- return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds.x);
+ return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, __low2half(bq8_1->ds));
}
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
@@ -2431,7 +2583,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
#pragma unroll
for (int i = 0; i < QR2_K; ++ i) {
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
- d8[i] = bq8_1[bq8_offset + i].ds.x;
+ d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
}
return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
@@ -2550,7 +2702,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
#pragma unroll
for (int i = 0; i < QR3_K; ++i) {
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
- d8[i] = bq8_1[bq8_offset + i].ds.x;
+ d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
}
return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
@@ -2719,7 +2871,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
for (int i = 0; i < QR4_K; ++i) {
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
- d8[i] = bq8i->ds.x;
+ d8[i] = __low2half(bq8i->ds);
const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
u[2*i+0] = q8[0];
@@ -2743,11 +2895,11 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
aux16[0] = a[0] & 0x0f0f;
aux16[1] = (a[0] >> 4) & 0x0f0f;
- const float dall = bq4_K->d[0];
- const float dmin = bq4_K->d[1];
+ const float dall = bq4_K->dm[0];
+ const float dmin = bq4_K->dm[1];
- const float d8_1 = bq8_1[0].ds.x;
- const float d8_2 = bq8_1[1].ds.x;
+ const float d8_1 = __low2float(bq8_1[0].ds);
+ const float d8_2 = __low2float(bq8_1[1].ds);
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
@@ -2827,7 +2979,11 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
#if QK_K == 256
x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
#else
x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
#endif
}
#pragma unroll
@@ -2900,7 +3056,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
#pragma unroll
for (int i = 0; i < QR5_K; ++i) {
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
- d8[i] = bq8i->ds.x;
+ d8[i] = __low2float(bq8i->ds);
const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
u[2*i+0] = q8[0];
@@ -2918,8 +3074,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
const float d = bq5_K->d;
- const float d8_1 = bq8_1[0].ds.x;
- const float d8_2 = bq8_1[1].ds.x;
+ const float d8_1 = __low2half(bq8_1[0].ds);
+ const float d8_2 = __low2half(bq8_1[1].ds);
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
@@ -3017,7 +3173,9 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
#if QK_K == 256
x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
#endif
}
#pragma unroll
@@ -3074,7 +3232,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
#pragma unroll
for (int i = 0; i < QR6_K; ++i) {
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
- d8[i] = bq8_1[bq8_offset + 2*i].ds.x;
+ d8[i] = __low2half(bq8_1[bq8_offset + 2*i].ds);
}
return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
@@ -3242,7 +3400,7 @@ static __device__ __forceinline__ void mul_mat_q(
*dsi_dst = *dsi_src;
} else {
float * dfi_dst = (float *) dsi_dst;
- *dfi_dst = (*dsi_src).x;
+ *dfi_dst = __low2half(*dsi_src);
}
}
@@ -3886,13 +4044,13 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
// rope == RoPE == rotary positional embedding
static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p0,
                                const float p_delta, const int p_delta_rows, const float theta_scale) {
- const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x);
+ const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
if (col >= ncols) {
return;
}
- const int row = blockDim.y*blockIdx.y + threadIdx.y;
+ const int row = blockDim.x*blockIdx.x + threadIdx.x;
const int i = row*ncols + col;
const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
@@ -3906,6 +4064,28 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
dst[i + 1] = x0*sin_theta + x1*cos_theta;
}
static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0,
const float p_delta, const int p_delta_rows, const float theta_scale) {
const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
if (col >= ncols) {
return;
}
const int row = blockDim.x*blockIdx.x + threadIdx.x;
const int i = row*ncols + col/2;
const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
const float sin_theta = sinf(theta);
const float cos_theta = cosf(theta);
const float x0 = x[i + 0];
const float x1 = x[i + ncols/2];
dst[i + 0] = x0*cos_theta - x1*sin_theta;
dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
}
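rope_neox_f32 differs from rope_f32 above only in which elements form a rotation pair: instead of adjacent columns (2k, 2k+1), it rotates (k, k + ncols/2) with the same angle schedule. A hedged single-row host sketch, assuming p is the precomputed per-row base angle (function name illustrative):

#include <cmath>

static void rope_neox_row_ref(const float * x, float * dst, int ncols, float p, float theta_scale) {
    float theta = p;                                    // theta_k = p * theta_scale^k
    for (int k = 0; k < ncols/2; ++k) {
        const float c = std::cos(theta), s = std::sin(theta);
        const float x0 = x[k], x1 = x[k + ncols/2];     // NeoX pairing: first and second half
        dst[k]           = x0*c - x1*s;
        dst[k + ncols/2] = x0*s + x1*c;
        theta *= theta_scale;
    }
}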
static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
const int col = blockDim.x*blockIdx.x + threadIdx.x;
const int half_n_dims = ncols/4;
@@ -3940,9 +4120,32 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
}
- static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
+ static __global__ void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
+                                  const int n_heads_log2_floor, const float m0, const float m1) {
const int col = blockDim.x*blockIdx.x + threadIdx.x;
if (col >= ncols) {
return;
}
const int row = blockDim.y*blockIdx.y + threadIdx.y;
const int i = row*ncols + col;
const int k = row/k_rows;
float m_k;
if (k < n_heads_log2_floor) {
m_k = powf(m0, k + 1);
} else {
m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
}
dst[i] = col * m_k + x[i];
}
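alibi_f32 adds a per-head linear bias col * m_k to every value; the slope m_k follows the usual ALiBi schedule, with heads beyond the largest power-of-two count falling back to odd powers of a second base. A hedged host-side restatement of the slope selection (helper name illustrative):

#include <cmath>

static float alibi_slope_ref(int k, int n_heads_log2_floor, float m0, float m1) {
    // first n_heads_log2_floor heads: m0^(k+1); remaining heads: odd powers of m1
    return k < n_heads_log2_floor ? (float) std::pow(m0, k + 1)
                                  : (float) std::pow(m1, 2*(k - n_heads_log2_floor) + 1);
}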
static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
const int col = blockDim.y*blockIdx.y + threadIdx.y;
const int row = blockDim.x*blockIdx.x + threadIdx.x;
if (col >= ncols) {
return;
@@ -3955,24 +4158,29 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
// the CUDA soft max implementation differs from the CPU implementation
// instead of doubles floats are used
- // values are also not normalized to the maximum value by subtracting it in the exponential function
- // theoretically these changes could cause problems with rounding error and arithmetic overflow but for LLaMa it seems to be fine
static __global__ void soft_max_f32(const float * x, float * dst, const int ncols) {
- const int row = blockDim.y*blockIdx.y + threadIdx.y;
- const int block_size = blockDim.x;
- const int tid = threadIdx.x;
+ const int row = blockDim.x*blockIdx.x + threadIdx.x;
+ const int block_size = blockDim.y;
+ const int tid = threadIdx.y;
- float tmp = 0.0;
- for (int block_start = 0; block_start < ncols; block_start += block_size) {
-     const int col = block_start + tid;
-     if (col >= ncols) {
-         break;
-     }
-     const int i = row*ncols + col;
-     const float val = expf(x[i]);
+ float max_val = -INFINITY;
+ for (int col = tid; col < ncols; col += block_size) {
+     const int i = row*ncols + col;
+     max_val = max(max_val, x[i]);
+ }
+ // find the max value in the block
+ #pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+     max_val = max(max_val, __shfl_xor_sync(0xffffffff, max_val, mask, 32));
+ }
+ float tmp = 0.f;
+ for (int col = tid; col < ncols; col += block_size) {
+     const int i = row*ncols + col;
+     const float val = expf(x[i] - max_val);
tmp += val;
dst[i] = val;
}
@@ -3983,15 +4191,11 @@ static __global__ void soft_max_f32(const float * x, float * dst, const int ncol
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
}
- for (int block_start = 0; block_start < ncols; block_start += block_size) {
-     const int col = block_start + tid;
-     if (col >= ncols) {
-         break;
-     }
+ const float inv_tmp = 1.f / tmp;
+ for (int col = tid; col < ncols; col += block_size) {
const int i = row*ncols + col;
- dst[i] /= tmp;
+ dst[i] *= inv_tmp;
}
}
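With the max subtraction added above, the kernel now computes a numerically stable soft max per row: the largest exponent is exactly zero, so expf cannot overflow for large logits. A hedged single-row host reference (name illustrative):

#include <algorithm>
#include <cmath>

static void soft_max_row_ref(const float * x, float * dst, int ncols) {
    float max_val = -INFINITY;
    for (int c = 0; c < ncols; ++c) max_val = std::max(max_val, x[c]);   // row maximum
    float sum = 0.f;
    for (int c = 0; c < ncols; ++c) {
        dst[c] = std::exp(x[c] - max_val);                               // shifted exponentials
        sum += dst[c];
    }
    const float inv_sum = 1.f / sum;
    for (int c = 0; c < ncols; ++c) dst[c] *= inv_sum;                   // normalize
}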
@@ -4032,14 +4236,24 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % WARP_SIZE == 0);
- const dim3 block_dims(WARP_SIZE, 1, 1);
- norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+ if (ncols < 1024) {
+     const dim3 block_dims(WARP_SIZE, 1, 1);
+     norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+ } else {
+     const dim3 block_dims(1024, 1, 1);
+     norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+ }
}
static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
GGML_ASSERT(ncols % WARP_SIZE == 0);
- const dim3 block_dims(WARP_SIZE, 1, 1);
- rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+ if (ncols < 1024) {
+     const dim3 block_dims(WARP_SIZE, 1, 1);
+     rms_norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+ } else {
+     const dim3 block_dims(1024, 1, 1);
+     rms_norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+ }
}
static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
@@ -4561,6 +4775,8 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
#if QK_K == 256
int id;
CUDA_CHECK(cudaGetDevice(&id));
const int compute_capability = g_compute_capabilities[id];
@@ -4592,6 +4808,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
}
#endif
}
static void ggml_mul_mat_q4_K_q8_1_cuda(
@@ -4751,13 +4968,22 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
                          const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
- GGML_ASSERT(nrows % 2 == 0);
+ GGML_ASSERT(ncols % 2 == 0);
- const dim3 block_dims(2*CUDA_ROPE_BLOCK_SIZE, 1, 1);
+ const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
- const dim3 block_nums(num_blocks_x, nrows, 1);
+ const dim3 block_nums(nrows, num_blocks_x, 1);
rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
}
static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
GGML_ASSERT(ncols % 2 == 0);
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
const dim3 block_nums(nrows, num_blocks_x, 1);
rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
}
static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
GGML_ASSERT(nrows % 4 == 0);
const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
@@ -4766,16 +4992,25 @@ static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, con
rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
}
static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
const int k_rows, const int n_heads_log2_floor, const float m0,
const float m1, cudaStream_t stream) {
const dim3 block_dims(CUDA_ALIBI_BLOCK_SIZE, 1, 1);
const int num_blocks_x = (ncols + CUDA_ALIBI_BLOCK_SIZE - 1) / (CUDA_ALIBI_BLOCK_SIZE);
const dim3 block_nums(num_blocks_x, nrows, 1);
alibi_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1);
}
static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
- const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1);
+ const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
- const dim3 block_nums(block_num_x, nrows_x, 1);
+ const dim3 block_nums(nrows_x, block_num_x, 1);
diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
}
static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, cudaStream_t stream) {
- const dim3 block_dims(WARP_SIZE, 1, 1);
+ const dim3 block_dims(1, WARP_SIZE, 1);
- const dim3 block_nums(1, nrows_x, 1);
+ const dim3 block_nums(nrows_x, 1, 1);
soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
}
@@ -4880,10 +5115,18 @@ void ggml_init_cublas() {
static bool initialized = false;
if (!initialized) {
#ifdef __HIP_PLATFORM_AMD__
// Workaround for a rocBLAS bug when using multiple graphics cards:
// https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
rocblas_initialize();
CUDA_CHECK(cudaDeviceSynchronize());
#endif
CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
int64_t total_vram = 0;
- fprintf(stderr, "%s: found %d CUDA devices:\n", __func__, g_device_count);
+ fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
for (int id = 0; id < g_device_count; ++id) {
cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
@@ -5481,7 +5724,8 @@ inline void ggml_cuda_op_rope(
const float theta_scale = powf(freq_base, -2.0f/n_dims);
- const bool is_glm = mode & 4;
+ const bool is_neox = mode & 2;
+ const bool is_glm = mode & 4;
// compute
if (is_glm) {
@@ -5489,6 +5733,10 @@ inline void ggml_cuda_op_rope(
const float id_p = min(p, n_ctx - 2.f);
const float block_p = max(p - (n_ctx - 2.f), 0.f);
rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
} else if (is_neox) {
GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
rope_neox_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
} else {
const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
@@ -5501,6 +5749,41 @@ inline void ggml_cuda_op_rope(
(void) i1;
}
inline void ggml_cuda_op_alibi(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
cudaStream_t & cudaStream_main){
GGML_ASSERT(src0_ddf_i != nullptr);
GGML_ASSERT(dst_ddf_i != nullptr);
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t i01_diff = i01_high - i01_low;
const int n_past = ((int32_t *) dst->op_params)[0];
const int n_head = ((int32_t *) dst->op_params)[1];
float max_bias;
memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
GGML_ASSERT(ne01 + n_past == ne00);
GGML_ASSERT(n_head == ne02);
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
// compute
alibi_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_heads_log2_floor, m0, m1, cudaStream_main);
(void) src1;
(void) src0_ddq_i;
(void) src1_ddf_i;
(void) i1;
}
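As a quick numeric check of the two slope bases computed above (illustrative values, not taken from the tree): with n_head = 12 and max_bias = 8.0f,

n_heads_log2_floor = 2^floor(log2(12)) = 8
m0 = 2^(-max_bias / 8)     = 2^(-1.0) = 0.5
m1 = 2^(-(max_bias/2) / 8) = 2^(-0.5) ≈ 0.7071

so heads 0..7 use successive powers of m0 and heads 8..11 fall back to odd powers of m1, matching the branch in alibi_f32.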
inline void ggml_cuda_op_diag_mask_inf(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
    float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
@@ -6115,12 +6398,19 @@ void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml
void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
const int mode = ((int32_t *) dst->op_params)[2];
const bool is_glm = mode & 4;
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
}
void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_alibi, true, true);
}
void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
(void) src0;
(void) src1;
@@ -6240,7 +6530,7 @@ static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
return extra;
}
- void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
+ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
if (scratch && g_scratch_size == 0) {
return;
}
@@ -6249,14 +6539,19 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
const ggml_op src0_op = tensor->src[0]->op;
if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
- ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
+ ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
}
}
if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
- ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace);
+ ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
}
tensor->backend = GGML_BACKEND_GPU;
if (scratch && no_alloc) {
return;
}
struct ggml_tensor_extra_gpu * extra;
const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
@@ -6308,16 +6603,48 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
tensor->extra = extra;
}
void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset) {
if (g_scratch_size == 0) {
return;
}
if (g_scratch_buffer == nullptr) {
CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
}
struct ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
tensor->op == GGML_OP_VIEW;
if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
size_t view_offset = 0;
if (tensor->op == GGML_OP_VIEW) {
memcpy(&view_offset, tensor->op_params, sizeof(size_t));
}
extra->data_device[g_main_device] = src0_ddc + view_offset;
} else {
extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset;
}
tensor->extra = extra;
}
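A hedged sketch of how the new no_alloc path and ggml_cuda_assign_scratch_offset are meant to be combined by a caller (the wrapper function and the source of the offset below are illustrative, not part of this change):

#include "ggml.h"
#include "ggml-cuda.h"

static void place_on_gpu_scratch(struct ggml_tensor * t, size_t planned_offset) {
    ggml_cuda_assign_buffers_no_alloc(t);                // tag as GGML_BACKEND_GPU, defer the actual allocation
    ggml_cuda_assign_scratch_offset(t, planned_offset);  // later: bind the tensor to g_scratch_buffer + offset
}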
void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
- ggml_cuda_assign_buffers_impl(tensor, true, false);
+ ggml_cuda_assign_buffers_impl(tensor, true, false, false);
}
void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor) {
ggml_cuda_assign_buffers_impl(tensor, true, false, true);
}
void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
- ggml_cuda_assign_buffers_impl(tensor, false, false);
+ ggml_cuda_assign_buffers_impl(tensor, false, false, false);
}
void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
- ggml_cuda_assign_buffers_impl(tensor, false, true);
+ ggml_cuda_assign_buffers_impl(tensor, false, true, false);
}
void ggml_cuda_set_main_device(int main_device) {
@@ -6456,6 +6783,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
}
func = ggml_cuda_rope;
break;
case GGML_OP_ALIBI:
if (!any_on_device) {
return false;
}
func = ggml_cuda_alibi;
break;
default:
return false;
}

View file

@@ -2,6 +2,14 @@
#include "ggml.h"
#ifdef GGML_USE_HIPBLAS
#define GGML_CUDA_NAME "ROCm"
#define GGML_CUBLAS_NAME "hipBLAS"
#else
#define GGML_CUDA_NAME "CUDA"
#define GGML_CUBLAS_NAME "cuBLAS"
#endif
#ifdef __cplusplus
extern "C" {
#endif
@@ -16,9 +24,14 @@ GGML_API bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const str
GGML_API void ggml_cuda_set_tensor_split(const float * tensor_split);
GGML_API void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
GGML_API void ggml_cuda_free_data(struct ggml_tensor * tensor);
GGML_API void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
GGML_API void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
GGML_API void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
GGML_API void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor);
GGML_API void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset);
GGML_API void ggml_cuda_set_main_device(int main_device);
GGML_API void ggml_cuda_set_mul_mat_q(bool mul_mat_q);
GGML_API void ggml_cuda_set_scratch_size(size_t scratch_size);

View file

@@ -24,6 +24,7 @@
// max memory buffers that can be mapped to the device
#define GGML_METAL_MAX_BUFFERS 16
#define GGML_METAL_MAX_COMMAND_BUFFERS 32
struct ggml_tensor;
struct ggml_cgraph;
@@ -38,6 +39,9 @@ struct ggml_metal_context;
struct ggml_metal_context * ggml_metal_init(int n_cb);
void ggml_metal_free(struct ggml_metal_context * ctx);
void * ggml_metal_host_malloc(size_t n);
void   ggml_metal_host_free (void * data);
// set the number of command buffers to use
void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);

View file

@@ -11,6 +11,7 @@
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
// TODO: temporary - reuse llama.cpp logging
#ifdef GGML_METAL_NDEBUG
#define metal_printf(...)
#else
@@ -33,12 +34,15 @@ struct ggml_metal_buffer {
struct ggml_metal_context {
int n_cb;
- float * logits;
id<MTLDevice> device;
id<MTLCommandQueue> queue;
id<MTLLibrary> library;
id<MTLCommandBuffer> command_buffers [GGML_METAL_MAX_COMMAND_BUFFERS];
id<MTLComputeCommandEncoder> command_encoders[GGML_METAL_MAX_COMMAND_BUFFERS];
dispatch_queue_t d_queue;
int n_buffers;
struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
@@ -63,6 +67,7 @@ struct ggml_metal_context {
GGML_METAL_DECL_KERNEL(get_rows_f16);
GGML_METAL_DECL_KERNEL(get_rows_q4_0);
GGML_METAL_DECL_KERNEL(get_rows_q4_1);
GGML_METAL_DECL_KERNEL(get_rows_q8_0);
GGML_METAL_DECL_KERNEL(get_rows_q2_K);
GGML_METAL_DECL_KERNEL(get_rows_q3_K);
GGML_METAL_DECL_KERNEL(get_rows_q4_K);
@@ -71,8 +76,10 @@ struct ggml_metal_context {
GGML_METAL_DECL_KERNEL(rms_norm);
GGML_METAL_DECL_KERNEL(norm);
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row);
GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q2_K_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q3_K_f32);
GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32);
@@ -81,6 +88,7 @@ struct ggml_metal_context {
GGML_METAL_DECL_KERNEL(mul_mm_f16_f32);
GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32);
GGML_METAL_DECL_KERNEL(mul_mm_q4_1_f32);
GGML_METAL_DECL_KERNEL(mul_mm_q8_0_f32);
GGML_METAL_DECL_KERNEL(mul_mm_q2_K_f32);
GGML_METAL_DECL_KERNEL(mul_mm_q3_K_f32);
GGML_METAL_DECL_KERNEL(mul_mm_q4_K_f32);
@ -107,16 +115,31 @@ static NSString * const msl_library_source = @"see metal.metal";
@end @end
struct ggml_metal_context * ggml_metal_init(int n_cb) { struct ggml_metal_context * ggml_metal_init(int n_cb) {
fprintf(stderr, "%s: allocating\n", __func__); metal_printf("%s: allocating\n", __func__);
// Show all the Metal device instances in the system
NSArray * devices = MTLCopyAllDevices();
id <MTLDevice> device;
NSString * s;
for (device in devices) {
s = [device name];
metal_printf("%s: found device: %s\n", __func__, [s UTF8String]);
}
// Pick and show default Metal device
device = MTLCreateSystemDefaultDevice();
s = [device name];
metal_printf("%s: picking default device: %s\n", __func__, [s UTF8String]);
// Configure context
struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
ctx->device = device;
ctx->n_cb = n_cb; ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
ctx->device = MTLCreateSystemDefaultDevice();
ctx->queue = [ctx->device newCommandQueue]; ctx->queue = [ctx->device newCommandQueue];
ctx->n_buffers = 0; ctx->n_buffers = 0;
ctx->concur_list_len = 0; ctx->concur_list_len = 0;
ctx->d_queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
#if 0 #if 0
// compile from source string and show compile log // compile from source string and show compile log
@ -125,7 +148,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error]; ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error];
if (error) { if (error) {
fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL; return NULL;
} }
} }
@ -139,11 +162,11 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
//NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"]; //NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"];
NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]]; NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"]; NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
fprintf(stderr, "%s: loading '%s'\n", __func__, [path UTF8String]); metal_printf("%s: loading '%s'\n", __func__, [path UTF8String]);
NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error]; NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error];
if (error) { if (error) {
fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL; return NULL;
} }
@ -155,7 +178,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error]; ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error];
#endif #endif
if (error) { if (error) {
fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL; return NULL;
} }
} }
@ -167,9 +190,11 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
#define GGML_METAL_ADD_KERNEL(name) \ #define GGML_METAL_ADD_KERNEL(name) \
ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \ ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \ ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
fprintf(stderr, "%s: loaded %-32s %16p\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name); \ metal_printf("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
(int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \
(int) ctx->pipeline_##name.threadExecutionWidth); \
if (error) { \ if (error) { \
fprintf(stderr, "%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ metal_printf("%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
return NULL; \ return NULL; \
} }
@ -186,6 +211,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(get_rows_f16); GGML_METAL_ADD_KERNEL(get_rows_f16);
GGML_METAL_ADD_KERNEL(get_rows_q4_0); GGML_METAL_ADD_KERNEL(get_rows_q4_0);
GGML_METAL_ADD_KERNEL(get_rows_q4_1); GGML_METAL_ADD_KERNEL(get_rows_q4_1);
GGML_METAL_ADD_KERNEL(get_rows_q8_0);
GGML_METAL_ADD_KERNEL(get_rows_q2_K); GGML_METAL_ADD_KERNEL(get_rows_q2_K);
GGML_METAL_ADD_KERNEL(get_rows_q3_K); GGML_METAL_ADD_KERNEL(get_rows_q3_K);
GGML_METAL_ADD_KERNEL(get_rows_q4_K); GGML_METAL_ADD_KERNEL(get_rows_q4_K);
@ -194,8 +220,10 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(rms_norm); GGML_METAL_ADD_KERNEL(rms_norm);
GGML_METAL_ADD_KERNEL(norm); GGML_METAL_ADD_KERNEL(norm);
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32); GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row);
GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32); GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32); GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q2_K_f32); GGML_METAL_ADD_KERNEL(mul_mat_q2_K_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q3_K_f32); GGML_METAL_ADD_KERNEL(mul_mat_q3_K_f32);
GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32); GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32);
@ -203,6 +231,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32); GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32);
GGML_METAL_ADD_KERNEL(mul_mm_f16_f32); GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32); GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32); GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32); GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32);
GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32); GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32);
@ -218,27 +247,101 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
#undef GGML_METAL_ADD_KERNEL #undef GGML_METAL_ADD_KERNEL
} }
fprintf(stderr, "%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); metal_printf("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
fprintf(stderr, "%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false"); metal_printf("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
if (ctx->device.maxTransferRate != 0) { if (ctx->device.maxTransferRate != 0) {
fprintf(stderr, "%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0); metal_printf("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
} else { } else {
fprintf(stderr, "%s: maxTransferRate = built-in GPU\n", __func__); metal_printf("%s: maxTransferRate = built-in GPU\n", __func__);
} }
return ctx; return ctx;
} }
void ggml_metal_free(struct ggml_metal_context * ctx) { void ggml_metal_free(struct ggml_metal_context * ctx) {
fprintf(stderr, "%s: deallocating\n", __func__); metal_printf("%s: deallocating\n", __func__);
#define GGML_METAL_DEL_KERNEL(name) \
[ctx->function_##name release]; \
[ctx->pipeline_##name release];
GGML_METAL_DEL_KERNEL(add);
GGML_METAL_DEL_KERNEL(add_row);
GGML_METAL_DEL_KERNEL(mul);
GGML_METAL_DEL_KERNEL(mul_row);
GGML_METAL_DEL_KERNEL(scale);
GGML_METAL_DEL_KERNEL(silu);
GGML_METAL_DEL_KERNEL(relu);
GGML_METAL_DEL_KERNEL(gelu);
GGML_METAL_DEL_KERNEL(soft_max);
GGML_METAL_DEL_KERNEL(diag_mask_inf);
GGML_METAL_DEL_KERNEL(get_rows_f16);
GGML_METAL_DEL_KERNEL(get_rows_q4_0);
GGML_METAL_DEL_KERNEL(get_rows_q4_1);
GGML_METAL_DEL_KERNEL(get_rows_q8_0);
GGML_METAL_DEL_KERNEL(get_rows_q2_K);
GGML_METAL_DEL_KERNEL(get_rows_q3_K);
GGML_METAL_DEL_KERNEL(get_rows_q4_K);
GGML_METAL_DEL_KERNEL(get_rows_q5_K);
GGML_METAL_DEL_KERNEL(get_rows_q6_K);
GGML_METAL_DEL_KERNEL(rms_norm);
GGML_METAL_DEL_KERNEL(norm);
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row);
GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32);
GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32);
GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32);
GGML_METAL_DEL_KERNEL(mul_mat_q2_K_f32);
GGML_METAL_DEL_KERNEL(mul_mat_q3_K_f32);
GGML_METAL_DEL_KERNEL(mul_mat_q4_K_f32);
GGML_METAL_DEL_KERNEL(mul_mat_q5_K_f32);
GGML_METAL_DEL_KERNEL(mul_mat_q6_K_f32);
GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32);
GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32);
GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32);
GGML_METAL_DEL_KERNEL(mul_mm_q2_K_f32);
GGML_METAL_DEL_KERNEL(mul_mm_q3_K_f32);
GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32);
GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32);
GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32);
GGML_METAL_DEL_KERNEL(rope);
GGML_METAL_DEL_KERNEL(alibi_f32);
GGML_METAL_DEL_KERNEL(cpy_f32_f16);
GGML_METAL_DEL_KERNEL(cpy_f32_f32);
GGML_METAL_DEL_KERNEL(cpy_f16_f16);
#undef GGML_METAL_DEL_KERNEL
for (int i = 0; i < ctx->n_buffers; ++i) { for (int i = 0; i < ctx->n_buffers; ++i) {
[ctx->buffers[i].metal release]; [ctx->buffers[i].metal release];
} }
[ctx->library release];
[ctx->queue release];
[ctx->device release];
dispatch_release(ctx->d_queue);
free(ctx); free(ctx);
} }
void * ggml_metal_host_malloc(size_t n) {
void * data = NULL;
const int result = posix_memalign((void **) &data, getpagesize(), n);
if (result != 0) {
metal_printf("%s: error: posix_memalign failed\n", __func__);
return NULL;
}
return data;
}
void ggml_metal_host_free(void * data) {
free(data);
}
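A hedged usage sketch for the new host allocation helpers (the wrapper and size handling are illustrative): the returned block is page-aligned, so it can later be wrapped without a copy by ggml_metal_add_buffer.

#include "ggml-metal.h"

static void * alloc_metal_host_buffer(size_t n) {
    void * data = ggml_metal_host_malloc(n);   // page-aligned, suitable for newBufferWithBytesNoCopy
    // ... fill data and register it with ggml_metal_add_buffer ...
    return data;                               // release later with ggml_metal_host_free(data)
}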
void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) { void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
ctx->n_cb = n_cb; ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
} }
int ggml_metal_if_optimized(struct ggml_metal_context * ctx) { int ggml_metal_if_optimized(struct ggml_metal_context * ctx) {
@@ -254,7 +357,7 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) {
// Metal buffer based on the host memory pointer
//
static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
//metal_printf("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
const int64_t tsize = ggml_nbytes(t);
@@ -265,13 +368,13 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru
if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
*offs = (size_t) ioffs;
//metal_printf("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
return ctx->buffers[i].metal;
}
}
metal_printf("%s: error: buffer is nil\n", __func__);
return nil;
}
@@ -283,7 +386,7 @@ bool ggml_metal_add_buffer(
size_t size,
size_t max_size) {
if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) {
metal_printf("%s: too many buffers\n", __func__);
return false;
}
@@ -293,7 +396,7 @@ bool ggml_metal_add_buffer(
const int64_t ioffs = (int64_t) data - (int64_t) ctx->buffers[i].data;
if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) {
metal_printf("%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name);
return false;
}
}
@@ -314,11 +417,11 @@ bool ggml_metal_add_buffer(
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
if (ctx->buffers[ctx->n_buffers].metal == nil) {
metal_printf("%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
return false;
}
metal_printf("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
++ctx->n_buffers;
} else {
@@ -338,27 +441,27 @@ bool ggml_metal_add_buffer(
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
if (ctx->buffers[ctx->n_buffers].metal == nil) {
metal_printf("%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
return false;
}
metal_printf("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
if (i + size_step < size) {
metal_printf("\n");
}
++ctx->n_buffers;
}
}
metal_printf(", (%8.2f / %8.2f)",
ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
metal_printf(", warning: current allocated size is greater than the recommended max working set size\n");
} else {
metal_printf("\n");
}
}
@@ -368,8 +471,6 @@ bool ggml_metal_add_buffer(
void ggml_metal_set_tensor(
struct ggml_metal_context * ctx,
struct ggml_tensor * t) {
size_t offs;
id<MTLBuffer> id_dst = ggml_metal_get_buffer(ctx, t, &offs);
@@ -379,8 +480,6 @@ void ggml_metal_set_tensor(
void ggml_metal_get_tensor(
struct ggml_metal_context * ctx,
struct ggml_tensor * t) {
size_t offs;
id<MTLBuffer> id_src = ggml_metal_get_buffer(ctx, t, &offs);
@@ -475,14 +574,14 @@ void ggml_metal_graph_find_concurrency(
}
if (ctx->concur_list_len > GGML_MAX_CONCUR) {
metal_printf("%s: too many elements for metal ctx->concur_list!\n", __func__);
}
}
void ggml_metal_graph_compute(
struct ggml_metal_context * ctx,
struct ggml_cgraph * gf) {
@autoreleasepool {
// if there is ctx->concur_list, dispatch concurrently
// else fallback to serial dispatch
@@ -498,32 +597,28 @@ void ggml_metal_graph_compute(
const int n_cb = ctx->n_cb;
for (int i = 0; i < n_cb; ++i) {
ctx->command_buffers[i] = [ctx->queue commandBuffer];
// enqueue the command buffers in order to specify their execution order
[ctx->command_buffers[i] enqueue];
ctx->command_encoders[i] = [ctx->command_buffers[i] computeCommandEncoderWithDescriptor: edesc];
}
for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb;
dispatch_async(ctx->d_queue, ^{
size_t offs_src0 = 0;
size_t offs_src1 = 0;
size_t offs_dst = 0;
id<MTLCommandBuffer> command_buffer = ctx->command_buffers[cb_idx];
id<MTLComputeCommandEncoder> encoder = ctx->command_encoders[cb_idx];
const int node_start = (cb_idx + 0) * n_nodes_per_cb;
const int node_end = MIN((cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb, n_nodes);
for (int ind = node_start; ind < node_end; ++ind) {
const int i = has_concur ? ctx->concur_list[ind] : ind;
@@ -533,7 +628,7 @@ void ggml_metal_graph_compute(
continue;
}
//metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
struct ggml_tensor * src0 = gf->nodes[i]->src[0];
struct ggml_tensor * src1 = gf->nodes[i]->src[1];
@@ -602,6 +697,12 @@ void ggml_metal_graph_compute(
} break;
case GGML_OP_ADD:
{
GGML_ASSERT(ggml_is_contiguous(src0));
// utilize float4
GGML_ASSERT(ne00 % 4 == 0);
const int64_t nb = ne00/4;
if (ggml_nelements(src1) == ne10) {
// src1 is a row
[encoder setComputePipelineState:ctx->pipeline_add_row];
@@ -611,14 +712,20 @@ void ggml_metal_graph_compute(
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
[encoder setBytes:&nb length:sizeof(nb) atIndex:3];
const int64_t n = ggml_nelements(dst)/4;
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_OP_MUL:
{
GGML_ASSERT(ggml_is_contiguous(src0));
// utilize float4
GGML_ASSERT(ne00 % 4 == 0);
const int64_t nb = ne00/4;
if (ggml_nelements(src1) == ne10) {
// src1 is a row
[encoder setComputePipelineState:ctx->pipeline_mul_row];
@@ -628,9 +735,9 @@ void ggml_metal_graph_compute(
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
[encoder setBytes:&nb length:sizeof(nb) atIndex:3];
const int64_t n = ggml_nelements(dst)/4;
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
@@ -681,7 +788,7 @@ void ggml_metal_graph_compute(
} break;
default:
{
metal_printf("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
GGML_ASSERT(false);
}
} break;
@@ -729,32 +836,32 @@ void ggml_metal_graph_compute(
[ctx->device supportsFamily:MTLGPUFamilyApple7] &&
ne00%32 == 0 &&
ne11 > 1) {
switch (src0->type) {
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32]; break;
case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_0_f32]; break;
case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_1_f32]; break;
case GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q8_0_f32]; break;
case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q2_K_f32]; break;
case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q3_K_f32]; break;
case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_K_f32]; break;
case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_K_f32]; break;
case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q6_K_f32]; break;
default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
}
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:5];
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:6];
[encoder setBytes:&ne12 length:sizeof(ne12) atIndex:7];
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:8];
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:9];
[encoder setBytes:&gqa length:sizeof(gqa) atIndex:10];
[encoder setThreadgroupMemoryLength:8192 atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake( (ne11+31)/32, (ne01+63) / 64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
} else {
int nth0 = 32;
int nth1 = 1;
@@ -762,9 +869,13 @@ void ggml_metal_graph_compute(
switch (src0t) {
case GGML_TYPE_F16:
{
nth0 = 32;
nth1 = 1;
if (ne11 * ne12 < 4) {
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row];
} else {
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
}
} break;
case GGML_TYPE_Q4_0:
{
@@ -784,6 +895,15 @@ void ggml_metal_graph_compute(
nth1 = 8;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_1_f32];
} break;
case GGML_TYPE_Q8_0:
{
GGML_ASSERT(ne02 == 1);
GGML_ASSERT(ne12 == 1);
nth0 = 8;
nth1 = 8;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q8_0_f32];
} break;
case GGML_TYPE_Q2_K:
{
GGML_ASSERT(ne02 == 1);
@@ -807,8 +927,8 @@ void ggml_metal_graph_compute(
GGML_ASSERT(ne02 == 1);
GGML_ASSERT(ne12 == 1);
nth0 = 4; //1;
nth1 = 8; //32;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
} break;
case GGML_TYPE_Q5_K:
@@ -831,7 +951,7 @@ void ggml_metal_graph_compute(
} break;
default:
{
metal_printf("Asserting on type %d\n",(int)src0t);
GGML_ASSERT(false && "not implemented");
}
};
@@ -853,36 +973,40 @@ void ggml_metal_graph_compute(
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14];
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:15];
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:16];
[encoder setBytes:&gqa length:sizeof(gqa) atIndex:17];
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 ||
src0t == GGML_TYPE_Q2_K) {// || src0t == GGML_TYPE_Q4_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
}
else if (src0t == GGML_TYPE_Q4_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
}
else if (src0t == GGML_TYPE_Q3_K) {
#ifdef GGML_QKK_64
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
#else
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
#endif
}
else if (src0t == GGML_TYPE_Q5_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
}
else if (src0t == GGML_TYPE_Q6_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} else {
int64_t ny = (ne11 + 3)/4;
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
}
}
} break;
case GGML_OP_GET_ROWS:
{
switch (src0->type) {
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break;
case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break;
case GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q8_0]; break;
case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_K]; break;
case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_K]; break;
case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_K]; break;
@@ -923,16 +1047,17 @@ void ggml_metal_graph_compute(
} break;
case GGML_OP_NORM:
{
float eps;
memcpy(&eps, dst->op_params, sizeof(float));
const int nth = 256;
[encoder setComputePipelineState:ctx->pipeline_norm];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
[encoder setBytes:&eps length:sizeof( float) atIndex:4];
[encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
const int64_t nrows = ggml_nrows(src0);
@@ -975,7 +1100,9 @@ void ggml_metal_graph_compute(
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
[encoder setBytes:&m0 length:sizeof( float) atIndex:18];
const int nth = 32;
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
case GGML_OP_ROPE:
@@ -990,8 +1117,8 @@ void ggml_metal_graph_compute(
memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
[encoder setComputePipelineState:ctx->pipeline_rope];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
@@ -1042,30 +1169,30 @@ void ggml_metal_graph_compute(
default: GGML_ASSERT(false && "not implemented");
}
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10];
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11];
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12];
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13];
[encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14];
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15];
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
default:
{
metal_printf("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
GGML_ASSERT(false);
}
}
@@ -1081,17 +1208,19 @@ void ggml_metal_graph_compute(
}
// wait for all threads to finish
dispatch_barrier_sync(ctx->d_queue, ^{});
// check status of command buffers
// needed to detect if the device ran out-of-memory for example (#1881)
for (int i = 0; i < n_cb; i++) {
[ctx->command_buffers[i] waitUntilCompleted];
MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status];
if (status != MTLCommandBufferStatusCompleted) {
metal_printf("%s: command buffer %d failed with status %lu\n", __func__, i, status);
GGML_ASSERT(false);
}
}
}
}

View file

@@ -18,10 +18,16 @@ typedef struct {
uint8_t qs[QK4_1 / 2]; // nibbles / quants
} block_q4_1;
#define QK8_0 32
typedef struct {
half d; // delta
int8_t qs[QK8_0]; // quants
} block_q8_0;
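// Size arithmetic for the q8_0 block above (a hedged sketch; it assumes the host-side
// layout in ggml.c mirrors this struct): QK8_0 = 32 int8 quants plus one fp16 scale,
// i.e. 2 + 32 = 34 bytes per block, or 8.5 bits per weight. The struct name below is
// hypothetical and used only for the check.
//
//   typedef struct { ggml_fp16_t d; int8_t qs[32]; } block_q8_0_host;
//   _Static_assert(sizeof(block_q8_0_host) == 34, "q8_0 block should be 34 bytes");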
kernel void kernel_add(
device const float4 * src0,
device const float4 * src1,
device float4 * dst,
uint tpig[[thread_position_in_grid]]) {
dst[tpig] = src0[tpig] + src1[tpig];
}
@@ -29,18 +35,18 @@ kernel void kernel_add(
// assumption: src1 is a row
// broadcast src1 into src0
kernel void kernel_add_row(
device const float4 * src0,
device const float4 * src1,
device float4 * dst,
constant int64_t & nb,
uint tpig[[thread_position_in_grid]]) {
dst[tpig] = src0[tpig] + src1[tpig % nb];
}
kernel void kernel_mul(
device const float4 * src0,
device const float4 * src1,
device float4 * dst,
uint tpig[[thread_position_in_grid]]) {
dst[tpig] = src0[tpig] * src1[tpig];
}
@@ -48,12 +54,12 @@ kernel void kernel_mul(
// assumption: src1 is a row
// broadcast src1 into src0
kernel void kernel_mul_row(
device const float4 * src0,
device const float4 * src1,
device float4 * dst,
constant int64_t & nb,
uint tpig[[thread_position_in_grid]]) {
dst[tpig] = src0[tpig] * src1[tpig % nb];
}
kernel void kernel_scale(
@@ -87,7 +93,12 @@ kernel void kernel_gelu(
device float * dst,
uint tpig[[thread_position_in_grid]]) {
float x = src0[tpig];
// BEWARE !!!
// Simply using "tanh" instead of "precise::tanh" will sometimes results in NaNs!
// This was observed with Falcon 7B and 40B models
//
dst[tpig] = 0.5f*x*(1.0f + precise::tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
}
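// Plain-C reference of the same tanh-based GELU approximation, as a hedged sketch for
// comparison; the NaN warning above concerns Metal's fast-math tanh, not the C library
// tanhf. The constant values are the ones ggml commonly uses (assumption).
//
//   static float gelu_ref(float x) {
//       const float GELU_COEF_A    = 0.044715f;
//       const float SQRT_2_OVER_PI = 0.79788456080286535588f;
//       return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
//   }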
kernel void kernel_soft_max(
@@ -122,19 +133,24 @@ kernel void kernel_soft_max(
threadgroup_barrier(mem_flags::mem_threadgroup);
}
//// broadcast - not needed. There is a threadgroup barrier above in the last iteration of
// the loop, and when that is done, buf[0] has the correct (synchronized) value
//if (tpitg[0] == 0) {
// buf[0] = buf[0];
//}
//threadgroup_barrier(mem_flags::mem_threadgroup);
const float max = buf[0];
// parallel sum
buf[tpitg[0]] = 0.0f;
for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
const float exp_psrc0 = exp(psrc0[i00] - max);
buf[tpitg[0]] += exp_psrc0;
// Remember the result of exp here. exp is expensive, so we really do not
// wish to compute it twice.
pdst[i00] = exp_psrc0;
}
// reduce
@@ -146,17 +162,18 @@ kernel void kernel_soft_max(
threadgroup_barrier(mem_flags::mem_threadgroup);
}
// broadcast - not needed, see above
//// broadcast
//if (tpitg[0] == 0) {
// buf[0] = buf[0];
//}
//threadgroup_barrier(mem_flags::mem_threadgroup);
const float sum = buf[0];
for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
pdst[i00] /= sum;
}
}
@@ -203,25 +220,27 @@ kernel void kernel_norm(
}
threadgroup_barrier(mem_flags::mem_threadgroup);
}
//// broadcast
//if (tpitg == 0) {
// sum[0] /= ne00;
//}
//threadgroup_barrier(mem_flags::mem_threadgroup);
const float mean = sum[0];
// recenter and VARIANCE
device float * y = dst + tgpig*ne00;
sum[tpitg] = 0.0f;
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
y[i00] = x[i00] - mean;
sum[tpitg] += y[i00] * y[i00];
}
//// VARIANCE
//// parallel sum
//sum[tpitg] = 0.0f;
//for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
// sum[tpitg] += y[i00] * y[i00];
//}
// reduce
threadgroup_barrier(mem_flags::mem_threadgroup);
for (uint i = ntg/2; i > 0; i /= 2) {
@@ -230,11 +249,11 @@ kernel void kernel_norm(
}
threadgroup_barrier(mem_flags::mem_threadgroup);
}
//// broadcast
//if (tpitg == 0) {
// sum[0] /= ne00;
//}
//threadgroup_barrier(mem_flags::mem_threadgroup);
const float variance = sum[0];
const float scale = 1.0f/sqrt(variance + eps);
@@ -352,7 +371,7 @@ void mul_vec_q_n_f32(device const void * src0, device const float * src1, device
const int first_row = (r0 * nsg + sgitg) * nr;
const uint offset0 = first_row * nb + im/gqa*(nb*ne0);
device const block_q_type * x = (device const block_q_type *) src0 + offset0;
device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
float yl[16]; // src1 vector cache
float sumf[nr]={0.f};
@@ -424,6 +443,124 @@ kernel void kernel_mul_mat_q4_1_f32(
mul_vec_q_n_f32<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg);
}
#define NB_Q8_0 8
kernel void kernel_mul_mat_q8_0_f32(
device const void * src0,
device const float * src1,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01[[buffer(4)]],
constant int64_t & ne02[[buffer(5)]],
constant int64_t & ne10[[buffer(9)]],
constant int64_t & ne12[[buffer(11)]],
constant int64_t & ne0[[buffer(15)]],
constant int64_t & ne1[[buffer(16)]],
constant uint & gqa[[buffer(17)]],
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
const int nr = N_DST;
const int nsg = N_SIMDGROUP;
const int nw = N_SIMDWIDTH;
const int nb = ne00/QK8_0;
const int r0 = tgpig.x;
const int r1 = tgpig.y;
const int im = tgpig.z;
const int first_row = (r0 * nsg + sgitg) * nr;
const uint offset0 = first_row * nb + im/gqa*(nb*ne0);
device const block_q8_0 * x = (device const block_q8_0 *) src0 + offset0;
device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
float yl[NB_Q8_0];
float sumf[nr]={0.f};
const int ix = tiisg/4;
const int il = tiisg%4;
device const float * yb = y + ix * QK8_0 + NB_Q8_0*il;
// each thread in a SIMD group deals with NB_Q8_0 quants at a time
for (int ib = ix; ib < nb; ib += nw/4) {
for (int i = 0; i < NB_Q8_0; ++i) {
yl[i] = yb[i];
}
for (int row = 0; row < nr; row++) {
device const int8_t * qs = x[ib+row*nb].qs + NB_Q8_0*il;
float sumq = 0.f;
for (int iq = 0; iq < NB_Q8_0; ++iq) {
sumq += qs[iq] * yl[iq];
}
sumf[row] += sumq*x[ib+row*nb].d;
}
yb += NB_Q8_0 * nw;
}
for (int row = 0; row < nr; ++row) {
const float tot = simd_sum(sumf[row]);
if (tiisg == 0 && first_row + row < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot;
}
}
}
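// Reading of the thread layout in the kernel above (an annotation sketch, assuming
// N_SIMDWIDTH = 32): ix = tiisg/4 picks one of 8 starting blocks and il = tiisg%4 picks
// an 8-quant slice inside a 32-quant q8_0 block, so a simdgroup covers 8 blocks per
// iteration and then advances by nw/4 = 8 blocks.
//
//   for (int t = 0; t < 32; ++t) {               // one pass per SIMD lane
//       const int ix = t / 4;                    // 0..7  starting block within the row
//       const int il = t % 4;                    // 0..3  8-value slice inside the block
//       const int first_quant = ix*32 + il*8;    // lane t handles [first_quant, first_quant+8)
//       (void) first_quant;
//   }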
kernel void kernel_mul_mat_f16_f32_1row(
device const char * src0,
device const char * src1,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]]) {
const int64_t r0 = tgpig.x;
const int64_t r1 = tgpig.y;
const int64_t im = tgpig.z;
device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
float sumf = 0;
if (ne00 < 128) {
for (int i = tiisg; i < ne00; i += 32) {
sumf += (float) x[i] * (float) y[i];
}
float all_sum = simd_sum(sumf);
if (tiisg == 0) {
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
}
} else {
device const half4 * x4 = (device const half4 *) x;
device const float4 * y4 = (device const float4 *) y;
for (int i = tiisg; i < ne00/4; i += 32) {
for (int k = 0; k < 4; ++k) sumf += (float)x4[i][k] * y4[i][k];
}
float all_sum = simd_sum(sumf);
if (tiisg == 0) {
for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
}
}
}
#define N_F16_F32 4
kernel void kernel_mul_mat_f16_f32(
device const char * src0,
device const char * src1,
@@ -442,40 +579,60 @@ kernel void kernel_mul_mat_f16_f32(
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]]) {
const int64_t r0 = tgpig.x;
const int64_t rb = tgpig.y*N_F16_F32;
const int64_t im = tgpig.z;
device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
if (ne00 < 128) {
for (int row = 0; row < N_F16_F32; ++row) {
int r1 = rb + row;
if (r1 >= ne11) {
break;
}
device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
float sumf = 0;
for (int i = tiisg; i < ne00; i += 32) {
sumf += (float) x[i] * (float) y[i];
}
float all_sum = simd_sum(sumf);
if (tiisg == 0) {
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
}
}
} else {
device const half4 * x4 = (device const half4 *)x;
for (int row = 0; row < N_F16_F32; ++row) {
int r1 = rb + row;
if (r1 >= ne11) {
break;
}
device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
device const float4 * y4 = (device const float4 *) y;
float sumf = 0;
for (int i = tiisg; i < ne00/4; i += 32) {
for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
}
float all_sum = simd_sum(sumf);
if (tiisg == 0) {
for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
}
}
}
}
kernel void kernel_alibi_f32(
device const float * src0,
device float * dst,
@@ -571,7 +728,25 @@ kernel void kernel_rope(
dst_data[1] = x0*sin_theta + x1*cos_theta;
}
} else {
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
for (int64_t ic = 0; ic < n_dims; ic += 2) {
const float cos_theta = cos(theta);
const float sin_theta = sin(theta);
theta *= theta_scale;
const int64_t i0 = ib*n_dims + ic/2;
device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
const float x0 = src[0];
const float x1 = src[n_dims/2];
dst_data[0] = x0*cos_theta - x1*sin_theta;
dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
}
}
}
}
@@ -1154,7 +1329,8 @@ kernel void kernel_mul_mat_q4_K_f32(
const int r0 = tgpig.x;
const int r1 = tgpig.y;
const int r2 = tgpig.z;
//const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
const int first_row = r0 * N_DST;
const int ib_row = first_row * nb;
const uint offset0 = r2/gqa*(nb*ne0);
device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0;
@@ -1598,12 +1774,12 @@ template <typename type4x4>
void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) {
device const uint16_t * qs = ((device const uint16_t *)xb + 1);
const half d = il ? (xb->d / 16.h) : xb->d;
const half m = il ? ( -8.h * 16.h) : -8.h;
const ushort mask0 = il ? 0x00F0 : 0x000F;
const ushort mask1 = il ? 0xF000 : 0x0F00;
for (int i=0;i<8;i++) {
reg[i/2][2*(i%2)] = (((qs[i] & mask0) ) + m) * d;
reg[i/2][2*(i%2)+1] = (((qs[i] & mask1) >> 8) + m) * d;
}
}
@@ -1617,11 +1793,21 @@ void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg
const ushort mask1 = il ? 0xF000 : 0x0F00;
for (int i=0;i<8;i++) {
reg[i/2][2*(i%2)] = (((qs[i] & mask0) ) * d) + m;
reg[i/2][2*(i%2)+1] = (((qs[i] & mask1) >> 8) * d) + m;
}
}
template <typename type4x4>
void dequantize_q8_0(device const block_q8_0 *xb, short il, thread type4x4 & reg) {
device const int8_t * qs = ((device const int8_t *)xb->qs);
const half d = xb->d;
for (int i=0;i<16;i++) {
reg[i/4][i%4] = (qs[i + 16*il] * d);
}
}
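// Host-side analogue of the template above, as a hedged C sketch: each call dequantizes
// one 16-value half of a q8_0 block, selected by il in {0,1}. The function name is
// illustrative only.
//
//   static void dequantize_q8_0_ref(const int8_t * qs, float d, int il, float * out16) {
//       for (int i = 0; i < 16; ++i) {
//           out16[i] = qs[i + 16*il] * d;    // same arithmetic as the Metal template
//       }
//   }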
template <typename type4x4>
void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg) {
const half d = xb->d;
@@ -1850,6 +2036,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
//load data and store to threadgroup memory
half4x4 temp_a;
dequantize_func(x, il, temp_a);
threadgroup_barrier(mem_flags::mem_threadgroup);
#pragma unroll(16)
for (int i = 0; i < 16; i++) {
*(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
@@ -1895,14 +2082,14 @@ kernel void kernel_mul_mm(device const uchar * src0,
}
} else {
// block is smaller than 64x32, we should avoid writing data outside of the matrix
threadgroup_barrier(mem_flags::mem_threadgroup);
threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
+ 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
for (int i = 0; i < 8; i++) {
simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
}
threadgroup_barrier(mem_flags::mem_threadgroup);
device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
if (sgitg==0) {
for (int i = 0; i < n_rows; i++) {
@@ -1923,9 +2110,10 @@ kernel void kernel_mul_mm(device const uchar * src0,
typedef void (get_rows_t)(device const void *, device const int *, device float *, constant int64_t &, \
constant uint64_t &, constant uint64_t &, uint, uint, uint);
template [[host_name("kernel_get_rows_f16")]] kernel get_rows_t kernel_get_rows<half4x4, 1, dequantize_f16>;
template [[host_name("kernel_get_rows_q4_0")]] kernel get_rows_t kernel_get_rows<block_q4_0, 2, dequantize_q4_0>;
template [[host_name("kernel_get_rows_q4_1")]] kernel get_rows_t kernel_get_rows<block_q4_1, 2, dequantize_q4_1>;
template [[host_name("kernel_get_rows_q8_0")]] kernel get_rows_t kernel_get_rows<block_q8_0, 2, dequantize_q8_0>;
template [[host_name("kernel_get_rows_q2_K")]] kernel get_rows_t kernel_get_rows<block_q2_K, QK_NL, dequantize_q2_K>;
template [[host_name("kernel_get_rows_q3_K")]] kernel get_rows_t kernel_get_rows<block_q3_K, QK_NL, dequantize_q3_K>;
template [[host_name("kernel_get_rows_q4_K")]] kernel get_rows_t kernel_get_rows<block_q4_K, QK_NL, dequantize_q4_K>;
@@ -1936,9 +2124,10 @@ typedef void (mat_mm_t)(device const uchar *, device const float *, device float
constant int64_t &, constant int64_t &, constant int64_t &, constant int64_t &, \
constant int64_t &, constant int64_t &, constant uint &, threadgroup uchar *, uint3, uint, uint);
template [[host_name("kernel_mul_mm_f16_f32")]] kernel mat_mm_t kernel_mul_mm<half4x4, 1, dequantize_f16>;
template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_0, 2, dequantize_q4_0>;
template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_1, 2, dequantize_q4_1>;
template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q8_0, 2, dequantize_q8_0>;
template [[host_name("kernel_mul_mm_q2_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q2_K, QK_NL, dequantize_q2_K>;
template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q3_K, QK_NL, dequantize_q3_K>;
template [[host_name("kernel_mul_mm_q4_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_K, QK_NL, dequantize_q4_K>;

View file

@@ -1493,7 +1493,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
d_X = (cl_mem) src0->data;
} else {
d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
}
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);

ggml.c
File diff suppressed because it is too large
ggml.h
View file

@@ -130,13 +130,16 @@
// The data of the tensor is accessed via the "data" pointer. For example:
//
// {
// const int nx = 2;
// const int ny = 3;
//
// struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, ny);
//
// for (int y = 0; y < ny; y++) {
// for (int x = 0; x < nx; x++) {
// *(float *) ((char *) a->data + y*a->nb[1] + x*a->nb[0]) = x + y;
// }
// }
//
// ...
// }
@@ -207,14 +210,24 @@
#define GGML_MAX_PARAMS 256
#define GGML_MAX_CONTEXTS 64
#define GGML_MAX_SRC 6
#define GGML_MAX_NAME 64
#define GGML_MAX_OP_PARAMS 32
#define GGML_DEFAULT_N_THREADS 4
#if UINTPTR_MAX == 0xFFFFFFFF
#define GGML_MEM_ALIGN 4
#else
#define GGML_MEM_ALIGN 16
#endif
#define GGML_EXIT_SUCCESS 0
#define GGML_EXIT_ABORTED 1
#define GGUF_MAGIC 0x46554747 // "GGUF"
#define GGUF_VERSION 2
#define GGUF_DEFAULT_ALIGNMENT 32
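// Worked example for the magic above: 0x46554747 stored little-endian is the byte
// sequence 0x47 0x47 0x55 0x46, i.e. "GGUF", which is how the magic appears at the
// start of a GGUF file (a sketch, not part of the header):
//
//   const uint32_t magic = GGUF_MAGIC;
//   const unsigned char gguf[4] = { 'G', 'G', 'U', 'F' };
//   // memcmp(&magic, gguf, 4) == 0 on little-endian hosts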
#define GGML_UNUSED(x) (void)(x)
#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
@@ -255,8 +268,9 @@
extern "C" {
#endif
#if defined(__ARM_NEON) && defined(__CUDACC__)
typedef half ggml_fp16_t;
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
typedef __fp16 ggml_fp16_t;
#else
typedef uint16_t ggml_fp16_t;
@@ -340,10 +354,12 @@ extern "C" {
GGML_OP_ARGMAX,
GGML_OP_REPEAT,
GGML_OP_REPEAT_BACK,
GGML_OP_CONCAT,
GGML_OP_SILU_BACK,
GGML_OP_NORM, // normalize
GGML_OP_RMS_NORM,
GGML_OP_RMS_NORM_BACK,
GGML_OP_GROUP_NORM,
GGML_OP_MUL_MAT,
GGML_OP_OUT_PROD,
@@ -369,14 +385,19 @@ extern "C" {
GGML_OP_CLAMP,
GGML_OP_CONV_1D,
GGML_OP_CONV_2D,
GGML_OP_CONV_TRANSPOSE_2D,
GGML_OP_POOL_1D,
GGML_OP_POOL_2D,
GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_FLASH_ATTN,
GGML_OP_FLASH_FF,
GGML_OP_FLASH_ATTN_BACK,
GGML_OP_WIN_PART,
GGML_OP_WIN_UNPART,
GGML_OP_GET_REL_POS,
GGML_OP_ADD_REL_POS,
GGML_OP_UNARY,
@@ -458,6 +479,9 @@ extern "C" {
int64_t perf_cycles;
int64_t perf_time_us;
struct ggml_tensor * view_src;
size_t view_offs;
void * data;
char name[GGML_MAX_NAME];
@@ -562,6 +586,7 @@ extern "C" {
GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
GGML_API int ggml_blck_size (enum ggml_type type);
@@ -639,7 +664,7 @@ extern "C" {
GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
@@ -799,6 +824,13 @@ extern "C" {
struct ggml_tensor * a,
struct ggml_tensor * b);
// concat a and b on dim 2
// used in stable-diffusion
GGML_API struct ggml_tensor * ggml_concat(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
GGML_API struct ggml_tensor * ggml_abs(
struct ggml_context * ctx,
struct ggml_tensor * a);
@@ -888,14 +920,15 @@ extern "C" {
struct ggml_tensor * b);
// normalize along rows
GGML_API struct ggml_tensor * ggml_norm(
struct ggml_context * ctx,
struct ggml_tensor * a,
float eps);
GGML_API struct ggml_tensor * ggml_norm_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
float eps);
GGML_API struct ggml_tensor * ggml_rms_norm(
struct ggml_context * ctx,
@@ -907,13 +940,26 @@ extern "C" {
struct ggml_tensor * a,
float eps);
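// Usage sketch for the new explicit-eps signatures; the call sites and the 1e-5f/1e-6f
// values are assumptions about typical usage, not part of this header:
//
//   struct ggml_tensor * ln  = ggml_norm    (ctx, cur, 1e-5f);   // was: ggml_norm(ctx, cur)
//   struct ggml_tensor * rms = ggml_rms_norm(ctx, cur, 1e-6f);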
// group normalize along ne0*ne1*n_groups
// used in stable-diffusion
// TODO: eps is hardcoded to 1e-6 for now
GGML_API struct ggml_tensor * ggml_group_norm(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_groups);
GGML_API struct ggml_tensor * ggml_group_norm_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_groups);
// a - x
// b - dy
GGML_API struct ggml_tensor * ggml_rms_norm_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
float eps);
// A: n columns, m rows
// B: n columns, p rows (i.e. we transpose it internally)
@ -1207,6 +1253,15 @@ extern "C" {
float freq_base, float freq_base,
float freq_scale); float freq_scale);
// xPos RoPE, in-place, returns view(a)
GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past,
int n_dims,
float base,
bool down);
// rotary position embedding backward, i.e compute dx from dy // rotary position embedding backward, i.e compute dx from dy
// a - dy // a - dy
GGML_API struct ggml_tensor * ggml_rope_back( GGML_API struct ggml_tensor * ggml_rope_back(
@ -1215,7 +1270,11 @@ extern "C" {
int n_past, int n_past,
int n_dims, int n_dims,
int mode, int mode,
int n_ctx); int n_ctx,
float freq_base,
float freq_scale,
float xpos_base,
bool xpos_down);
// alibi position embedding // alibi position embedding
// in-place, returns view(a) // in-place, returns view(a)
@ -1242,6 +1301,15 @@ extern "C" {
int p0, // padding int p0, // padding
int d0); // dilation int d0); // dilation
// conv_1d with padding = half
// alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
GGML_API struct ggml_tensor* ggml_conv_1d_ph(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int s,
int d);
GGML_API struct ggml_tensor * ggml_conv_2d( GGML_API struct ggml_tensor * ggml_conv_2d(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
@ -1253,14 +1321,38 @@ extern "C" {
int d0, int d0,
int d1); int d1);
// conv_1d with padding = half
// alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d) // kernel size is a->ne[0] x a->ne[1]
GGML_API struct ggml_tensor * ggml_conv_1d_ph( // stride is equal to kernel size
// padding is zero
// example:
// a: 16 16 3 768
// b: 1024 1024 3 1
// res: 64 64 768 1
// used in sam
GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
// kernel size is a->ne[0] x a->ne[1]
// stride is 1
// padding is half
// example:
// a: 3 3 256 256
// b: 64 64 256 1
// res: 64 64 256 1
// used in sam
GGML_API struct ggml_tensor * ggml_conv_2d_s1_ph(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b, struct ggml_tensor * b,
int s, int stride);
int d);
enum ggml_op_pool { enum ggml_op_pool {
GGML_OP_POOL_MAX, GGML_OP_POOL_MAX,
@ -1287,6 +1379,13 @@ extern "C" {
int p0, int p0,
int p1); int p1);
// nearest interpolate
// used in stable-diffusion
GGML_API struct ggml_tensor * ggml_upscale(
struct ggml_context * ctx,
struct ggml_tensor * a,
int scale_factor);
GGML_API struct ggml_tensor * ggml_flash_attn( GGML_API struct ggml_tensor * ggml_flash_attn(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * q, struct ggml_tensor * q,
@ -1340,6 +1439,27 @@ extern "C" {
struct ggml_tensor * a, struct ggml_tensor * a,
enum ggml_unary_op op); enum ggml_unary_op op);
// used in sam
GGML_API struct ggml_tensor * ggml_get_rel_pos(
struct ggml_context * ctx,
struct ggml_tensor * a,
int qh,
int kh);
// used in sam
GGML_API struct ggml_tensor * ggml_add_rel_pos(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * pw,
struct ggml_tensor * ph);
GGML_API struct ggml_tensor * ggml_add_rel_pos_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * pw,
struct ggml_tensor * ph);
// custom operators // custom operators
typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *); typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
@ -1495,7 +1615,8 @@ extern "C" {
struct ggml_tensor * tensor); struct ggml_tensor * tensor);
GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor); GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep); GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
@ -1560,6 +1681,8 @@ extern "C" {
GGML_LINESEARCH_INVALID_PARAMETERS, GGML_LINESEARCH_INVALID_PARAMETERS,
}; };
typedef void (*ggml_opt_callback)(void * data, float * sched);
// optimization parameters // optimization parameters
// //
// see ggml.c (ggml_opt_default_params) for default values // see ggml.c (ggml_opt_default_params) for default values
@ -1595,12 +1718,14 @@ extern "C" {
float sched; // schedule multiplier (fixed, decay or warmup) float sched; // schedule multiplier (fixed, decay or warmup)
float decay; // weight decay for AdamW, use 0.0f to disable float decay; // weight decay for AdamW, use 0.0f to disable
int decay_min_ndim; // minimum number of tensor dimension to apply weight decay
float alpha; // learning rate float alpha; // learning rate
float beta1; float beta1;
float beta2; float beta2;
float eps; // epsilon for numerical stability float eps; // epsilon for numerical stability
float eps_f; // epsilon for convergence test float eps_f; // epsilon for convergence test
float eps_g; // epsilon for convergence test float eps_g; // epsilon for convergence test
float gclip; // gradient clipping
} adam; } adam;
// LBFGS parameters // LBFGS parameters
@ -1628,14 +1753,12 @@ extern "C" {
bool just_initialized; bool just_initialized;
float loss_before;
float loss_after;
struct { struct {
struct ggml_tensor * x; // view of the parameters
struct ggml_tensor * g1; // gradient
struct ggml_tensor * g2; // gradient squared
struct ggml_tensor * m; // first moment struct ggml_tensor * m; // first moment
struct ggml_tensor * v; // second moment struct ggml_tensor * v; // second moment
struct ggml_tensor * mh; // first moment hat
struct ggml_tensor * vh; // second moment hat
struct ggml_tensor * pf; // past function values struct ggml_tensor * pf; // past function values
float fx_best; float fx_best;
float fx_prev; float fx_prev;
@ -1672,10 +1795,10 @@ extern "C" {
// initialize optimizer context // initialize optimizer context
GGML_API void ggml_opt_init( GGML_API void ggml_opt_init(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_opt_context * opt, struct ggml_opt_context * opt,
struct ggml_opt_params params, struct ggml_opt_params params,
int64_t nx); int64_t nx);
// continue optimizing the function defined by the tensor f // continue optimizing the function defined by the tensor f
GGML_API enum ggml_opt_result ggml_opt_resume( GGML_API enum ggml_opt_result ggml_opt_resume(
@ -1689,7 +1812,9 @@ extern "C" {
struct ggml_opt_context * opt, struct ggml_opt_context * opt,
struct ggml_tensor * f, struct ggml_tensor * f,
struct ggml_cgraph * gf, struct ggml_cgraph * gf,
struct ggml_cgraph * gb); struct ggml_cgraph * gb,
ggml_opt_callback callback,
void * callback_data);
// //
// quantization // quantization
@ -1703,6 +1828,127 @@ extern "C" {
GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist); GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
//
// gguf
//
enum gguf_type {
GGUF_TYPE_UINT8 = 0,
GGUF_TYPE_INT8 = 1,
GGUF_TYPE_UINT16 = 2,
GGUF_TYPE_INT16 = 3,
GGUF_TYPE_UINT32 = 4,
GGUF_TYPE_INT32 = 5,
GGUF_TYPE_FLOAT32 = 6,
GGUF_TYPE_BOOL = 7,
GGUF_TYPE_STRING = 8,
GGUF_TYPE_ARRAY = 9,
GGUF_TYPE_UINT64 = 10,
GGUF_TYPE_INT64 = 11,
GGUF_TYPE_FLOAT64 = 12,
GGUF_TYPE_COUNT, // marks the end of the enum
};
struct gguf_context;
struct gguf_init_params {
bool no_alloc;
// if not NULL, create a ggml_context and allocate the tensor data in it
struct ggml_context ** ctx;
};
GGML_API struct gguf_context * gguf_init_empty(void);
GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
//GGML_API struct gguf_context * gguf_init_from_buffer(..);
GGML_API void gguf_free(struct gguf_context * ctx);
GGML_API const char * gguf_type_name(enum gguf_type type);
GGML_API int gguf_get_version (struct gguf_context * ctx);
GGML_API size_t gguf_get_alignment (struct gguf_context * ctx);
GGML_API size_t gguf_get_data_offset(struct gguf_context * ctx);
GGML_API void * gguf_get_data (struct gguf_context * ctx);
GGML_API int gguf_get_n_kv(struct gguf_context * ctx);
GGML_API int gguf_find_key(struct gguf_context * ctx, const char * key);
GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i);
GGML_API enum gguf_type gguf_get_kv_type (struct gguf_context * ctx, int i);
GGML_API enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i);
// results are undefined if the wrong type is used for the key
GGML_API uint8_t gguf_get_val_u8 (struct gguf_context * ctx, int i);
GGML_API int8_t gguf_get_val_i8 (struct gguf_context * ctx, int i);
GGML_API uint16_t gguf_get_val_u16 (struct gguf_context * ctx, int i);
GGML_API int16_t gguf_get_val_i16 (struct gguf_context * ctx, int i);
GGML_API uint32_t gguf_get_val_u32 (struct gguf_context * ctx, int i);
GGML_API int32_t gguf_get_val_i32 (struct gguf_context * ctx, int i);
GGML_API float gguf_get_val_f32 (struct gguf_context * ctx, int i);
GGML_API uint64_t gguf_get_val_u64 (struct gguf_context * ctx, int i);
GGML_API int64_t gguf_get_val_i64 (struct gguf_context * ctx, int i);
GGML_API double gguf_get_val_f64 (struct gguf_context * ctx, int i);
GGML_API bool gguf_get_val_bool(struct gguf_context * ctx, int i);
GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i);
GGML_API int gguf_get_arr_n (struct gguf_context * ctx, int i);
GGML_API const void * gguf_get_arr_data(struct gguf_context * ctx, int i);
GGML_API const char * gguf_get_arr_str (struct gguf_context * ctx, int key_id, int i);
GGML_API int gguf_get_n_tensors (struct gguf_context * ctx);
GGML_API int gguf_find_tensor (struct gguf_context * ctx, const char * name);
GGML_API size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i);
GGML_API char * gguf_get_tensor_name (struct gguf_context * ctx, int i);
// overrides existing values or adds a new one
GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val);
GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);
GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t val);
GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val);
GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val);
GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val);
GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t val);
GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double val);
GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val);
GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n);
// set or add KV pairs from another context
GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src);
// manage tensor info
GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
// writing gguf files can be done in 2 ways:
//
// - write the entire gguf_context to a binary file in a single pass:
//
// gguf_write_to_file(ctx, fname);
//
// - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
//
// FILE * f = fopen(fname, "wb");
// fseek(f, gguf_get_meta_size(ctx), SEEK_SET);
// fwrite(f, ...);
// void * data = gguf_meta_get_meta_data(ctx);
// fseek(f, 0, SEEK_SET);
// fwrite(f, data, gguf_get_meta_size(ctx));
// free(data);
// fclose(f);
//
// write the entire context to a binary file
GGML_API void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta);
// get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
GGML_API size_t gguf_get_meta_size(struct gguf_context * ctx);
GGML_API void gguf_get_meta_data(struct gguf_context * ctx, void * data);
// //
// system info // system info
// //
@ -1723,6 +1969,7 @@ extern "C" {
GGML_API int ggml_cpu_has_clblast (void); GGML_API int ggml_cpu_has_clblast (void);
GGML_API int ggml_cpu_has_gpublas (void); GGML_API int ggml_cpu_has_gpublas (void);
GGML_API int ggml_cpu_has_sse3 (void); GGML_API int ggml_cpu_has_sse3 (void);
GGML_API int ggml_cpu_has_ssse3 (void);
GGML_API int ggml_cpu_has_vsx (void); GGML_API int ggml_cpu_has_vsx (void);
// //

21
gguf-py/LICENSE Normal file
View file

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2023 Georgi Gerganov
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

72
gguf-py/README.md Normal file
View file

@ -0,0 +1,72 @@
## gguf
This is a Python package for writing binary files in the [GGUF](https://github.com/ggerganov/ggml/pull/302)
(GGML Universal File) format.
See [convert-llama-hf-to-gguf.py](https://github.com/ggerganov/llama.cpp/blob/master/convert-llama-hf-to-gguf.py)
for an example of its usage.
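For a quick impression of the API, here is a minimal sketch of writing a small file with `GGUFWriter` (mirroring the example at the bottom of `gguf/gguf.py`; the file name and metadata keys are illustrative):
```python
import numpy as np
import gguf

# Create a writer for a llama-architecture file; the architecture string is
# stored under the `general.architecture` key automatically.
writer = gguf.GGUFWriter("example.gguf", "llama")

# Standard metadata has dedicated helpers ...
writer.add_block_count(12)
writer.add_context_length(2048)
# ... and arbitrary key/value pairs can be written directly.
writer.add_uint32("answer", 42)

# Tensors are queued first; header, KV data and tensor data are then written in order.
writer.add_tensor("tensor1", np.ones((32,), dtype=np.float32))

writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.write_tensors_to_file()
writer.close()
```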
## Installation
```sh
pip install gguf
```
## Development
Maintainers who participate in development of this package are advised to install it in editable mode:
```sh
cd /path/to/llama.cpp/gguf-py
pip install --editable .
```
**Note**: This may require upgrading your Pip installation, indicated by a message saying that editable installation currently requires `setup.py`.
In this case, upgrade Pip to the latest version:
```sh
pip install --upgrade pip
```
## Automatic publishing with CI
There's a GitHub workflow to make a release automatically upon creation of tags in a specified format.
1. Bump the version in `pyproject.toml`.
2. Create a tag named `gguf-vx.x.x` where `x.x.x` is the semantic version number.
```sh
git tag -a gguf-v1.0.0 -m "Version 1.0 release"
```
3. Push the tags.
```sh
git push origin --tags
```
## Manual publishing
If you want to publish the package manually for any reason, you need to have `twine` and `build` installed:
```sh
pip install build twine
```
Then, follow these steps to release a new version:
1. Bump the version in `pyproject.toml`.
2. Build the package:
```sh
python -m build
```
3. Upload the generated distribution archives:
```sh
python -m twine upload dist/*
```
## TODO
- [ ] Add tests
- [ ] Include conversion scripts as command line entry points in this package.
- [ ] Add CI workflow for releasing the package.

1
gguf-py/gguf/__init__.py Normal file
View file

@ -0,0 +1 @@
from .gguf import *

860
gguf-py/gguf/gguf.py Normal file
View file

@ -0,0 +1,860 @@
#!/usr/bin/env python3
from __future__ import annotations
import json
import os
import shutil
import struct
import sys
import tempfile
from enum import IntEnum, auto
from io import BufferedWriter
from pathlib import Path
from typing import IO, Any, BinaryIO, Callable, Sequence
import numpy as np
#
# constants
#
GGUF_MAGIC = 0x46554747
GGUF_VERSION = 2
GGUF_DEFAULT_ALIGNMENT = 32
# general
KEY_GENERAL_ARCHITECTURE = "general.architecture"
KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version"
KEY_GENERAL_ALIGNMENT = "general.alignment"
KEY_GENERAL_NAME = "general.name"
KEY_GENERAL_AUTHOR = "general.author"
KEY_GENERAL_URL = "general.url"
KEY_GENERAL_DESCRIPTION = "general.description"
KEY_GENERAL_LICENSE = "general.license"
KEY_GENERAL_SOURCE_URL = "general.source.url"
KEY_GENERAL_SOURCE_HF_REPO = "general.source.hugginface.repository"
KEY_GENERAL_FILE_TYPE = "general.file_type"
# LLM
KEY_CONTEXT_LENGTH = "{arch}.context_length"
KEY_EMBEDDING_LENGTH = "{arch}.embedding_length"
KEY_BLOCK_COUNT = "{arch}.block_count"
KEY_FEED_FORWARD_LENGTH = "{arch}.feed_forward_length"
KEY_USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
KEY_TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
# attention
KEY_ATTENTION_HEAD_COUNT = "{arch}.attention.head_count"
KEY_ATTENTION_HEAD_COUNT_KV = "{arch}.attention.head_count_kv"
KEY_ATTENTION_MAX_ALIBI_BIAS = "{arch}.attention.max_alibi_bias"
KEY_ATTENTION_CLAMP_KQV = "{arch}.attention.clamp_kqv"
KEY_ATTENTION_LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon"
KEY_ATTENTION_LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
# RoPE
KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count"
KEY_ROPE_FREQ_BASE = "{arch}.rope.freq_base"
KEY_ROPE_SCALE_LINEAR = "{arch}.rope.scale_linear"
# tokenization
KEY_TOKENIZER_MODEL = "tokenizer.ggml.model"
KEY_TOKENIZER_LIST = "tokenizer.ggml.tokens"
KEY_TOKENIZER_TOKEN_TYPE = "tokenizer.ggml.token_type"
KEY_TOKENIZER_SCORES = "tokenizer.ggml.scores"
KEY_TOKENIZER_MERGES = "tokenizer.ggml.merges"
KEY_TOKENIZER_BOS_ID = "tokenizer.ggml.bos_token_id"
KEY_TOKENIZER_EOS_ID = "tokenizer.ggml.eos_token_id"
KEY_TOKENIZER_UNK_ID = "tokenizer.ggml.unknown_token_id"
KEY_TOKENIZER_SEP_ID = "tokenizer.ggml.seperator_token_id"
KEY_TOKENIZER_PAD_ID = "tokenizer.ggml.padding_token_id"
KEY_TOKENIZER_HF_JSON = "tokenizer.huggingface.json"
KEY_TOKENIZER_RWKV = "tokenizer.rwkv.world"
#
# recommended mapping of model tensor names for storage in gguf
#
class MODEL_ARCH(IntEnum):
LLAMA : int = auto()
FALCON : int = auto()
GPT2 : int = auto()
GPTJ : int = auto()
GPTNEOX: int = auto()
MPT : int = auto()
class MODEL_TENSOR(IntEnum):
TOKEN_EMBD : int = auto()
POS_EMBD : int = auto()
OUTPUT : int = auto()
OUTPUT_NORM : int = auto()
ROPE_FREQS : int = auto()
ATTN_Q : int = auto()
ATTN_K : int = auto()
ATTN_V : int = auto()
ATTN_QKV : int = auto()
ATTN_OUT : int = auto()
ATTN_NORM : int = auto()
ATTN_NORM_2 : int = auto()
ATTN_ROT_EMBD: int = auto()
FFN_GATE : int = auto()
FFN_DOWN : int = auto()
FFN_UP : int = auto()
FFN_NORM : int = auto()
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.LLAMA: "llama",
MODEL_ARCH.FALCON: "falcon",
MODEL_ARCH.GPT2: "gpt2",
MODEL_ARCH.GPTJ: "gptj",
MODEL_ARCH.GPTNEOX: "gptneox",
MODEL_ARCH.MPT: "mpt",
}
MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = {
MODEL_ARCH.LLAMA: {
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
MODEL_TENSOR.OUTPUT: "output",
MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
},
MODEL_ARCH.GPTNEOX: {
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
MODEL_TENSOR.OUTPUT: "output",
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
},
MODEL_ARCH.FALCON: {
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
MODEL_TENSOR.OUTPUT: "output",
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
},
MODEL_ARCH.GPT2: {
# TODO
},
# TODO
}
# tensors that will not be serialized
MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_ARCH.LLAMA: [
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_ROT_EMBD,
],
}
class TensorNameMap:
mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
# Token embeddings
MODEL_TENSOR.TOKEN_EMBD: (
"gpt_neox.embed_in", # gptneox
"transformer.wte", # gpt2 mpt
"transformer.word_embeddings", # falcon
"model.embed_tokens", # llama-hf
"tok_embeddings", # llama-pth
),
# Position embeddings
MODEL_TENSOR.POS_EMBD: (
"transformer.wpe", # gpt2
),
# Output
MODEL_TENSOR.OUTPUT: (
"embed_out", # gptneox
"lm_head", # gpt2 mpt falcon llama-hf
"output", # llama-pth
),
# Output norm
MODEL_TENSOR.OUTPUT_NORM: (
"gpt_neox.final_layer_norm", # gptneox
"transformer.ln_f", # gpt2 falcon
"model.norm", # llama-hf
"norm", # llama-pth
),
# Rope frequencies
MODEL_TENSOR.ROPE_FREQS: (
"rope.freqs", # llama-pth
),
}
block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
# Attention norm
MODEL_TENSOR.ATTN_NORM: (
"gpt_neox.layers.{bid}.input_layernorm", # gptneox
"transformer.h.{bid}.ln_1", # gpt2
"transformer.blocks.{bid}.norm_1", # mpt
"transformer.h.{bid}.input_layernorm", # falcon7b
"transformer.h.{bid}.ln_mlp", # falcon40b
"model.layers.{bid}.input_layernorm", # llama-hf
"layers.{bid}.attention_norm", # llama-pth
),
# Attention norm 2
MODEL_TENSOR.ATTN_NORM_2: (
"transformer.h.{bid}.ln_attn", # falcon40b
),
# Attention query-key-value
MODEL_TENSOR.ATTN_QKV: (
"gpt_neox.layers.{bid}.attention.query_key_value", # gptneox
"transformer.h.{bid}.attn.c_attn", # gpt2
"transformer.blocks.{bid}.attn.Wqkv", # mpt
"transformer.h.{bid}.self_attention.query_key_value", # falcon
),
# Attention query
MODEL_TENSOR.ATTN_Q: (
"model.layers.{bid}.self_attn.q_proj", # llama-hf
"layers.{bid}.attention.wq", # llama-pth
),
# Attention key
MODEL_TENSOR.ATTN_K: (
"model.layers.{bid}.self_attn.k_proj", # llama-hf
"layers.{bid}.attention.wk", # llama-pth
),
# Attention value
MODEL_TENSOR.ATTN_V: (
"model.layers.{bid}.self_attn.v_proj", # llama-hf
"layers.{bid}.attention.wv", # llama-pth
),
# Attention output
MODEL_TENSOR.ATTN_OUT: (
"gpt_neox.layers.{bid}.attention.dense", # gptneox
"transformer.h.{bid}.attn.c_proj", # gpt2
"transformer.blocks.{bid}.attn.out_proj", # mpt
"transformer.h.{bid}.self_attention.dense", # falcon
"model.layers.{bid}.self_attn.o_proj", # llama-hf
"layers.{bid}.attention.wo", # llama-pth
),
# Rotary embeddings
MODEL_TENSOR.ATTN_ROT_EMBD: (
"model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf
"layers.{bid}.attention.inner_attention.rope.freqs", # llama-pth
),
# Feed-forward norm
MODEL_TENSOR.FFN_NORM: (
"gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
"transformer.h.{bid}.ln_2", # gpt2
"transformer.blocks.{bid}.norm_2", # mpt
"model.layers.{bid}.post_attention_layernorm", # llama-hf
"layers.{bid}.ffn_norm", # llama-pth
),
# Feed-forward up
MODEL_TENSOR.FFN_UP: (
"gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox
"transformer.h.{bid}.mlp.c_fc", # gpt2
"transformer.blocks.{bid}.ffn.up_proj", # mpt
"transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
"model.layers.{bid}.mlp.up_proj", # llama-hf
"layers.{bid}.feed_forward.w3", # llama-pth
),
# Feed-forward gate
MODEL_TENSOR.FFN_GATE: (
"model.layers.{bid}.mlp.gate_proj", # llama-hf
"layers.{bid}.feed_forward.w1", # llama-pth
),
# Feed-forward down
MODEL_TENSOR.FFN_DOWN: (
"gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox
"transformer.h.{bid}.mlp.c_proj", # gpt2
"transformer.blocks.{bid}.ffn.down_proj", # mpt
"transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
"model.layers.{bid}.mlp.down_proj", # llama-hf
"layers.{bid}.feed_forward.w2", # llama-pth
),
}
mapping: dict[str, tuple[MODEL_TENSOR, str]]
tensor_names: dict[MODEL_TENSOR, str]
def __init__(self, arch: MODEL_ARCH, n_blocks: int):
mapping = self.mapping = {}
tensor_names = self.tensor_names = MODEL_TENSOR_NAMES[arch]
for tensor, keys in self.mappings_cfg.items():
tensor_name = tensor_names.get(tensor)
if tensor_name is None:
continue
for key in keys:
mapping[key] = (tensor, tensor_name)
for bid in range(n_blocks):
for tensor, keys in self.block_mappings_cfg.items():
tensor_name = tensor_names.get(tensor)
if tensor_name is None:
continue
tensor_name = tensor_name.format(bid = bid)
for key in keys:
key = key.format(bid = bid)
mapping[key] = (tensor, tensor_name)
def get_type_and_name(self, key: str, try_suffixes: Sequence[str]) -> tuple[MODEL_TENSOR, str] | None:
result = self.mapping.get(key)
if result is not None:
return result
for suffix in try_suffixes:
if key.endswith(suffix):
result = self.mapping.get(key[:-len(suffix)])
if result is not None:
return (result[0], result[1] + suffix)
return None
def get_name(self, key: str, try_suffixes: Sequence[str]) -> str | None:
result = self.get_type_and_name(key, try_suffixes = try_suffixes)
if result is None:
return None
return result[1]
def get_type(self, key: str, try_suffixes: Sequence[str]) -> MODEL_TENSOR | None:
result = self.get_type_and_name(key, try_suffixes = try_suffixes)
if result is None:
return None
return result[0]
def __getitem__(self, key: str) -> str:
try:
return self.mapping[key][1]
except KeyError:
raise KeyError(key)
def __contains__(self, key: str) -> bool:
return key in self.mapping
def __repr__(self) -> str:
return repr(self.mapping)
def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> TensorNameMap:
return TensorNameMap(arch, n_blocks)
class TokenType(IntEnum):
NORMAL = 1
UNKNOWN = 2
CONTROL = 3
USER_DEFINED = 4
UNUSED = 5
BYTE = 6
#
# implementation
#
class GGMLQuantizationType(IntEnum):
F32 = 0
F16 = 1
Q4_0 = 2
Q4_1 = 3
Q5_0 = 6
Q5_1 = 7
Q8_0 = 8
Q8_1 = 9
Q2_K = 10
Q3_K = 11
Q4_K = 12
Q5_K = 13
Q6_K = 14
Q8_K = 15
class GGUFValueType(IntEnum):
UINT8 = 0
INT8 = 1
UINT16 = 2
INT16 = 3
UINT32 = 4
INT32 = 5
FLOAT32 = 6
BOOL = 7
STRING = 8
ARRAY = 9
UINT64 = 10
INT64 = 11
FLOAT64 = 12
@staticmethod
def get_type(val):
if isinstance(val, str) or isinstance(val, bytes) or isinstance(val, bytearray):
return GGUFValueType.STRING
elif isinstance(val, list):
return GGUFValueType.ARRAY
elif isinstance(val, float):
return GGUFValueType.FLOAT32
elif isinstance(val, bool):
return GGUFValueType.BOOL
elif isinstance(val, int):
return GGUFValueType.INT32
# TODO: need help with 64-bit types in Python
else:
print("Unknown type: "+str(type(val)))
sys.exit()
class GGUFWriter:
fout: BufferedWriter
arch: str
offset_tensor = 0
data_alignment = GGUF_DEFAULT_ALIGNMENT
kv_data = b""
kv_data_count = 0
ti_data = b""
ti_data_count = 0
use_temp_file: bool
temp_file: tempfile.SpooledTemporaryFile[bytes] | None = None
tensors: list[tuple[np.ndarray[Any, Any], int]]
def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True):
self.fout = open(path, "wb")
self.arch = arch
self.add_architecture()
self.use_temp_file = use_temp_file
self.tensors = []
def write_header_to_file(self):
self.fout.write(struct.pack("<I", GGUF_MAGIC))
self.fout.write(struct.pack("<I", GGUF_VERSION))
self.fout.write(struct.pack("<Q", self.ti_data_count))
self.fout.write(struct.pack("<Q", self.kv_data_count))
self.flush()
# print("tensors " + str(self.ti_data_count) + " kv " + str(self.kv_data_count))
def write_kv_data_to_file(self):
self.fout.write(self.kv_data)
self.flush()
def write_ti_data_to_file(self):
self.fout.write(self.ti_data)
self.flush()
def add_key(self, key: str):
self.add_val(key, GGUFValueType.STRING, add_vtype=False)
def add_uint8(self, key: str, val: int):
self.add_key(key)
self.add_val(val, GGUFValueType.UINT8)
def add_int8(self, key: str, val: int):
self.add_key(key)
self.add_val(val, GGUFValueType.INT8)
def add_uint16(self, key: str, val: int):
self.add_key(key)
self.add_val(val, GGUFValueType.UINT16)
def add_int16(self, key: str, val: int):
self.add_key(key)
self.add_val(val, GGUFValueType.INT16)
def add_uint32(self, key: str, val: int):
self.add_key(key)
self.add_val(val, GGUFValueType.UINT32)
def add_int32(self, key: str, val: int):
self.add_key(key)
self.add_val(val, GGUFValueType.INT32)
def add_float32(self, key: str, val: float):
self.add_key(key)
self.add_val(val, GGUFValueType.FLOAT32)
def add_uint64(self, key: str, val: int):
self.add_key(key)
self.add_val(val, GGUFValueType.UINT64)
def add_int64(self, key: str, val: int):
self.add_key(key)
self.add_val(val, GGUFValueType.INT64)
def add_float64(self, key: str, val: float):
self.add_key(key)
self.add_val(val, GGUFValueType.FLOAT64)
def add_bool(self, key: str, val: bool):
self.add_key(key)
self.add_val(val, GGUFValueType.BOOL)
def add_string(self, key: str, val: str):
if len(val) == 0:
return
self.add_key(key)
self.add_val(val, GGUFValueType.STRING)
def add_array(self, key: str, val: Sequence[Any]):
if not isinstance(val, Sequence):
raise ValueError("Value must be a sequence for array type")
self.add_key(key)
self.add_val(val, GGUFValueType.ARRAY)
_simple_value_packing = {
GGUFValueType.UINT8: "<B",
GGUFValueType.INT8: "<b",
GGUFValueType.UINT16: "<H",
GGUFValueType.INT16: "<h",
GGUFValueType.UINT32: "<I",
GGUFValueType.INT32: "<i",
GGUFValueType.FLOAT32: "<f",
GGUFValueType.UINT64: "<Q",
GGUFValueType.INT64: "<q",
GGUFValueType.FLOAT64: "<d",
GGUFValueType.BOOL: "?" ,
}
def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool = True):
if vtype is None:
vtype = GGUFValueType.get_type(val)
if add_vtype:
self.kv_data += struct.pack("<I", vtype)
self.kv_data_count += 1
pack_fmt = self._simple_value_packing.get(vtype)
if pack_fmt is not None:
self.kv_data += struct.pack(pack_fmt, val)
elif vtype == GGUFValueType.STRING:
encoded_val = val.encode("utf8") if isinstance(val, str) else val
self.kv_data += struct.pack("<Q", len(encoded_val))
self.kv_data += encoded_val
elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and len(val) > 0:
ltype = GGUFValueType.get_type(val[0])
if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
raise ValueError("All items in a GGUF array should be of the same type")
self.kv_data += struct.pack("<I", ltype)
self.kv_data += struct.pack("<Q", len(val))
for item in val:
self.add_val(item, add_vtype=False)
else:
raise ValueError("Invalid GGUF metadata value type or value")
@staticmethod
def ggml_pad(x: int, n: int) -> int:
return ((x + n - 1) // n) * n
def add_tensor_info(self, name: str, tensor_shape: Sequence[int], tensor_dtype: np.dtype[np.float16] | np.dtype[np.float32], tensor_nbytes: int, raw_dtype: GGMLQuantizationType | None = None):
assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"
encoded_name = name.encode("utf8")
self.ti_data += struct.pack("<Q", len(encoded_name))
self.ti_data += encoded_name
n_dims = len(tensor_shape)
self.ti_data += struct.pack("<I", n_dims)
for i in range(n_dims):
self.ti_data += struct.pack("<Q", tensor_shape[n_dims - 1 - i])
if raw_dtype is None:
dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
else:
dtype = raw_dtype
self.ti_data += struct.pack("<I", dtype)
self.ti_data += struct.pack("<Q", self.offset_tensor)
self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
self.ti_data_count += 1
def add_tensor(self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None, raw_dtype: GGMLQuantizationType | None = None):
if self.use_temp_file and self.temp_file is None:
fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
fp.seek(0)
self.temp_file = fp
shape: Sequence[int] = raw_shape if raw_shape is not None else tensor.shape
self.add_tensor_info(name, shape, tensor.dtype, tensor.nbytes, raw_dtype = raw_dtype)
pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
if self.temp_file is None:
self.tensors.append((tensor, pad))
return
tensor.tofile(self.temp_file)
if pad != 0:
self.temp_file.write(bytes([0] * pad))
def write_padding(self, fp: BinaryIO, n: int, align: int | None = None):
pad = GGUFWriter.ggml_pad(n, align if align is not None else self.data_alignment) - n
if pad != 0:
fp.write(bytes([0] * pad))
def write_tensor_data(self, tensor: np.ndarray[Any, Any]):
self.write_padding(self.fout, self.fout.tell())
tensor.tofile(self.fout)
self.write_padding(self.fout, tensor.nbytes)
def write_tensors_to_file(self):
self.write_ti_data_to_file()
self.write_padding(self.fout, self.fout.tell())
if self.temp_file is None:
for (currtensor, currpad) in self.tensors:
currtensor.tofile(self.fout)
if currpad != 0:
self.fout.write(bytes([0] * currpad))
return
self.temp_file.seek(0)
shutil.copyfileobj(self.temp_file, self.fout)
self.flush()
self.temp_file.close()
def flush(self):
self.fout.flush()
def close(self):
self.fout.close()
def add_architecture(self):
self.add_string(KEY_GENERAL_ARCHITECTURE, self.arch)
def add_author(self, author: str):
self.add_string(KEY_GENERAL_AUTHOR, author)
def add_tensor_data_layout(self, layout: str):
self.add_string(KEY_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
def add_url(self, url: str):
self.add_string(KEY_GENERAL_URL, url)
def add_description(self, description: str):
self.add_string(KEY_GENERAL_DESCRIPTION, description)
def add_source_url(self, url: str):
self.add_string(KEY_GENERAL_SOURCE_URL, url)
def add_source_hf_repo(self, repo: str):
self.add_string(KEY_GENERAL_SOURCE_HF_REPO, repo)
def add_file_type(self, ftype: int):
self.add_uint32(KEY_GENERAL_FILE_TYPE, ftype)
def add_name(self, name: str):
self.add_string(KEY_GENERAL_NAME, name)
def add_quantization_version(self, quantization_version: GGMLQuantizationType):
self.add_uint32(
KEY_GENERAL_QUANTIZATION_VERSION, quantization_version)
def add_custom_alignment(self, alignment: int):
self.data_alignment = alignment
self.add_uint32(KEY_GENERAL_ALIGNMENT, alignment)
def add_context_length(self, length: int):
self.add_uint32(
KEY_CONTEXT_LENGTH.format(arch=self.arch), length)
def add_embedding_length(self, length: int):
self.add_uint32(
KEY_EMBEDDING_LENGTH.format(arch=self.arch), length)
def add_block_count(self, length: int):
self.add_uint32(
KEY_BLOCK_COUNT.format(arch=self.arch), length)
def add_feed_forward_length(self, length: int):
self.add_uint32(
KEY_FEED_FORWARD_LENGTH.format(arch=self.arch), length)
def add_parallel_residual(self, use: bool):
self.add_bool(
KEY_USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)
def add_head_count(self, count: int):
self.add_uint32(
KEY_ATTENTION_HEAD_COUNT.format(arch=self.arch), count)
def add_head_count_kv(self, count: int):
self.add_uint32(
KEY_ATTENTION_HEAD_COUNT_KV.format(arch=self.arch), count)
def add_max_alibi_bias(self, bias: float):
self.add_float32(
KEY_ATTENTION_MAX_ALIBI_BIAS.format(arch=self.arch), bias)
def add_clamp_kqv(self, value: float):
self.add_float32(
KEY_ATTENTION_CLAMP_KQV.format(arch=self.arch), value)
def add_layer_norm_eps(self, value: float):
self.add_float32(
KEY_ATTENTION_LAYERNORM_EPS.format(arch=self.arch), value)
def add_layer_norm_rms_eps(self, value: float):
self.add_float32(
KEY_ATTENTION_LAYERNORM_RMS_EPS.format(arch=self.arch), value)
def add_rope_dimension_count(self, count: int):
self.add_uint32(
KEY_ROPE_DIMENSION_COUNT.format(arch=self.arch), count)
def add_rope_freq_base(self, value: float):
self.add_float32(KEY_ROPE_FREQ_BASE.format(arch=self.arch), value)
def add_rope_scale_linear(self, value: float):
self.add_float32(KEY_ROPE_SCALE_LINEAR.format(arch=self.arch), value)
def add_tokenizer_model(self, model: str):
self.add_string(KEY_TOKENIZER_MODEL, model)
def add_token_list(self, tokens: Sequence[str] | Sequence[bytes] | Sequence[bytearray]):
self.add_array(KEY_TOKENIZER_LIST, tokens)
def add_token_merges(self, merges: Sequence[str] | Sequence[bytes] | Sequence[bytearray]):
self.add_array(KEY_TOKENIZER_MERGES, merges)
def add_token_types(self, types: Sequence[TokenType] | Sequence[int]):
self.add_array(KEY_TOKENIZER_TOKEN_TYPE, types)
def add_token_scores(self, scores: Sequence[float]):
self.add_array(KEY_TOKENIZER_SCORES, scores)
def add_bos_token_id(self, id: int):
self.add_uint32(KEY_TOKENIZER_BOS_ID, id)
def add_eos_token_id(self, id: int):
self.add_uint32(KEY_TOKENIZER_EOS_ID, id)
def add_unk_token_id(self, id: int):
self.add_uint32(KEY_TOKENIZER_UNK_ID, id)
def add_sep_token_id(self, id: int):
self.add_uint32(KEY_TOKENIZER_SEP_ID, id)
def add_pad_token_id(self, id: int):
self.add_uint32(KEY_TOKENIZER_PAD_ID, id)
class SpecialVocab:
load_merges: bool = False
merges: list[str] = []
special_token_types: tuple[str, ...] = ('bos', 'eos', 'unk', 'sep', 'pad')
special_token_ids: dict[str, int] = {}
def __init__(self, path: Path, load_merges: bool = False, special_token_types: tuple[str, ...] | None = None):
self.special_token_ids = {}
self.load_merges = load_merges
if special_token_types is not None:
self.special_token_types = special_token_types
self.load(path)
def load(self, path: Path):
if not self.try_load_from_tokenizer_json(path):
self.try_load_from_config_json(path)
def try_load_from_tokenizer_json(self, path: Path) -> bool:
tokenizer_file = path / 'tokenizer.json'
if not tokenizer_file.is_file():
return False
with open(tokenizer_file, 'r', encoding = 'utf-8') as f:
tokenizer = json.load(f)
if self.load_merges:
merges = tokenizer.get('model', {}).get('merges')
if isinstance(merges, list) and len(merges) > 0 and isinstance(merges[0], str):
self.merges = merges
tokenizer_config_file = path / 'tokenizer_config.json'
added_tokens = tokenizer.get('added_tokens')
if added_tokens is None or not tokenizer_config_file.is_file():
return True
with open(tokenizer_config_file, 'r', encoding = 'utf-8') as f:
tokenizer_config = json.load(f)
for typ in self.special_token_types:
entry = tokenizer_config.get(f'{typ}_token')
if isinstance(entry, str):
tc_content = entry
elif isinstance(entry, dict):
entry_content = entry.get('content')
if not isinstance(entry_content, str):
continue
tc_content = entry_content
else:
continue
for maybe_token_id in (atok.get('id') for atok in added_tokens if atok.get('content') == tc_content):
if isinstance(maybe_token_id, int) and maybe_token_id >= 0:
self.special_token_ids[typ] = maybe_token_id
break
return True
def try_load_from_config_json(self, path: Path) -> bool:
config_file = path / 'config.json'
if not config_file.is_file():
return False
with open(config_file, 'r', encoding = 'utf-8') as f:
config = json.load(f)
for typ in self.special_token_types:
maybe_token_id = config.get(f'{typ}_token_id')
if isinstance(maybe_token_id, int) and maybe_token_id >= 0:
self.special_token_ids[typ] = maybe_token_id
return True
def add_to_gguf(self, gw: GGUFWriter):
if len(self.merges) > 0:
print(f'gguf: Adding {len(self.merges)} merge(s).')
gw.add_token_merges(self.merges)
for typ, tokid in self.special_token_ids.items():
handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None)
if handler is None:
print(f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping')
continue
print(f'gguf: Setting special token type {typ} to {tokid}')
handler(tokid)
def __repr__(self):
return f'<SpecialVocab with {len(self.merges)} merges and special tokens {self.special_token_ids if self.special_token_ids else "unset"}>'
# Example usage:
if __name__ == "__main__":
# Example usage with a file
gguf_writer = GGUFWriter("example.gguf", "llama")
gguf_writer.add_architecture()
gguf_writer.add_block_count(12)
gguf_writer.add_uint32("answer", 42) # Write a 32-bit integer
gguf_writer.add_float32("answer_in_float", 42.0) # Write a 32-bit float
gguf_writer.add_custom_alignment(64)
tensor1 = np.ones((32,), dtype=np.float32) * 100.0
tensor2 = np.ones((64,), dtype=np.float32) * 101.0
tensor3 = np.ones((96,), dtype=np.float32) * 102.0
gguf_writer.add_tensor("tensor1", tensor1)
gguf_writer.add_tensor("tensor2", tensor2)
gguf_writer.add_tensor("tensor3", tensor3)
gguf_writer.write_header_to_file()
gguf_writer.write_kv_data_to_file()
gguf_writer.write_tensors_to_file()
gguf_writer.close()
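As a rough sketch of how the tensor name mapping defined earlier in `gguf.py` is intended to be used (the Hugging Face weight name below is only an illustration):
```python
import gguf

# Build the name map for a 2-block llama model and translate a
# Hugging Face style weight name into the GGUF naming scheme.
name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, 2)

hf_name = "model.layers.0.self_attn.q_proj.weight"
print(name_map.get_name(hf_name, try_suffixes=(".weight", ".bias")))
# prints: blk.0.attn_q.weight
```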

0
gguf-py/gguf/py.typed Normal file
View file

29
gguf-py/pyproject.toml Normal file
View file

@ -0,0 +1,29 @@
[tool.poetry]
name = "gguf"
version = "0.3.2"
description = "Write ML models in GGUF for GGML"
authors = ["GGML <ggml@ggml.ai>"]
packages = [
{include = "gguf"},
{include = "gguf/py.typed"},
]
readme = "README.md"
homepage = "https://ggml.ai"
repository = "https://github.com/ggerganov/llama.cpp"
keywords = ["ggml", "gguf", "llama.cpp"]
classifiers = [
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
]
[tool.poetry.dependencies]
python = ">=3.8"
numpy = ">=1.17"
[tool.poetry.dev-dependencies]
pytest = "^5.2"
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

View file

@ -0,0 +1,7 @@
import gguf
# TODO: add tests
def test_write_gguf():
pass
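The test above is currently a stub (see the TODO in the README); a minimal sketch of what a first test could look like, assuming pytest and its `tmp_path` fixture:
```python
import numpy as np
import gguf

def test_write_gguf(tmp_path):
    # Write a tiny file and check that it starts with the little-endian GGUF magic.
    path = tmp_path / "test.gguf"
    writer = gguf.GGUFWriter(path, "llama")
    writer.add_block_count(1)
    writer.add_tensor("tensor0", np.ones((8,), dtype=np.float32))
    writer.write_header_to_file()
    writer.write_kv_data_to_file()
    writer.write_tensors_to_file()
    writer.close()
    assert path.read_bytes()[:4] == b"GGUF"
```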

91
grammars/README.md Normal file
View file

@ -0,0 +1,91 @@
# GBNF Guide
GBNF (GGML BNF) is a format for defining [formal grammars](https://en.wikipedia.org/wiki/Formal_grammar) to constrain model outputs in `llama.cpp`. For example, you can use it to force the model to generate valid JSON, or speak only in emojis. GBNF grammars are supported in various ways in `examples/main` and `examples/server`.
## Background
[Backus-Naur Form (BNF)](https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_form) is a notation for describing the syntax of formal languages like programming languages, file formats, and protocols. GBNF is an extension of BNF that primarily adds a few modern regex-like features.
## Basics
In GBNF, we define *production rules* that specify how a *non-terminal* (rule name) can be replaced with sequences of *terminals* (characters, specifically Unicode [code points](https://en.wikipedia.org/wiki/Code_point)) and other non-terminals. The basic format of a production rule is `nonterminal ::= sequence...`.
## Example
Before going deeper, let's look at some of the features demonstrated in `grammars/chess.gbnf`, a small chess notation grammar:
```
# `root` specifies the pattern for the overall output
root ::= (
# it must start with the characters "1. " followed by a sequence
# of characters that match the `move` rule, followed by a space, followed
# by another move, and then a newline
"1. " move " " move "\n"
# it's followed by one or more subsequent moves, numbered with one or two digits
([1-9] [0-9]? ". " move " " move "\n")+
)
# `move` is an abstract representation, which can be a pawn, nonpawn, or castle.
# The `[+#]?` denotes the possibility of checking or mate signs after moves
move ::= (pawn | nonpawn | castle) [+#]?
pawn ::= ...
nonpawn ::= ...
castle ::= ...
```
## Non-Terminals and Terminals
Non-terminal symbols (rule names) stand for a pattern of terminals and other non-terminals. They are required to be a dashed lowercase word, like `move`, `castle`, or `check-mate`.
Terminals are actual characters ([code points](https://en.wikipedia.org/wiki/Code_point)). They can be specified as a sequence like `"1"` or `"O-O"` or as ranges like `[1-9]` or `[NBKQR]`.
## Characters and character ranges
Terminals support the full range of Unicode. Unicode characters can be specified directly in the grammar, for example `hiragana ::= [ぁ-ゟ]`, or with escapes: 8-bit (`\xXX`), 16-bit (`\uXXXX`) or 32-bit (`\UXXXXXXXX`).
Character ranges can be negated with `^`:
```
single-line ::= [^\n]+ "\n"
```
## Sequences and Alternatives
The order of symbols in a sequence matters. For example, in `"1. " move " " move "\n"`, the `"1. "` must come before the first `move`, etc.
Alternatives, denoted by `|`, give different sequences that are acceptable. For example, in `move ::= pawn | nonpawn | castle`, `move` can be a `pawn` move, a `nonpawn` move, or a `castle`.
Parentheses `()` can be used to group sequences, which allows for embedding alternatives in a larger rule or applying repetition and optional symbols (below) to a sequence.
## Repetition and Optional Symbols
- `*` after a symbol or sequence means that it can be repeated zero or more times.
- `+` denotes that the symbol or sequence should appear one or more times.
- `?` makes the preceding symbol or sequence optional.
## Comments and newlines
Comments can be specified with `#`:
```
# defines optional whitespace
ws ::= [ \t\n]+
```
Newlines are allowed between rules and between symbols or sequences nested inside parentheses. Additionally, a newline after an alternate marker `|` will continue the current rule, even outside of parentheses.
## The root rule
In a full grammar, the `root` rule always defines the starting point of the grammar. In other words, it specifies what the entire output must match.
```
# a grammar for lists
root ::= ("- " item)+
item ::= [^\n]+ "\n"
```
## Next steps
This guide provides a brief overview. Check out the GBNF files in this directory (`grammars/`) for examples of full grammars. You can try them out with:
```
./main -m <model> --grammar-file grammars/some-grammar.gbnf -p 'Some prompt'
```

42
grammars/c.gbnf Normal file
View file

@ -0,0 +1,42 @@
root ::= (declaration)*
declaration ::= dataType identifier "(" parameter? ")" "{" statement* "}"
dataType ::= "int" ws | "float" ws | "char" ws
identifier ::= [a-zA-Z_] [a-zA-Z_0-9]*
parameter ::= dataType identifier
statement ::=
( dataType identifier ws "=" ws expression ";" ) |
( identifier ws "=" ws expression ";" ) |
( identifier ws "(" argList? ")" ";" ) |
( "return" ws expression ";" ) |
( "while" "(" condition ")" "{" statement* "}" ) |
( "for" "(" forInit ";" ws condition ";" ws forUpdate ")" "{" statement* "}" ) |
( "if" "(" condition ")" "{" statement* "}" ("else" "{" statement* "}")? ) |
( singleLineComment ) |
( multiLineComment )
forInit ::= dataType identifier ws "=" ws expression | identifier ws "=" ws expression
forUpdate ::= identifier ws "=" ws expression
condition ::= expression relationOperator expression
relationOperator ::= ("<=" | "<" | "==" | "!=" | ">=" | ">")
expression ::= term (("+" | "-") term)*
term ::= factor(("*" | "/") factor)*
factor ::= identifier | number | unaryTerm | funcCall | parenExpression
unaryTerm ::= "-" factor
funcCall ::= identifier "(" argList? ")"
parenExpression ::= "(" ws expression ws ")"
argList ::= expression ("," ws expression)*
number ::= [0-9]+
singleLineComment ::= "//" [^\n]* "\n"
multiLineComment ::= "/*" ( [^*] | ("*" [^/]) )* "*/"
ws ::= ([ \t\n]+)

Some files were not shown because too many files have changed in this diff.