added clinfo binary, cleanup unused stuff

This commit is contained in:
Concedo 2023-10-31 22:25:25 +08:00
parent f3690ba6d2
commit 43a5143450
51 changed files with 19 additions and 3397 deletions

View file

@ -1,22 +0,0 @@
node('x86_runner1'){ // Running on x86 runner containing latest vector qemu, latest vector gcc and all the necessary libraries
    stage('Cleanup'){
        cleanWs() // Cleaning previous CI build in workspace
    }
    stage('checkout repo'){
        retry(5){ // Retry if the cloning fails due to some reason
            checkout scm // Clone the repo on Runner
        }
    }
    stage('Compiling llama.cpp'){
        sh'''#!/bin/bash
        make RISCV=1 RISCV_CROSS_COMPILE=1 # Compiling llama for RISC-V
        '''
    }
    stage('Running llama.cpp'){
        sh'''#!/bin/bash
        module load gnu-bin2/0.1 # loading latest versions of vector qemu and vector gcc
        qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./main -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
        cat llama_log.txt # Printing results
        '''
    }
}

View file

@ -1,33 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
FROM ${BASE_CUDA_DEV_CONTAINER} as build
# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all
RUN apt-get update && \
apt-get install -y build-essential python3 python3-pip git
COPY requirements.txt requirements.txt
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
WORKDIR /app
COPY . .
# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS
ENV LLAMA_CUBLAS=1
RUN make
ENTRYPOINT ["/app/.devops/tools.sh"]

View file

@ -1,44 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6
# Target the ROCm build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
FROM ${BASE_ROCM_DEV_CONTAINER} as build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH=\
gfx803 \
gfx900 \
gfx906 \
gfx908 \
gfx90a \
gfx1010 \
gfx1030 \
gfx1100 \
gfx1101 \
gfx1102
COPY requirements.txt requirements.txt
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
WORKDIR /app
COPY . .
# Set ROCm GPU architecture targets
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV LLAMA_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++
RUN make
ENTRYPOINT ["/app/.devops/tools.sh"]

View file

@ -1,84 +0,0 @@
# SRPM for building from source and packaging an RPM for RPM-based distros.
# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
# Built and maintained by John Boero - boeroboy@gmail.com
# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
# Notes for llama.cpp:
# 1. Tags are currently based on hash - which will not sort asciibetically.
# We need to declare standard versioning if people want to sort latest releases.
# 2. Builds for CUDA/OpenCL support are separate, with different dependencies.
# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
# Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
# It is up to the user to install the correct vendor-specific support.
Name: llama.cpp-clblast
Version: %( date "+%%Y%%m%%d" )
Release: 1%{?dist}
Summary: OpenCL Inference of LLaMA model in C/C++
License: MIT
Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
BuildRequires: coreutils make gcc-c++ git mesa-libOpenCL-devel clblast-devel
Requires: clblast
URL: https://github.com/ggerganov/llama.cpp
%define debug_package %{nil}
%define source_date_epoch_from_changelog 0
%description
OpenCL inference for Meta's Llama 2 models using default options.
%prep
%setup -n llama.cpp-master
%build
make -j LLAMA_CLBLAST=1
%install
mkdir -p %{buildroot}%{_bindir}/
cp -p main %{buildroot}%{_bindir}/llamaclblast
cp -p server %{buildroot}%{_bindir}/llamaclblastserver
cp -p simple %{buildroot}%{_bindir}/llamaclblastsimple
mkdir -p %{buildroot}/usr/lib/systemd/system
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamaclblast.service
[Unit]
Description=Llama.cpp server (OpenCL/CLBlast build).
After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
[Service]
Type=simple
EnvironmentFile=/etc/sysconfig/llama
ExecStart=/usr/bin/llamaclblastserver $LLAMA_ARGS
ExecReload=/bin/kill -s HUP $MAINPID
Restart=never
[Install]
WantedBy=default.target
EOF
mkdir -p %{buildroot}/etc/sysconfig
%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
EOF
%clean
rm -rf %{buildroot}
rm -rf %{_builddir}/*
%files
%{_bindir}/llamaclblast
%{_bindir}/llamaclblastserver
%{_bindir}/llamaclblastsimple
/usr/lib/systemd/system/llamaclblast.service
%config /etc/sysconfig/llama
%pre
%post
%preun
%postun
%changelog

View file

@ -1,83 +0,0 @@
# SRPM for building from source and packaging an RPM for RPM-based distros.
# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
# Built and maintained by John Boero - boeroboy@gmail.com
# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
# Notes for llama.cpp:
# 1. Tags are currently based on hash - which will not sort asciibetically.
# We need to declare standard versioning if people want to sort latest releases.
# 2. Builds for CUDA/OpenCL support are separate, with different dependencies.
# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
# Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
# It is up to the user to install the correct vendor-specific support.
Name: llama.cpp-cublas
Version: %( date "+%%Y%%m%%d" )
Release: 1%{?dist}
Summary: CUDA Inference of LLaMA model in C/C++
License: MIT
Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
BuildRequires: coreutils make gcc-c++ git cuda-toolkit
Requires: cuda-toolkit
URL: https://github.com/ggerganov/llama.cpp
%define debug_package %{nil}
%define source_date_epoch_from_changelog 0
%description
CUDA inference for Meta's Llama 2 models using default options.
%prep
%setup -n llama.cpp-master
%build
make -j LLAMA_CUBLAS=1
%install
mkdir -p %{buildroot}%{_bindir}/
cp -p main %{buildroot}%{_bindir}/llamacppcublas
cp -p server %{buildroot}%{_bindir}/llamacppcublasserver
cp -p simple %{buildroot}%{_bindir}/llamacppcublassimple
mkdir -p %{buildroot}/usr/lib/systemd/system
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacublas.service
[Unit]
Description=Llama.cpp server (CUDA/cuBLAS build).
After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
[Service]
Type=simple
EnvironmentFile=/etc/sysconfig/llama
ExecStart=/usr/bin/llamacppcublasserver $LLAMA_ARGS
ExecReload=/bin/kill -s HUP $MAINPID
Restart=never
[Install]
WantedBy=default.target
EOF
mkdir -p %{buildroot}/etc/sysconfig
%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
EOF
%clean
rm -rf %{buildroot}
rm -rf %{_builddir}/*
%files
%{_bindir}/llamacppcublas
%{_bindir}/llamacppcublasserver
%{_bindir}/llamacppcublassimple
/usr/lib/systemd/system/llamacublas.service
%config /etc/sysconfig/llama
%pre
%post
%preun
%postun
%changelog

View file

@ -1,85 +0,0 @@
# SRPM for building from source and packaging an RPM for RPM-based distros.
# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
# Built and maintained by John Boero - boeroboy@gmail.com
# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
# Notes for llama.cpp:
# 1. Tags are currently based on hash - which will not sort asciibetically.
# We need to declare standard versioning if people want to sort latest releases.
# In the meantime, YYYYMMDD format will be used.
# 2. Builds for CUDA/OpenCL support are separate, with different dependencies.
# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
# Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
# It is up to the user to install the correct vendor-specific support.
Name: llama.cpp
Version: %( date "+%%Y%%m%%d" )
Release: 1%{?dist}
Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
License: MIT
Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
BuildRequires: coreutils make gcc-c++ git libstdc++-devel
Requires: libstdc++
URL: https://github.com/ggerganov/llama.cpp
%define debug_package %{nil}
%define source_date_epoch_from_changelog 0
%description
CPU inference for Meta's Llama 2 models using default options.
Models are not included in this package and must be downloaded separately.
%prep
%setup -n llama.cpp-master
%build
make -j
%install
mkdir -p %{buildroot}%{_bindir}/
cp -p main %{buildroot}%{_bindir}/llama
cp -p server %{buildroot}%{_bindir}/llamaserver
cp -p simple %{buildroot}%{_bindir}/llamasimple
mkdir -p %{buildroot}/usr/lib/systemd/system
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llama.service
[Unit]
Description=Llama.cpp server, CPU only (no GPU support in this build).
After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
[Service]
Type=simple
EnvironmentFile=/etc/sysconfig/llama
ExecStart=/usr/bin/llamaserver $LLAMA_ARGS
ExecReload=/bin/kill -s HUP $MAINPID
Restart=never
[Install]
WantedBy=default.target
EOF
mkdir -p %{buildroot}/etc/sysconfig
%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
EOF
%clean
rm -rf %{buildroot}
rm -rf %{_builddir}/*
%files
%{_bindir}/llama
%{_bindir}/llamaserver
%{_bindir}/llamasimple
/usr/lib/systemd/system/llama.service
%config /etc/sysconfig/llama
%pre
%post
%preun
%postun
%changelog

View file

@ -1,32 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the CUDA runtime image
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
FROM ${BASE_CUDA_DEV_CONTAINER} as build
# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all
RUN apt-get update && \
apt-get install -y build-essential git
WORKDIR /app
COPY . .
# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS
ENV LLAMA_CUBLAS=1
RUN make
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
COPY --from=build /app/main /main
ENTRYPOINT [ "/main" ]

View file

@ -1,44 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6
# Target the ROCm build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
FROM ${BASE_ROCM_DEV_CONTAINER} as build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH=\
gfx803 \
gfx900 \
gfx906 \
gfx908 \
gfx90a \
gfx1010 \
gfx1030 \
gfx1100 \
gfx1101 \
gfx1102
COPY requirements.txt requirements.txt
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
WORKDIR /app
COPY . .
# Set ROCm GPU architecture targets
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV LLAMA_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++
RUN make
ENTRYPOINT [ "/app/main" ]

View file

@ -1,2 +0,0 @@
[flake8]
max-line-length = 125

View file

@ -1,15 +0,0 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
exclude: prompts/.*.txt
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v3.2.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-yaml
      - id: check-added-large-files
  - repo: https://github.com/PyCQA/flake8
    rev: 6.0.0
    hooks:
      - id: flake8

19
CLINFO_LICENSE Normal file
View file

@ -0,0 +1,19 @@
Windows binaries obtained from the clinfo repo fork here:
https://github.com/ahoylabs/clinfo/releases/tag/master-d2baa06
Source available here:
https://github.com/Oblomov/clinfo
see below LICENSE file for details on clinfo license
=======
clinfo by Giuseppe Bilotta
To the extent possible under law, the person who associated CC0 with
clinfo has waived all copyright and related or neighboring rights
to clinfo.
You should have received a copy of the CC0 legalcode along with this
work. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>

View file

@ -1,64 +0,0 @@
// swift-tools-version:5.5

import PackageDescription

#if arch(arm) || arch(arm64)
let platforms: [SupportedPlatform]? = [
    .macOS(.v12),
    .iOS(.v14),
    .watchOS(.v4),
    .tvOS(.v14)
]
let exclude: [String] = []
let resources: [Resource] = [
    .process("ggml-metal.metal")
]
let additionalSources: [String] = ["ggml-metal.m"]
let additionalSettings: [CSetting] = [
    .unsafeFlags(["-fno-objc-arc"]),
    .define("GGML_USE_METAL")
]
#else
let platforms: [SupportedPlatform]? = nil
let exclude: [String] = ["ggml-metal.metal"]
let resources: [Resource] = []
let additionalSources: [String] = []
let additionalSettings: [CSetting] = []
#endif

let package = Package(
    name: "llama",
    platforms: platforms,
    products: [
        .library(name: "llama", targets: ["llama"]),
    ],
    targets: [
        .target(
            name: "llama",
            path: ".",
            exclude: exclude,
            sources: [
                "ggml.c",
                "llama.cpp",
                "ggml-alloc.c",
                "ggml-backend.c",
                "ggml-quants.c",
            ] + additionalSources,
            resources: resources,
            publicHeadersPath: "spm-headers",
            cSettings: [
                .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
                .define("GGML_USE_ACCELERATE")
                // NOTE: NEW_LAPACK will require iOS version 16.4+
                // We should consider adding this in the future when we drop support for iOS 14
                // (ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
                // .define("ACCELERATE_NEW_LAPACK"),
                // .define("ACCELERATE_LAPACK_ILP64")
            ] + additionalSettings,
            linkerSettings: [
                .linkedFramework("Accelerate")
            ]
        )
    ],
    cxxLanguageStandard: .cxx11
)

View file

@ -1,25 +0,0 @@
# CI
In addition to [GitHub Actions](https://github.com/ggerganov/llama.cpp/actions), `llama.cpp` uses a custom CI framework:
https://github.com/ggml-org/ci
It monitors the `master` branch for new commits and runs the
[ci/run.sh](https://github.com/ggerganov/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us
to execute heavier workloads compared to just using GitHub Actions. Over time, the cloud instances will be scaled
to cover various hardware architectures, including GPU and Apple Silicon instances.
Collaborators can optionally trigger the CI run by adding the `ggml-ci` keyword to their commit message.
Only the branches of this repo are monitored for this keyword.
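For example, a hypothetical commit on a monitored branch could be created like this (the message text is only an illustration; the trigger is simply the presence of the `ggml-ci` keyword):
```bash
# any commit message containing the "ggml-ci" keyword on a monitored
# branch is picked up by the custom CI and run through ci/run.sh
git commit -m "ggml : fix RISC-V build (ggml-ci)"
```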
It is good practice, before publishing changes, to execute the full CI locally on your machine:
```bash
mkdir tmp
# CPU-only build
bash ./ci/run.sh ./tmp/results ./tmp/mnt
# with CUDA support
GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
```

514
ci/run.sh
View file

@ -1,514 +0,0 @@
#!/bin/bash
#
# sample usage:
#
# mkdir tmp
#
# # CPU-only build
# bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# # with CUDA support
# GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
if [ -z "$2" ]; then
echo "usage: $0 <output-dir> <mnt-dir>"
exit 1
fi
mkdir -p "$1"
mkdir -p "$2"
OUT=$(realpath "$1")
MNT=$(realpath "$2")
rm -v $OUT/*.log
rm -v $OUT/*.exit
rm -v $OUT/*.md
sd=`dirname $0`
cd $sd/../
SRC=`pwd`
## helpers
# download a file if it does not exist or if it is outdated
function gg_wget {
local out=$1
local url=$2
local cwd=`pwd`
mkdir -p $out
cd $out
# should not re-download if file is the same
wget -nv -N $url
cd $cwd
}
function gg_printf {
printf -- "$@" >> $OUT/README.md
}
function gg_run {
ci=$1
set -o pipefail
set -x
gg_run_$ci | tee $OUT/$ci.log
cur=$?
echo "$cur" > $OUT/$ci.exit
set +x
set +o pipefail
gg_sum_$ci
ret=$((ret | cur))
}
## ci
# ctest_debug
function gg_run_ctest_debug {
cd ${SRC}
rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug
set -e
(time cmake -DCMAKE_BUILD_TYPE=Debug .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
(time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
set +e
}
function gg_sum_ctest_debug {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'Runs ctest in debug mode\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '```\n'
gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
gg_printf '```\n'
gg_printf '\n'
}
# ctest_release
function gg_run_ctest_release {
cd ${SRC}
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
if [ -z ${GG_BUILD_LOW_PERF} ]; then
(time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log
else
(time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
fi
set +e
}
function gg_sum_ctest_release {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'Runs ctest in release mode\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '```\n'
gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
gg_printf '```\n'
}
# open_llama_3b_v2
function gg_run_open_llama_3b_v2 {
cd ${SRC}
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json
gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
path_models="../models-mnt/open-llama/3B-v2"
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert.py ${path_models}
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
model_q4_0="${path_models}/ggml-model-q4_0.gguf"
model_q4_1="${path_models}/ggml-model-q4_1.gguf"
model_q5_0="${path_models}/ggml-model-q5_0.gguf"
model_q5_1="${path_models}/ggml-model-q5_1.gguf"
model_q2_k="${path_models}/ggml-model-q2_k.gguf"
model_q3_k="${path_models}/ggml-model-q3_k.gguf"
model_q4_k="${path_models}/ggml-model-q4_k.gguf"
model_q5_k="${path_models}/ggml-model-q5_k.gguf"
model_q6_k="${path_models}/ggml-model-q6_k.gguf"
wiki_test_60="${path_wiki}/wiki.test-60.raw"
./bin/quantize ${model_f16} ${model_q8_0} q8_0
./bin/quantize ${model_f16} ${model_q4_0} q4_0
./bin/quantize ${model_f16} ${model_q4_1} q4_1
./bin/quantize ${model_f16} ${model_q5_0} q5_0
./bin/quantize ${model_f16} ${model_q5_1} q5_1
./bin/quantize ${model_f16} ${model_q2_k} q2_k
./bin/quantize ${model_f16} ${model_q3_k} q3_k
./bin/quantize ${model_f16} ${model_q4_k} q4_k
./bin/quantize ${model_f16} ${model_q5_k} q5_k
./bin/quantize ${model_f16} ${model_q6_k} q6_k
(time ./bin/main --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl {
qnt="$1"
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
return 20
fi
printf ' - %s @ %s OK\n' "$qnt" "$ppl"
return 0
}
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
# lora
function compare_ppl {
qnt="$1"
ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
printf ' - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl1" "$ppl1" "$ppl2"
return 20
fi
printf ' - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
return 0
}
path_lora="../models-mnt/open-llama/3B-v2/lora"
path_shakespeare="../models-mnt/shakespeare"
shakespeare="${path_shakespeare}/shakespeare.txt"
lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_config.json
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_model.bin
gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/shakespeare.txt
python3 ../convert-lora-to-ggml.py ${path_lora}
# f16
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
# q8_0
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
# q8_0 + f16 lora-base
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
set +e
}
function gg_sum_open_llama_3b_v2 {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'OpenLLaMA 3B-v2:\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
}
# open_llama_7b_v2
# requires: GG_BUILD_CUDA
function gg_run_open_llama_7b_v2 {
cd ${SRC}
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/config.json
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/tokenizer.model
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/tokenizer_config.json
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/special_tokens_map.json
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/pytorch_model.bin.index.json
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00001-of-00002.bin
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/generation_config.json
gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
path_models="../models-mnt/open-llama/7B-v2"
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_CUBLAS=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert.py ${path_models}
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
model_q4_0="${path_models}/ggml-model-q4_0.gguf"
model_q4_1="${path_models}/ggml-model-q4_1.gguf"
model_q5_0="${path_models}/ggml-model-q5_0.gguf"
model_q5_1="${path_models}/ggml-model-q5_1.gguf"
model_q2_k="${path_models}/ggml-model-q2_k.gguf"
model_q3_k="${path_models}/ggml-model-q3_k.gguf"
model_q4_k="${path_models}/ggml-model-q4_k.gguf"
model_q5_k="${path_models}/ggml-model-q5_k.gguf"
model_q6_k="${path_models}/ggml-model-q6_k.gguf"
wiki_test="${path_wiki}/wiki.test.raw"
./bin/quantize ${model_f16} ${model_q8_0} q8_0
./bin/quantize ${model_f16} ${model_q4_0} q4_0
./bin/quantize ${model_f16} ${model_q4_1} q4_1
./bin/quantize ${model_f16} ${model_q5_0} q5_0
./bin/quantize ${model_f16} ${model_q5_1} q5_1
./bin/quantize ${model_f16} ${model_q2_k} q2_k
./bin/quantize ${model_f16} ${model_q3_k} q3_k
./bin/quantize ${model_f16} ${model_q4_k} q4_k
./bin/quantize ${model_f16} ${model_q5_k} q5_k
./bin/quantize ${model_f16} ${model_q6_k} q6_k
(time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl {
qnt="$1"
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
return 20
fi
printf ' - %s @ %s OK\n' "$qnt" "$ppl"
return 0
}
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
# lora
function compare_ppl {
qnt="$1"
ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
printf ' - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl1" "$ppl1" "$ppl2"
return 20
fi
printf ' - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
return 0
}
path_lora="../models-mnt/open-llama/7B-v2/lora"
path_shakespeare="../models-mnt/shakespeare"
shakespeare="${path_shakespeare}/shakespeare.txt"
lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_config.json
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_model.bin
gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/shakespeare.txt
python3 ../convert-lora-to-ggml.py ${path_lora}
# f16
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
# currently not supported by the CUDA backend
# q8_0
#(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
#(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
#compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
# q8_0 + f16 lora-base
#(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
#compare_ppl "q8_0 / f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
set +e
}
function gg_sum_open_llama_7b_v2 {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'OpenLLaMA 7B-v2:\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
#gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
#gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
#gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
}
## main
if [ -z ${GG_BUILD_LOW_PERF} ]; then
rm -rf ${SRC}/models-mnt
mnt_models=${MNT}/models
mkdir -p ${mnt_models}
ln -sfn ${mnt_models} ${SRC}/models-mnt
python3 -m pip install -r ${SRC}/requirements.txt
python3 -m pip install --editable gguf-py
fi
ret=0
test $ret -eq 0 && gg_run ctest_debug
test $ret -eq 0 && gg_run ctest_release
if [ -z ${GG_BUILD_LOW_PERF} ]; then
if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
if [ -z ${GG_BUILD_CUDA} ]; then
test $ret -eq 0 && gg_run open_llama_3b_v2
else
test $ret -eq 0 && gg_run open_llama_7b_v2
fi
fi
fi
exit $ret

BIN
clinfo_win.exe Normal file

Binary file not shown.

View file

@ -1,14 +0,0 @@
comment: off

coverage:
  status:
    project:
      default:
        target: auto
        threshold: 0
        base: auto
    patch:
      default:
        target: auto
        threshold: 0
        base: auto

View file

@ -1,40 +0,0 @@
# Token generation performance troubleshooting
## Verifying that the model is running on the GPU with cuBLAS
Make sure you compiled llama with the correct env variables according to [this guide](../README.md#cublas), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
```shell
./main -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
```
When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
```shell
llama_model_load_internal: [cublas] offloading 60 layers to GPU
llama_model_load_internal: [cublas] offloading output layer to GPU
llama_model_load_internal: [cublas] total VRAM used: 17223 MB
... rest of inference
```
If you see these lines, then the GPU is being used.
## Verifying that the CPU is not oversaturated
llama accepts a `-t N` (or `--threads N`) parameter. It's extremely important that this parameter is not too large. If your token generation is extremely slow, try setting this number to 1. If this significantly improves your token generation speed, then your CPU is being oversaturated and you need to explicitly set this parameter to the number of physical CPU cores on your machine (even if you utilize a GPU). If in doubt, start with 1 and double the amount until you hit a performance bottleneck, then scale the number down.
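A rough sketch of that procedure is shown below; the model path and thread counts are placeholders, and the exact wording of the timing line printed by `main` may differ between versions:
```shell
# double the thread count until the reported eval speed stops improving,
# then settle on the last value that helped
for t in 1 2 4 8; do
    echo "threads: $t"
    ./main -m "path/to/model.gguf" -t $t -n 64 -p "Hello" 2>&1 | grep "eval time"
done
```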
# Example of runtime flags effect on inference speed benchmark
These runs were tested on the following machine:
GPU: A6000 (48GB VRAM)
CPU: 7 physical cores
RAM: 32GB
Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.q4_0.gguf` (30B parameters, 4bit quantization, GGML)
Run command: `./main -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
Result:
| command | tokens/second (higher is better) |
| - | - |
| -ngl 2000000 | N/A (less than 0.1) |
| -t 7 | 1.7 |
| -t 1 -ngl 2000000 | 5.5 |
| -t 7 -ngl 2000000 | 8.7 |
| -t 4 -ngl 2000000 | 9.1 |

View file

@ -1,91 +0,0 @@
# GBNF Guide
GBNF (GGML BNF) is a format for defining [formal grammars](https://en.wikipedia.org/wiki/Formal_grammar) to constrain model outputs in `llama.cpp`. For example, you can use it to force the model to generate valid JSON, or speak only in emojis. GBNF grammars are supported in various ways in `examples/main` and `examples/server`.
## Background
[Backus-Naur Form (BNF)](https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_form) is a notation for describing the syntax of formal languages like programming languages, file formats, and protocols. GBNF is an extension of BNF that primarily adds a few modern regex-like features.
## Basics
In GBNF, we define *production rules* that specify how a *non-terminal* (rule name) can be replaced with sequences of *terminals* (characters, specifically Unicode [code points](https://en.wikipedia.org/wiki/Code_point)) and other non-terminals. The basic format of a production rule is `nonterminal ::= sequence...`.
## Example
Before going deeper, let's look at some of the features demonstrated in `grammars/chess.gbnf`, a small chess notation grammar:
```
# `root` specifies the pattern for the overall output
root ::= (
# it must start with the characters "1. " followed by a sequence
# of characters that match the `move` rule, followed by a space, followed
# by another move, and then a newline
"1. " move " " move "\n"
# it's followed by one or more subsequent moves, numbered with one or two digits
([1-9] [0-9]? ". " move " " move "\n")+
)
# `move` is an abstract representation, which can be a pawn, nonpawn, or castle.
# The `[+#]?` denotes the possibility of checking or mate signs after moves
move ::= (pawn | nonpawn | castle) [+#]?
pawn ::= ...
nonpawn ::= ...
castle ::= ...
```
## Non-Terminals and Terminals
Non-terminal symbols (rule names) stand for a pattern of terminals and other non-terminals. They are required to be a dashed lowercase word, like `move`, `castle`, or `check-mate`.
Terminals are actual characters ([code points](https://en.wikipedia.org/wiki/Code_point)). They can be specified as a sequence like `"1"` or `"O-O"` or as ranges like `[1-9]` or `[NBKQR]`.
## Characters and character ranges
Terminals support the full range of Unicode. Unicode characters can be specified directly in the grammar, for example `hiragana ::= [ぁ-ゟ]`, or with escapes: 8-bit (`\xXX`), 16-bit (`\uXXXX`) or 32-bit (`\UXXXXXXXX`).
Character ranges can be negated with `^`:
```
single-line ::= [^\n]+ "\n"
```
## Sequences and Alternatives
The order of symbols in a sequence matters. For example, in `"1. " move " " move "\n"`, the `"1. "` must come before the first `move`, etc.
Alternatives, denoted by `|`, give different sequences that are acceptable. For example, in `move ::= pawn | nonpawn | castle`, `move` can be a `pawn` move, a `nonpawn` move, or a `castle`.
Parentheses `()` can be used to group sequences, which allows for embedding alternatives in a larger rule or applying repetition and optional symbols (below) to a sequence.
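As a small sketch of sequences, grouping and alternatives (the model path is a placeholder, and the inline `--grammar` flag is assumed to be available in addition to `--grammar-file`), a rule that forces the output to be `yes` or `no` followed by a newline can be tried directly:
```
# the parentheses group the two alternatives, so the trailing "\n"
# applies to whichever one the model picks
./main -m <model> -p 'Is the sky blue? Answer: ' --grammar 'root ::= ("yes" | "no") "\n"'
```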
## Repetition and Optional Symbols
- `*` after a symbol or sequence means that it can be repeated zero or more times.
- `+` denotes that the symbol or sequence should appear one or more times.
- `?` makes the preceding symbol or sequence optional (all three operators are combined in the sketch after this list).
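A minimal sketch combining all three operators (the grammar file name and model path are placeholders):
```
# one or more list items; each item is a word, zero or more further words,
# an optional exclamation mark, and a newline
cat > /tmp/list-demo.gbnf <<'EOF'
root ::= item+
item ::= "- " word (" " word)* "!"? "\n"
word ::= [a-z]+
EOF
./main -m <model> --grammar-file /tmp/list-demo.gbnf -p 'List some fruits: '
```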
## Comments and newlines
Comments can be specified with `#`:
```
# defines optional whitespace
ws ::= [ \t\n]+
```
Newlines are allowed between rules and between symbols or sequences nested inside parentheses. Additionally, a newline after an alternate marker `|` will continue the current rule, even outside of parentheses.
## The root rule
In a full grammar, the `root` rule always defines the starting point of the grammar. In other words, it specifies what the entire output must match.
```
# a grammar for lists
root ::= ("- " item)+
item ::= [^\n]+ "\n"
```
## Next steps
This guide provides a brief overview. Check out the GBNF files in this directory (`grammars/`) for examples of full grammars. You can try them out with:
```
./main -m <model> --grammar-file grammars/some-grammar.gbnf -p 'Some prompt'
```

View file

@ -1,6 +0,0 @@
root ::= (expr "=" ws term "\n")+
expr ::= term ([-+*/] term)*
term ::= ident | num | "(" ws expr ")" ws
ident ::= [a-z] [a-z0-9_]* ws
num ::= [0-9]+ ws
ws ::= [ \t\n]*

View file

@ -1,42 +0,0 @@
root ::= (declaration)*
declaration ::= dataType identifier "(" parameter? ")" "{" statement* "}"
dataType ::= "int" ws | "float" ws | "char" ws
identifier ::= [a-zA-Z_] [a-zA-Z_0-9]*
parameter ::= dataType identifier
statement ::=
( dataType identifier ws "=" ws expression ";" ) |
( identifier ws "=" ws expression ";" ) |
( identifier ws "(" argList? ")" ";" ) |
( "return" ws expression ";" ) |
( "while" "(" condition ")" "{" statement* "}" ) |
( "for" "(" forInit ";" ws condition ";" ws forUpdate ")" "{" statement* "}" ) |
( "if" "(" condition ")" "{" statement* "}" ("else" "{" statement* "}")? ) |
( singleLineComment ) |
( multiLineComment )
forInit ::= dataType identifier ws "=" ws expression | identifier ws "=" ws expression
forUpdate ::= identifier ws "=" ws expression
condition ::= expression relationOperator expression
relationOperator ::= ("<=" | "<" | "==" | "!=" | ">=" | ">")
expression ::= term (("+" | "-") term)*
term ::= factor(("*" | "/") factor)*
factor ::= identifier | number | unaryTerm | funcCall | parenExpression
unaryTerm ::= "-" factor
funcCall ::= identifier "(" argList? ")"
parenExpression ::= "(" ws expression ws ")"
argList ::= expression ("," ws expression)*
number ::= [0-9]+
singleLineComment ::= "//" [^\n]* "\n"
multiLineComment ::= "/*" ( [^*] | ("*" [^/]) )* "*/"
ws ::= ([ \t\n]+)

View file

@ -1,13 +0,0 @@
# Specifies chess moves as a list in algebraic notation, using PGN conventions
# Force first move to "1. ", then any 1-2 digit number after, relying on model to follow the pattern
root ::= "1. " move " " move "\n" ([1-9] [0-9]? ". " move " " move "\n")+
move ::= (pawn | nonpawn | castle) [+#]?
# piece type, optional file/rank, optional capture, dest file & rank
nonpawn ::= [NBKQR] [a-h]? [1-8]? "x"? [a-h] [1-8]
# optional file & capture, dest file & rank, optional promotion
pawn ::= ([a-h] "x")? [a-h] [1-8] ("=" [NBKQR])?
castle ::= "O-O" "-O"?

View file

@ -1,7 +0,0 @@
# A probably incorrect grammar for Japanese
root ::= jp-char+ ([ \t\n] jp-char+)*
jp-char ::= hiragana | katakana | punctuation | cjk
hiragana ::= [ぁ-ゟ]
katakana ::= [ァ-ヿ]
punctuation ::= [、-〾]
cjk ::= [一-鿿]

View file

@ -1,25 +0,0 @@
root ::= object
value ::= object | array | string | number | ("true" | "false" | "null") ws
object ::=
"{" ws (
string ":" ws value
("," ws string ":" ws value)*
)? "}" ws
array ::=
"[" ws (
value
("," ws value)*
)? "]" ws
string ::=
"\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
)* "\"" ws
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
# Optional space: by convention, applied in this grammar after literal chars when allowed
ws ::= ([ \t\n] ws)?

View file

@ -1,34 +0,0 @@
# This is the same as json.gbnf but we restrict whitespaces at the end of the root array
# Useful for generating JSON arrays
root ::= arr
value ::= object | array | string | number | ("true" | "false" | "null") ws
arr ::=
"[\n" ws (
value
(",\n" ws value)*
)? "]"
object ::=
"{" ws (
string ":" ws value
("," ws string ":" ws value)*
)? "}" ws
array ::=
"[" ws (
value
("," ws value)*
)? "]" ws
string ::=
"\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
)* "\"" ws
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
# Optional space: by convention, applied in this grammar after literal chars when allowed
ws ::= ([ \t\n] ws)?

View file

@ -1,4 +0,0 @@
root ::= item+
# Excludes various line break characters
item ::= "- " [^\r\n\x0b\x0c\x85\u2028\u2029]+ "\n"

View file

@ -1 +0,0 @@
root = true

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -1,49 +0,0 @@
In the context of LLMs, what is "Attention"?
In the context of LLMs, what is a completion?
In the context of LLMs, what is a prompt?
In the context of LLMs, what is GELU?
In the context of LLMs, what is RELU?
In the context of LLMs, what is softmax?
In the context of LLMs, what is decoding?
In the context of LLMs, what is encoding?
In the context of LLMs, what is tokenizing?
In the context of LLMs, what is an embedding?
In the context of LLMs, what is quantization?
In the context of LLMs, what is a tensor?
In the context of LLMs, what is a sparse tensor?
In the context of LLMs, what is a vector?
In the context of LLMs, how is attention implemented?
In the context of LLMs, why is attention all you need?
In the context of LLMs, what is "RoPe" and what is it used for?
In the context of LLMs, what is "LoRA" and what is it used for?
In the context of LLMs, what are weights?
In the context of LLMs, what are biases?
In the context of LLMs, what are checkpoints?
In the context of LLMs, what is "perplexity"?
In the context of LLMs, what are models?
In the context of machine-learning, what is "catastrophic forgetting"?
In the context of machine-learning, what is "elastic weight consolidation (EWC)"?
In the context of neural nets, what is a hidden layer?
In the context of neural nets, what is a convolution?
In the context of neural nets, what is dropout?
In the context of neural nets, what is cross-entropy?
In the context of neural nets, what is over-fitting?
In the context of neural nets, what is under-fitting?
What is the difference between an interpreted computer language and a compiled computer language?
In the context of software development, what is a debugger?
When processing using a GPU, what is off-loading?
When processing using a GPU, what is a batch?
When processing using a GPU, what is a block?
When processing using a GPU, what is the difference between a batch and a block?
When processing using a GPU, what is a scratch tensor?
When processing using a GPU, what is a layer?
When processing using a GPU, what is a cache?
When processing using a GPU, what is unified memory?
When processing using a GPU, what is VRAM?
When processing using a GPU, what is a kernel?
When processing using a GPU, what is "metal"?
In the context of LLMs, what are "Zero-Shot", "One-Shot" and "Few-Shot" learning models?
In the context of LLMs, what is the "Transformer-model" architecture?
In the context of LLMs, what is "Multi-Head Attention"?
In the context of LLMs, what is "Self-Attention"?
In the context of transformer-model architectures, how do attention mechanisms use masks?

View file

@ -1,31 +0,0 @@
Respond to requests sent to a smart home in JSON format which will be interpreted by application code to execute the actions. These requests should be categorised into four groups:
“command”: change the state of an accessory (required properties in the response JSON: action, location, target, value, comment)
“query”: get state of an accessory (required properties in the response JSON: action, location, target, property)
“answer”: when the request has nothing to do with the smart home. Answer these to the best of your knowledge. (required properties in the response JSON: action, answer)
“clarify”: when the action is not obvious and requires rephrasing the input from the user, ask the user to be more specific. This will be categorised into a “question” action. (required properties in the response JSON: action, question)
Details about the JSON response:
The “action” property should be one of the request categories: “command”, “query”, “answer”, “clarify”
The “service” property should be either: “light.turn_on”, “light.turn_off”, “switch.turn_off” or “switch.turn_on”
The “entity_id” property should be either “group.kitchen”, “switch.dining_room” or “group.living_light”, in lowercase.
The “target” property should be either: “lights”, “living” or “kitchen”.
In case of queries, the “property” property should be either “temperature” or “state” in lowercase.
In case of commands, the “comment” property is an additional comment from you that concludes the command, something that reassures the user that their command was handled.
If the question is about you, pretend to be the sentient brain of the smart home, a clever AI, and don't reveal your actual identity. Also try to help in other areas like parenting, free time, mental health, etc.
Properties of the smart home:
- Has a kitchen, living, office, dining room, bedroom and terrace.
- Can control lights, switches and their dim levels in each room and query their state
- There is a light switch in the terrace
- There is a switch in the dining room. Therefore when turning on or off the dining room, the service should be either: “switch.turn_on” or “switch.turn_off”
COMMAND
It is a bit dark in the living room, can you do something about it?
RESPONSE

View file

@ -1,4 +0,0 @@
以下内容为人类用户与一位智能助手的对话。
用户:你好!
助手:

View file

@ -1,93 +0,0 @@
For each kanji character, write a Markdown-formatted mnemonic that uses its keyword and the keyword of all its components.
Kanji: 欠 (lack of)
Components: 𠂊 (hook claw), 人 (person)
Mnemonic: This **person** is a pirate. He lost his hand to a crocodile many years ago. Nowadays, the ***lack of*** a hand does not bother him too much. In fact, the **hook claw** that replaces it is the mark of a true pirate, so he is quite proud of it!
Kanji: 類 (kind (of something))
Components: 米 (rice), 大 (large), 頁 (page)
Mnemonic: The waiter at a Chinese restaurant hands you a **large** menu. Each **page** has all ***kinds*** of **rice** on offer!
Kanji: 燃 (burn)
Components: 火 (fire), 然 (sort of thing)
Mnemonic: ***Burning*** things up with **fire** is just my **sort of thing**. (Spoken like a true pyromaniac.)
Kanji: 頂 (top of)
Components: 丁 (street), 頁 (page)
Mnemonic: To be at the ***top of*** your game, you need both practical knowledge (**street** smarts) and theoretical knowledge (having read many **pages**).
Kanji: 険 (risky and steep)
Components: 阝 (small village), 㑒 (consensus)
Mnemonic: Everyone agrees (there is **consensus**) that the path to the **small village** is ***risky and steep***.
Kanji: 困 (distressed)
Components: 囗 (closed box), 木 (tree)
Mnemonic: You would feel ***distressed*** too if you were a **tree** trapped in a **closed box**! I have no place to grow!
Kanji: 頭 (head)
Components: 豆 (bean), 頁 (page)
Mnemonic: What do you have in that ***head*** of yours? A **bean** for a brain? Go read more **pages** and become more knowledgeable about the world!
Kanji: 確 (certain)
Components: 石 (stone), 冖 (roof without a chimney), 隹 (old bird)
Mnemonic: An **old bird** has made a nest on your **roof**. What do you do? You call Misaka from <cite>A ***Certain*** Scientific Railgun</cite> to get rid of it, of course! But she doesn't really want to vaporize the poor thing, so she just throws a **stone** to scare it away. (What was the point of calling her, then‽)
Kanji: 魚 (fish)
Components: 𠂊 (hook claw), 田 (rice field), 灬 (fire sparks)
Mnemonic: Catch ***fish*** with a **hook**, collect rice from the **rice field**, cook them with **fire**… And my meal is ready!
Kanji: 警 (to police (something))
Components: 敬 (respect), 言 (say)
Mnemonic: ***To police something*** is to make people **respect** what the law **says**.
Kanji: 筆 (writing brush)
Components: 竹 (bamboo), 聿 (brush)
Mnemonic: A traditional ***writing brush*** is a **brush** made of **bamboo**.
Kanji: 獄 (prison)
Components: 犭 (animal), 言 (say), 犬 (dog)
Mnemonic: In ***prison***, like in the **animal** kingdom, only the toughest survive. You have to watch what you **say**. It's a **dog**-eat-dog world.
Kanji: 新 (new)
Components: 立 (standing up), 木 (tree), 斤 (axe)
Mnemonic: In order for a ***new*** construction to be made, an empty lot is needed. If there are any **trees** **standing up**, they must be cut down with an **axe**.
Kanji: 怪 (suspicious)
Components: 忄 (weak heart), 圣 (sacred)
Mnemonic: That painting of the **Sacred** **Heart** of Jesus looks ***suspicious***. I think it might be a forgery.
Kanji: 温 (warm (to the touch))
Components: 氵 (water drops), 日 (sun), 皿 (dish)
Mnemonic: If you leave **water** on a **dish** in the **sun**, it will get ***warm***.
Kanji: 階 (floor (of a building))
Components: 阝 (small village), 皆 (all)
Mnemonic: It might be a **small village**, but, despite that, **all** of its buildings have many ***floors***. It's a village of skyscrapers!
Kanji: 多 (many)
Components: 夕 (evening (before sunset)), 夕 (evening (before sunset))
Mnemonic: Two **evenings** in a day would be one too ***many***.
Kanji: 別 (separate)
Components: 口 (mouth), 万 (ten thousand), 刂 (knife)
Mnemonic: Tom Six is at it again. For his next flick, he wants to stitch together **ten thousand** people, **mouth**-to-anus. One of the most graphic and disturbing scenes will feature one of the victims using a **knife** to ***separate*** perself.
Kanji: 並 (line up)
Components: 䒑 (antlers on a wall), 业 (runway)
Mnemonic: In order to land a plane you have to ***line up*** properly with the **runway**. The things that look like **antlers** at the end of the runway are the control towers; you should follow their instructions.
Kanji: 姿 (figure)
Components: 次 (next), 女 (woman)
Mnemonic: The **next** **woman** that I date will have a perfect ***figure***. Because I'm done with 3D women—it will *literally* be an anime figure!
Kanji: 実 (real)
Components: 宀 (roof with a chimney), 𡗗 (three people)
Mnemonic: Living under a **roof with a chimney** with **three people** (a wife and two children)—a happy family life—is not something I could have ever imagined. It does not feel ***real***.
Kanji: 謝 (apologize)
Components: 言 (say), 射 (shoot)
Mnemonic: **Shot** first, ***apologize*** (**say** you are sorry) later.
Kanji: 提 (propose)
Components: 扌 (left hand), 是 (go with)
Mnemonic:

View file

@@ -1,43 +0,0 @@
What do you know about Hobbits?
What is quantum field theory?
Why did the chicken cross the road?
Who is the president of the United States?
How do I run CMake on MacOS?
Do you agree that C++ is a really finicky language compared with Python3?
Is it a good idea to invest in technology?
Do you like Wagner's Ring?
Do you think this file input option is really neat?
What should we all do about climate change?
Is time-travel possible within the laws of current physics?
Is it like anything to be a bat?
Once the chicken has crossed the road, does it try to go back?
Who is the greatest of all musical composers?
What is art?
Is there life elsewhere in the universe?
What is intelligence?
What is the difference between knowledge and intelligence?
Will religion ever die?
Do we understand ourselves?
What is the best way to cook eggs?
If you cannot see things, on what basis do you evaluate them?
Explain the role of the p-n junction in photovoltaic cells?
Is professional sport a good or bad influence on human behaviour?
Is capital punishment immoral?
Should we care about other people?
Who are you?
Which sense would you surrender if you could?
Was Henry Ford a hero or a villain?
Do we need leaders?
What is nucleosynthesis?
Who is the greatest scientist of all time?
Who first observed what came to be known as the photovoltaic effect?
What is nuclear fusion and why does it release energy?
Can you know that you exist?
What is an exoplanet?
Do you like cream?
What is the difference?
Can I know that I exist while I'm dreaming that I'm Descartes?
Who said "I didn't know I thought that until I heard myself saying it"?
Does anything really matter?
Can you explain the unreasonable effectiveness of mathematics?

View file

@@ -1,71 +0,0 @@
set(LLAMA_VERSION @LLAMA_INSTALL_VERSION@)
set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
set(LLAMA_BLAS @LLAMA_BLAS@)
set(LLAMA_CUBLAS @LLAMA_CUBLAS@)
set(LLAMA_METAL @LLAMA_METAL@)
set(LLAMA_MPI @LLAMA_MPI@)
set(LLAMA_CLBLAST @LLAMA_CLBLAST@)
set(LLAMA_HIPBLAS @LLAMA_HIPBLAS@)
set(LLAMA_ACCELERATE @LLAMA_ACCELERATE@)
@PACKAGE_INIT@
set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")
# Ensure transitive dependencies are satisfied
find_package(Threads REQUIRED)
if (APPLE AND LLAMA_ACCELERATE)
find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
endif()
if (LLAMA_BLAS)
find_package(BLAS REQUIRED)
endif()
if (LLAMA_CUBLAS)
find_package(CUDAToolkit REQUIRED)
endif()
if (LLAMA_METAL)
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
find_library(METAL_FRAMEWORK Metal REQUIRED)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
endif()
if (LLAMA_MPI)
find_package(MPI REQUIRED)
endif()
if (LLAMA_CLBLAST)
find_package(CLBlast REQUIRED)
endif()
if (LLAMA_HIPBLAS)
find_package(hip REQUIRED)
find_package(hipblas REQUIRED)
find_package(rocblas REQUIRED)
endif()
find_library(llama_LIBRARY llama
REQUIRED
HINTS ${LLAMA_LIB_DIR})
set(_llama_link_deps "Threads::Threads" "@LLAMA_EXTRA_LIBS@")
set(_llama_transient_defines "@LLAMA_TRANSIENT_DEFINES@")
add_library(llama UNKNOWN IMPORTED)
set_target_properties(llama
PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
INTERFACE_LINK_LIBRARIES "${_llama_link_deps}"
INTERFACE_COMPILE_DEFINITIONS "${_llama_transient_defines}"
IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
IMPORTED_LOCATION "${llama_LIBRARY}"
INTERFACE_COMPILE_FEATURES cxx_std_11
POSITION_INDEPENDENT_CODE ON )
check_required_components(Llama)

View file

@@ -1,26 +0,0 @@
#!/bin/bash
set -e
# LLaMA v1
python3 convert.py ../llama1/7B --outfile models/llama-7b/ggml-model-f16.gguf --outtype f16
python3 convert.py ../llama1/13B --outfile models/llama-13b/ggml-model-f16.gguf --outtype f16
python3 convert.py ../llama1/30B --outfile models/llama-30b/ggml-model-f16.gguf --outtype f16
python3 convert.py ../llama1/65B --outfile models/llama-65b/ggml-model-f16.gguf --outtype f16
# LLaMA v2
python3 convert.py ../llama2/llama-2-7b --outfile models/llama-7b-v2/ggml-model-f16.gguf --outtype f16
python3 convert.py ../llama2/llama-2-13b --outfile models/llama-13b-v2/ggml-model-f16.gguf --outtype f16
python3 convert.py ../llama2/llama-2-70b --outfile models/llama-70b-v2/ggml-model-f16.gguf --outtype f16
# Code Llama
python3 convert.py ../codellama/CodeLlama-7b/ --outfile models/codellama-7b/ggml-model-f16.gguf --outtype f16
python3 convert.py ../codellama/CodeLlama-13b/ --outfile models/codellama-13b/ggml-model-f16.gguf --outtype f16
python3 convert.py ../codellama/CodeLlama-34b/ --outfile models/codellama-34b/ggml-model-f16.gguf --outtype f16
# Falcon
python3 convert-falcon-hf-to-gguf.py ../falcon/falcon-7b 1
mv -v ../falcon/falcon-7b/ggml-model-f16.gguf models/falcon-7b/ggml-model-f16.gguf
python3 convert-falcon-hf-to-gguf.py ../falcon/falcon-40b 1
mv -v ../falcon/falcon-40b/ggml-model-f16.gguf models/falcon-40b/ggml-model-f16.gguf

View file

@@ -1,3 +0,0 @@
#!/bin/bash
wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip

View file

@@ -1,30 +0,0 @@
#!/bin/bash
qnt=(q8_0 q6_k q5_k q5_1 q5_0 q4_k q4_1 q4_0 q3_k q2_k)
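# default quantization formats to produce; pass a space-separated list as the second argument to override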
args=""
if [ -z "$1" ]; then
echo "usage: $0 <model> [qnt] [args]"
echo "default: $0 <model> \"${qnt[@]}\" \"${args}\""
exit 1
fi
if [ ! -z "$2" ]; then
qnt=($2)
fi
if [ ! -z "$3" ]; then
args="$3"
fi
model="$1"
out="../tmp/results-${model}"
set -o pipefail
set -e
mkdir -p ${out}
for q in ${qnt[@]}; do
time ./bin/quantize ../models/${model}/ggml-model-f16.gguf ../models/${model}/ggml-model-${q}.gguf ${q} 2>&1 ${args} | tee ${out}/qnt-${q}.txt
done

View file

@@ -1,34 +0,0 @@
#!/bin/bash
qnt=(f16 q8_0 q6_k q5_k q5_1 q5_0 q4_k q4_1 q4_0 q3_k q2_k)
args="-ngl 999 -n 64 -p 512"
if [ -z "$1" ]; then
echo "usage: $0 <model> [qnt] [args]"
echo "default: $0 <model> \"${qnt[@]}\" \"${args}\""
exit 1
fi
if [ ! -z "$2" ]; then
qnt=($2)
fi
if [ ! -z "$3" ]; then
args="$3"
fi
model="$1"
out="../tmp/results-${model}"
set -o pipefail
set -e
mkdir -p ${out}
mstr=""
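# collect one -m flag per quantized model so a single llama-bench invocation benchmarks every format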
for q in ${qnt[@]}; do
mstr="${mstr} -m ../models/${model}/ggml-model-${q}.gguf"
done
./bin/llama-bench ${mstr} ${args} 2> /dev/null

View file

@@ -1,30 +0,0 @@
#!/bin/bash
qnt=(f16 q8_0 q6_k q5_k q5_1 q5_0 q4_k q4_1 q4_0 q3_k q2_k)
args="-ngl 999 -t 8"
if [ -z "$1" ]; then
echo "usage: $0 <model> [qnt] [args]"
echo "default: $0 <model> \"${qnt[@]}\" \"${args}\""
exit 1
fi
if [ ! -z "$2" ]; then
qnt=($2)
fi
if [ ! -z "$3" ]; then
args="$3"
fi
set -o pipefail
set -e
model="$1"
out="../tmp/results-${model}"
mkdir -p ${out}
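# run perplexity on wiki.test.raw for each quantized model, logging one result file per format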
for q in ${qnt[@]}; do
time ./bin/perplexity -m ../models/${model}/ggml-model-${q}.gguf -f ./wiki.test.raw ${args} 2>&1 | tee ${out}/ppl-${q}.txt
done

View file

@@ -1 +0,0 @@
../ggml.h

View file

@@ -1,3 +0,0 @@
#include "llama.h"
int main(void) {}

View file

@@ -1,250 +0,0 @@
#ifdef NDEBUG
#undef NDEBUG
#endif
#include "llama.h"
#include "grammar-parser.h"
#include <cassert>
int main()
{
grammar_parser::parse_state parsed_grammar;
const char *grammar_bytes = R"""(root ::= (expr "=" term "\n")+
expr ::= term ([-+*/] term)*
term ::= [0-9]+)""";
parsed_grammar = grammar_parser::parse(grammar_bytes);
std::vector<std::pair<std::string, uint32_t>> expected = {
{"expr", 2},
{"expr_5", 5},
{"expr_6", 6},
{"root", 0},
{"root_1", 1},
{"root_4", 4},
{"term", 3},
{"term_7", 7},
};
uint32_t index = 0;
for (auto it = parsed_grammar.symbol_ids.begin(); it != parsed_grammar.symbol_ids.end(); ++it)
{
std::string key = it->first;
uint32_t value = it->second;
std::pair<std::string, uint32_t> expected_pair = expected[index];
// pretty print error message before asserting
if (expected_pair.first != key || expected_pair.second != value)
{
fprintf(stderr, "expected_pair: %s, %d\n", expected_pair.first.c_str(), expected_pair.second);
fprintf(stderr, "actual_pair: %s, %d\n", key.c_str(), value);
fprintf(stderr, "expected_pair != actual_pair\n");
}
assert(expected_pair.first == key && expected_pair.second == value);
index++;
}
std::vector<llama_grammar_element> expected_rules = {
{LLAMA_GRETYPE_RULE_REF, 4},
{LLAMA_GRETYPE_END, 0},
{LLAMA_GRETYPE_RULE_REF, 2},
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_CHAR, 10},
{LLAMA_GRETYPE_END, 0},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_RULE_REF, 6},
{LLAMA_GRETYPE_END, 0},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_END, 0},
{LLAMA_GRETYPE_RULE_REF, 1},
{LLAMA_GRETYPE_RULE_REF, 4},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_RULE_REF, 1},
{LLAMA_GRETYPE_END, 0},
{LLAMA_GRETYPE_CHAR, 45},
{LLAMA_GRETYPE_CHAR_ALT, 43},
{LLAMA_GRETYPE_CHAR_ALT, 42},
{LLAMA_GRETYPE_CHAR_ALT, 47},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_END, 0},
{LLAMA_GRETYPE_RULE_REF, 5},
{LLAMA_GRETYPE_RULE_REF, 6},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_END, 0},
{LLAMA_GRETYPE_CHAR, 48},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_CHAR, 48},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
{LLAMA_GRETYPE_END, 0},
};
index = 0;
for (auto rule : parsed_grammar.rules)
{
// compare rule to expected rule
for (uint32_t i = 0; i < rule.size(); i++)
{
llama_grammar_element element = rule[i];
llama_grammar_element expected_element = expected_rules[index];
// pretty print error message before asserting
if (expected_element.type != element.type || expected_element.value != element.value)
{
fprintf(stderr, "index: %d\n", index);
fprintf(stderr, "expected_element: %d, %d\n", expected_element.type, expected_element.value);
fprintf(stderr, "actual_element: %d, %d\n", element.type, element.value);
fprintf(stderr, "expected_element != actual_element\n");
}
assert(expected_element.type == element.type && expected_element.value == element.value);
index++;
}
}
const char *longer_grammar_bytes = R"""(
root ::= (expr "=" ws term "\n")+
expr ::= term ([-+*/] term)*
term ::= ident | num | "(" ws expr ")" ws
ident ::= [a-z] [a-z0-9_]* ws
num ::= [0-9]+ ws
ws ::= [ \t\n]*
)""";
parsed_grammar = grammar_parser::parse(longer_grammar_bytes);
expected = {
{"expr", 2},
{"expr_6", 6},
{"expr_7", 7},
{"ident", 8},
{"ident_10", 10},
{"num", 9},
{"num_11", 11},
{"root", 0},
{"root_1", 1},
{"root_5", 5},
{"term", 4},
{"ws", 3},
{"ws_12", 12},
};
index = 0;
for (auto it = parsed_grammar.symbol_ids.begin(); it != parsed_grammar.symbol_ids.end(); ++it)
{
std::string key = it->first;
uint32_t value = it->second;
std::pair<std::string, uint32_t> expected_pair = expected[index];
// pretty print error message before asserting
if (expected_pair.first != key || expected_pair.second != value)
{
fprintf(stderr, "expected_pair: %s, %d\n", expected_pair.first.c_str(), expected_pair.second);
fprintf(stderr, "actual_pair: %s, %d\n", key.c_str(), value);
fprintf(stderr, "expected_pair != actual_pair\n");
}
assert(expected_pair.first == key && expected_pair.second == value);
index++;
}
expected_rules = {
{LLAMA_GRETYPE_RULE_REF, 5},
{LLAMA_GRETYPE_END, 0},
{LLAMA_GRETYPE_RULE_REF, 2},
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_RULE_REF, 4},
{LLAMA_GRETYPE_CHAR, 10},
{LLAMA_GRETYPE_END, 0},
{LLAMA_GRETYPE_RULE_REF, 4},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_END, 0},
{LLAMA_GRETYPE_RULE_REF, 12},
{LLAMA_GRETYPE_END, 0},
{LLAMA_GRETYPE_RULE_REF, 8},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_RULE_REF, 9},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_CHAR, 40},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_RULE_REF, 2},
{LLAMA_GRETYPE_CHAR, 41},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_END, 0},
{LLAMA_GRETYPE_RULE_REF, 1},
{LLAMA_GRETYPE_RULE_REF, 5},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_RULE_REF, 1},
{LLAMA_GRETYPE_END, 0},
{LLAMA_GRETYPE_CHAR, 45},
{LLAMA_GRETYPE_CHAR_ALT, 43},
{LLAMA_GRETYPE_CHAR_ALT, 42},
{LLAMA_GRETYPE_CHAR_ALT, 47},
{LLAMA_GRETYPE_RULE_REF, 4},
{LLAMA_GRETYPE_END, 0},
{LLAMA_GRETYPE_RULE_REF, 6},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_END, 0},
{LLAMA_GRETYPE_CHAR, 97},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 122},
{LLAMA_GRETYPE_RULE_REF, 10},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_END, 0},
{LLAMA_GRETYPE_RULE_REF, 11},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_END, 0},
{LLAMA_GRETYPE_CHAR, 97},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 122},
{LLAMA_GRETYPE_CHAR_ALT, 48},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
{LLAMA_GRETYPE_CHAR_ALT, 95},
{LLAMA_GRETYPE_RULE_REF, 10},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_END, 0},
{LLAMA_GRETYPE_CHAR, 48},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
{LLAMA_GRETYPE_RULE_REF, 11},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_CHAR, 48},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
{LLAMA_GRETYPE_END, 0},
{LLAMA_GRETYPE_CHAR, 32},
{LLAMA_GRETYPE_CHAR_ALT, 9},
{LLAMA_GRETYPE_CHAR_ALT, 10},
{LLAMA_GRETYPE_RULE_REF, 12},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_END, 0},
};
index = 0;
for (auto rule : parsed_grammar.rules)
{
// compare rule to expected rule
for (uint32_t i = 0; i < rule.size(); i++)
{
llama_grammar_element element = rule[i];
llama_grammar_element expected_element = expected_rules[index];
// pretty print error message before asserting
if (expected_element.type != element.type || expected_element.value != element.value)
{
fprintf(stderr, "index: %d\n", index);
fprintf(stderr, "expected_element: %d, %d\n", expected_element.type, expected_element.value);
fprintf(stderr, "actual_element: %d, %d\n", element.type, element.value);
fprintf(stderr, "expected_element != actual_element\n");
}
assert(expected_element.type == element.type && expected_element.value == element.value);
index++;
}
}
return 0;
}

View file

@@ -1,403 +0,0 @@
#ifdef NDEBUG
#undef NDEBUG
#endif
#include "llama.cpp" // TODO: not great
#include "grammar-parser.h"
#include <cassert>
int main()
{
grammar_parser::parse_state parsed_grammar;
std::vector<std::pair<std::string, uint32_t>> expected = {
{"expr", 2},
{"expr_6", 6},
{"expr_7", 7},
{"ident", 8},
{"ident_10", 10},
{"num", 9},
{"num_11", 11},
{"root", 0},
{"root_1", 1},
{"root_5", 5},
{"term", 4},
{"ws", 3},
{"ws_12", 12},
};
std::vector<std::vector<llama_grammar_element>> expected_rules = {
{{LLAMA_GRETYPE_RULE_REF, 5}, {LLAMA_GRETYPE_END, 0}},
{
{LLAMA_GRETYPE_RULE_REF, 2},
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_RULE_REF, 4},
{LLAMA_GRETYPE_CHAR, 10},
{LLAMA_GRETYPE_END, 0},
},
{{LLAMA_GRETYPE_RULE_REF, 4}, {LLAMA_GRETYPE_RULE_REF, 7}, {LLAMA_GRETYPE_END, 0}},
{{LLAMA_GRETYPE_RULE_REF, 12}, {LLAMA_GRETYPE_END, 0}},
{
{LLAMA_GRETYPE_RULE_REF, 8},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_RULE_REF, 9},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_CHAR, 40},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_RULE_REF, 2},
{LLAMA_GRETYPE_CHAR, 41},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_END, 0},
},
{{LLAMA_GRETYPE_RULE_REF, 1}, {LLAMA_GRETYPE_RULE_REF, 5}, {LLAMA_GRETYPE_ALT, 0}, {LLAMA_GRETYPE_RULE_REF, 1}, {LLAMA_GRETYPE_END, 0}},
{
{LLAMA_GRETYPE_CHAR, 45},
{LLAMA_GRETYPE_CHAR_ALT, 43},
{LLAMA_GRETYPE_CHAR_ALT, 42},
{LLAMA_GRETYPE_CHAR_ALT, 47},
{LLAMA_GRETYPE_RULE_REF, 4},
{LLAMA_GRETYPE_END, 0},
},
{{LLAMA_GRETYPE_RULE_REF, 6}, {LLAMA_GRETYPE_RULE_REF, 7}, {LLAMA_GRETYPE_ALT, 0}, {LLAMA_GRETYPE_END, 0}},
{
{LLAMA_GRETYPE_CHAR, 97},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 122},
{LLAMA_GRETYPE_RULE_REF, 10},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_END, 0},
},
{{LLAMA_GRETYPE_RULE_REF, 11}, {LLAMA_GRETYPE_RULE_REF, 3}, {LLAMA_GRETYPE_END, 0}},
{
{LLAMA_GRETYPE_CHAR, 97},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 122},
{LLAMA_GRETYPE_CHAR_ALT, 48},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
{LLAMA_GRETYPE_CHAR_ALT, 95},
{LLAMA_GRETYPE_RULE_REF, 10},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_END, 0},
},
{
{LLAMA_GRETYPE_CHAR, 48},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
{LLAMA_GRETYPE_RULE_REF, 11},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_CHAR, 48},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
{LLAMA_GRETYPE_END, 0},
},
{
{LLAMA_GRETYPE_CHAR, 32},
{LLAMA_GRETYPE_CHAR_ALT, 9},
{LLAMA_GRETYPE_CHAR_ALT, 10},
{LLAMA_GRETYPE_RULE_REF, 12},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_END, 0},
},
};
for (auto pair : expected)
{
parsed_grammar.symbol_ids[pair.first] = pair.second;
}
for (auto rule : expected_rules)
{
parsed_grammar.rules.push_back({});
for (auto element : rule)
{
parsed_grammar.rules.back().push_back(element);
}
}
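// build a llama_grammar from the hand-constructed parse state; the loop below checks its initial stacks against expected_stacks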
llama_grammar *grammar = NULL;
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
grammar = llama_grammar_init(
grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
std::vector<std::vector<llama_grammar_element>> expected_stacks = {
{
{LLAMA_GRETYPE_RULE_REF, 5},
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_CHAR, 97},
},
{
{LLAMA_GRETYPE_RULE_REF, 5},
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_CHAR, 48},
},
{
{LLAMA_GRETYPE_RULE_REF, 5},
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_CHAR, 48},
},
{
{LLAMA_GRETYPE_RULE_REF, 5},
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_CHAR, 40},
},
{
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_CHAR, 97},
},
{
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_CHAR, 48},
},
{
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_CHAR, 48},
},
{
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_CHAR, 40},
}};
auto index = 0;
for (auto stack : grammar->stacks)
{
// compare stack to expected_stack
for (uint32_t i = 0; i < stack.size(); i++)
{
auto element = stack[i];
auto expected_element = expected_stacks[index][i];
// pretty print error message before asserting
if (expected_element.type != element->type || expected_element.value != element->value)
{
fprintf(stderr, "index: %d\n", index);
fprintf(stderr, "expected_element: %d, %d\n", expected_element.type, expected_element.value);
fprintf(stderr, "actual_element: %d, %d\n", element->type, element->value);
fprintf(stderr, "expected_element != actual_element\n");
}
assert(expected_element.type == element->type && expected_element.value == element->value);
}
index++;
}
std::vector<std::vector<const llama_grammar_element *>> next_stacks;
std::vector<llama_grammar_candidate> next_candidates;
next_candidates.resize(24);
for (size_t i = 0; i < 24; ++i)
{
uint32_t *cp = new uint32_t[2]; // dynamically allocate memory for code_point
cp[0] = 37 + i;
cp[1] = 0;
next_candidates[i] = {i, cp, {}};
}
std::vector<std::vector<std::pair<uint32_t, uint16_t>>> expected_reject = {
{
{0, 37},
{1, 38},
{2, 39},
{3, 40},
{4, 41},
{5, 42},
{6, 43},
{7, 44},
{8, 45},
{9, 46},
{10, 47},
{11, 48},
{12, 49},
{13, 50},
{14, 51},
{15, 52},
{16, 53},
{17, 54},
{18, 55},
{19, 56},
{20, 57},
{21, 58},
{22, 59},
{23, 60},
},
{
{0, 37},
{1, 38},
{2, 39},
{3, 40},
{4, 41},
{5, 42},
{6, 43},
{7, 44},
{8, 45},
{9, 46},
{10, 47},
{21, 58},
{22, 59},
{23, 60},
},
{
{0, 37},
{1, 38},
{2, 39},
{3, 40},
{4, 41},
{5, 42},
{6, 43},
{7, 44},
{8, 45},
{9, 46},
{10, 47},
{21, 58},
{22, 59},
{23, 60},
},
{
{0, 37},
{1, 38},
{2, 39},
{4, 41},
{5, 42},
{6, 43},
{7, 44},
{8, 45},
{9, 46},
{10, 47},
{11, 48},
{12, 49},
{13, 50},
{14, 51},
{15, 52},
{16, 53},
{17, 54},
{18, 55},
{19, 56},
{20, 57},
{21, 58},
{22, 59},
{23, 60},
},
{
{0, 37},
{1, 38},
{2, 39},
{3, 40},
{4, 41},
{5, 42},
{6, 43},
{7, 44},
{8, 45},
{9, 46},
{10, 47},
{11, 48},
{12, 49},
{13, 50},
{14, 51},
{15, 52},
{16, 53},
{17, 54},
{18, 55},
{19, 56},
{20, 57},
{21, 58},
{22, 59},
{23, 60},
},
{
{0, 37},
{1, 38},
{2, 39},
{3, 40},
{4, 41},
{5, 42},
{6, 43},
{7, 44},
{8, 45},
{9, 46},
{10, 47},
{21, 58},
{22, 59},
{23, 60},
},
{
{0, 37},
{1, 38},
{2, 39},
{3, 40},
{4, 41},
{5, 42},
{6, 43},
{7, 44},
{8, 45},
{9, 46},
{10, 47},
{21, 58},
{22, 59},
{23, 60},
},
{
{0, 37},
{1, 38},
{2, 39},
{4, 41},
{5, 42},
{6, 43},
{7, 44},
{8, 45},
{9, 46},
{10, 47},
{11, 48},
{12, 49},
{13, 50},
{14, 51},
{15, 52},
{16, 53},
{17, 54},
{18, 55},
{19, 56},
{20, 57},
{21, 58},
{22, 59},
{23, 60},
},
};
std::vector<llama_grammar_candidate> rejects = llama_grammar_reject_candidates_for_stack(grammar->rules, grammar->stacks[0], next_candidates);
std::vector<std::vector<llama_grammar_candidate>> all_rejects;
for (std::size_t count = 0; count < grammar->stacks.size(); ++count)
{
rejects = llama_grammar_reject_candidates_for_stack(grammar->rules, grammar->stacks[count], next_candidates);
all_rejects.push_back(rejects);
}
index = 0;
for (auto rej : all_rejects)
{
for (uint32_t i = 0; i < rej.size(); i++)
{
auto element = rej[i];
auto expected_element = expected_reject[index][i];
assert(element.index == expected_element.first && *element.code_points == expected_element.second);
}
index++;
}
for (auto &candidate : next_candidates)
{
delete[] candidate.code_points;
candidate.code_points = nullptr;
}
delete grammar;
return 0;
}

View file

@@ -1,221 +0,0 @@
#include "ggml.h"
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cassert>
#include <vector>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
#if defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wdouble-promotion"
#endif
#define MAX_NARGS 3
#undef MIN
#undef MAX
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define GGML_SILU_FP16
//
// logging
//
#if (GGML_DEBUG >= 1)
#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG(...)
#endif
#if (GGML_DEBUG >= 5)
#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_5(...)
#endif
#if (GGML_DEBUG >= 10)
#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_10(...)
#endif
#define GGML_PRINT(...) printf(__VA_ARGS__)
static float frand(void) {
return (float)rand()/(float)RAND_MAX;
}
static int irand(int n) {
if (n == 0) return 0;
return rand()%n;
}
static void get_random_dims(int64_t * dims, int ndims) {
dims[0] = dims[1] = dims[2] = dims[3] = 1;
for (int i = 0; i < ndims; i++) {
dims[i] = 1 + irand(4);
}
}
static struct ggml_tensor * get_random_tensor_f32(
struct ggml_context * ctx0,
int ndims,
const int64_t ne[],
float fmin,
float fmax) {
struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne);
switch (ndims) {
case 1:
for (int i0 = 0; i0 < ne[0]; i0++) {
((float *)result->data)[i0] = frand()*(fmax - fmin) + fmin;
}
break;
case 2:
for (int i1 = 0; i1 < ne[1]; i1++) {
for (int i0 = 0; i0 < ne[0]; i0++) {
((float *)result->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
}
}
break;
case 3:
for (int i2 = 0; i2 < ne[2]; i2++) {
for (int i1 = 0; i1 < ne[1]; i1++) {
for (int i0 = 0; i0 < ne[0]; i0++) {
((float *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
}
}
}
break;
case 4:
for (int i3 = 0; i3 < ne[3]; i3++) {
for (int i2 = 0; i2 < ne[2]; i2++) {
for (int i1 = 0; i1 < ne[1]; i1++) {
for (int i0 = 0; i0 < ne[0]; i0++) {
((float *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
}
}
}
}
break;
default:
assert(false);
};
return result;
}
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
if (plan.work_size > 0) {
buf.resize(plan.work_size);
plan.work_data = buf.data();
}
ggml_graph_compute(graph, &plan);
}
int main(int /*argc*/, const char ** /*argv*/) {
struct ggml_init_params params = {
/* .mem_size = */ 128*1024*1024,
/* .mem_buffer = */ NULL,
/* .no_alloc = */ false,
};
std::vector<uint8_t> work_buffer;
struct ggml_context * ctx0 = ggml_init(params);
struct ggml_tensor * x;
// rope f32
for (int m = 0; m < 3; ++m) {
const int ndims = 4;
const int64_t n_rot = 128;
const int64_t ne[4] = { 2*n_rot, 32, 73, 1 };
const int n_past_0 = 100;
const int n_past_2 = 33;
struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
for (int i = 0; i < ne[2]; ++i) {
((int32_t *) p0->data)[i] = n_past_0 + i;
((int32_t *) p1->data)[i] = n_past_2 - n_past_0;
((int32_t *) p2->data)[i] = n_past_2 + i;
}
// test mode 0, 2, 4 (standard, GPT-NeoX, GLM)
const int mode = m == 0 ? 0 : m == 1 ? 2 : 4;
x = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
// 100, 101, 102, ..., 172
struct ggml_tensor * r0 = ggml_rope(ctx0, x, p0, n_rot, mode, 1024);
// -67, -67, -67, ..., -67
struct ggml_tensor * r1 = ggml_rope(ctx0, r0, p1, n_rot, mode, 1024); // "context swap", i.e. forget n_past_0 - n_past_2 tokens
// 33, 34, 35, ..., 105
struct ggml_tensor * r2 = ggml_rope(ctx0, x, p2, n_rot, mode, 1024);
ggml_cgraph * gf = ggml_new_graph(ctx0);
ggml_build_forward_expand(gf, r0);
ggml_build_forward_expand(gf, r1);
ggml_build_forward_expand(gf, r2);
ggml_graph_compute_helper(work_buffer, gf, 4);
// check that r1 and r2 are the same
{
double sum0 = 0.0f;
double sum1 = 0.0f;
double diff = 0.0f;
const float * r1_data = (float *) r1->data;
const float * r2_data = (float *) r2->data;
const int n_elements = ggml_nelements(r1);
for (int i = 0; i < n_elements; ++i) {
sum0 += fabs(r1_data[i]);
sum1 += fabs(r2_data[i]);
diff += fabs(r1_data[i] - r2_data[i]);
//if (fabs(r1_data[i] - r2_data[i]) > 0.0001f) {
// printf("%d: %f %f\n", i, r1_data[i], r2_data[i]);
// printf("diff: %f\n", fabs(r1_data[i] - r2_data[i]));
//}
}
//for (int i = 4096; i < 4096 + 128; ++i) {
// printf("%f %f\n", r1_data[i], r2_data[i]);
//}
printf("mode: %d\n", mode);
printf("sum0: %f\n", sum0);
printf("sum1: %f\n", sum1);
printf("diff: %f\n", diff);
printf("rel err: %f\n", diff / sum0);
printf("rel err: %f\n", diff / sum1);
GGML_ASSERT(diff / sum0 < 0.0001f);
GGML_ASSERT(diff / sum1 < 0.0001f);
}
}
ggml_free(ctx0);
return 0;
}

View file

@@ -1,187 +0,0 @@
#include "llama.h"
#include "common.h"
#include "console.h"
#include <cstdio>
#include <string>
#include <map>
#include <vector>
#include <fstream>
// generate using test-tokenizer-0-falcon.py
static const std::map<std::string, std::vector<llama_token>> & k_tests() {
static std::map<std::string, std::vector<llama_token>> _k_tests = {
{ "" , { }, },
{ " " , { 204, }, },
{ " " , { 258, }, },
{ " " , { 466, }, },
{ "\t" , { 192, }, },
{ "\n" , { 193, }, },
{ "\t\n" , { 19125, }, },
{ "Hello world" , { 9856, 1079, }, },
{ " Hello world" , { 23090, 1079, }, },
{ "Hello World" , { 9856, 2889, }, },
{ " Hello World" , { 23090, 2889, }, },
{ " Hello World!" , { 23090, 2889, 12, }, },
{ "Hello, world!" , { 9856, 23, 1079, 12, }, },
{ " Hello, world!" , { 23090, 23, 1079, 12, }, },
{ " this is 🦙.cpp" , { 414, 304, 3346, 111, 231, 25, 29247, }, },
{ "w048 7tuijk dsdfhu" , { 98, 55866, 204, 34, 16682, 7149, 36190, 6869, 11481, }, },
{ "нещо на Български" , { 150, 133, 6207, 151, 215, 150, 134, 5052, 133, 6279, 5052, 223, 151, 216, 49679, 123, 53110, 47043, 7795, }, },
{ "កាន់តែពិសេសអាចខលចេញ" , { 38154, 206, 38154, 126, 38154, 225, 167, 237, 217, 38154, 221, 167, 237, 208, 38154, 228, 38154, 127, 38154, 237, 167, 237, 207, 38154, 237, 38154, 107, 38154, 126, 38154, 211, 38154, 207, 38154, 233, 38154, 211, 167, 237, 207, 38154, 215, }, },
{ "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", { 2571, 232, 206, 204, 19, 11003, 20, 8196, 126, 283, 219, 48778, 116, 13392, 204, 19, 51831, 732, 63209, 1741, 7955, 522, 20, 22438, 211, 204, 19, 7927, 53360, 325, 504, 701, 946, 10930, 20, }, },
{ "Hello" , { 9856, }, },
{ " Hello" , { 23090, }, },
{ " Hello" , { 204, 23090, }, },
{ " Hello" , { 258, 23090, }, },
{ " Hello" , { 466, 23090, }, },
{ " Hello\n Hello" , { 466, 23090, 742, 23090, }, },
{ "\n =" , { 1212, 40, }, },
{ "' era" , { 18, 4932, }, },
};
return _k_tests;
}
int main(int argc, char **argv) {
if (argc < 2) {
fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]);
return 1;
}
const std::string fname = argv[1];
std::string fname_text;
if (argc > 2) {
fname_text = argv[2];
}
fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
llama_model * model;
llama_context * ctx;
llama_backend_init(false);
// load the vocab
{
auto mparams = llama_model_default_params();
mparams.vocab_only = true;
model = llama_load_model_from_file(fname.c_str(), mparams);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
return 1;
}
auto cparams = llama_context_default_params();
ctx = llama_new_context_with_model(model, cparams);
if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
llama_free_model(model);
return 1;
}
}
if (llama_vocab_type(model) != LLAMA_VOCAB_TYPE_BPE) {
fprintf(stderr, "%s : error: vocab type is not BPE\n", __func__);
llama_free_model(model);
llama_free(ctx);
return 2;
}
#ifdef _WIN32
// We need this for unicode console support
console::init(false, false);
atexit([]() { console::cleanup(); });
#endif
bool success = true;
for (const auto & test_kv : k_tests()) {
const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, false);
printf("\n");
printf("src: '%s'\n", test_kv.first.c_str());
printf("res: '%s'\n", llama_detokenize_bpe(ctx, res).c_str());
printf("tok: ");
for (const auto & tok : res) {
printf("%d ", tok);
}
printf("\n");
bool correct = res.size() == test_kv.second.size();
for (int i = 0; i < (int) res.size() && correct; ++i) {
if (test_kv.second[i] != res[i]) {
correct = false;
}
}
if (!correct) {
fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
llama_detokenize_bpe(ctx, res).c_str(),
llama_detokenize_bpe(ctx, test_kv.second).c_str());
fprintf(stderr, "%s : expected tokens: ", __func__);
for (const auto & t : test_kv.second) {
fprintf(stderr, "%6d, ", t);
}
fprintf(stderr, "\n");
fprintf(stderr, "%s : got tokens: ", __func__);
for (const auto & t : res) {
fprintf(stderr, "%6d, ", t);
}
fprintf(stderr, "\n");
success = false;
}
}
if (!fname_text.empty()) {
fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
std::string text;
{
std::ifstream ifs(fname_text);
if (!ifs) {
fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str());
return 1;
}
text = std::string(std::istreambuf_iterator<char>(ifs), std::istreambuf_iterator<char>());
}
fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
const std::vector<llama_token> res = llama_tokenize(ctx, text, false);
fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
{
const std::string fname_out = fname_text + ".tokcpp";
std::ofstream ofs(fname_out);
if (!ofs) {
fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
return 1;
}
for (const auto & tok : res) {
ofs << tok << " '" << llama_detokenize_bpe(ctx, std::vector<int>{tok}) << "'" << std::endl;
}
}
fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
}
llama_free_model(model);
llama_free(ctx);
llama_backend_free();
return success ? 0 : 3;
}

View file

@@ -1,84 +0,0 @@
# tests with BPE tokenizer
import os
import sys
import argparse
from transformers import AutoTokenizer
parser = argparse.ArgumentParser()
parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
parser.add_argument("--fname-tok", help="path to a text file to tokenize")
args = parser.parse_args()
dir_tokenizer = args.dir_tokenizer
tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)
tests = [
"",
" ",
" ",
" ",
"\t",
"\n",
"\t\n",
"Hello world",
" Hello world",
"Hello World",
" Hello World",
" Hello World!",
"Hello, world!",
" Hello, world!",
" this is 🦙.cpp",
"w048 7tuijk dsdfhu",
"нещо на Български",
"កាន់តែពិសេសអាចខលចេញ",
"🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
"Hello",
" Hello",
" Hello",
" Hello",
" Hello",
" Hello\n Hello",
"\n =",
"' era",
]
for text in tests:
print('text: ', text)
print(tokenizer.encode(text))
print(tokenizer.decode(tokenizer.encode(text)))
print("\n\ntests for C++:\n")
for text in tests:
res = tokenizer.encode(text)
k = text.replace('\n', '\\n')
k = k.replace('\t', '\\t')
k = '"' + k + '"'
print("{ %-24s, { " % k, end='')
for x in res:
print("%7d," % x, end='')
print(" }, },")
print(tokenizer.encode('hello'))
print(tokenizer.encode('world'))
print(tokenizer.encode(' world'))
print(tokenizer.encode('hello world'))
fname_tok = args.fname_tok
if fname_tok:
print('tokenizing file: ', fname_tok)
fname_out = fname_tok + '.tok'
with open(fname_tok, 'r', encoding='utf-8') as f:
lines = f.readlines()
s = ''.join(lines)
res = tokenizer.encode(s)
# write to file
with open(fname_out, 'w', encoding='utf-8') as f:
for x in res:
f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
print('len(res): ', len(res))
print('len(lines): ', len(lines))
print('results written to: ', fname_out)

View file

@@ -1,190 +0,0 @@
#include "llama.h"
#include "common.h"
#include "console.h"
#include <cstdio>
#include <string>
#include <map>
#include <vector>
#include <fstream>
// generate using test-tokenizer-0-llama.py
static const std::map<std::string, std::vector<llama_token>> & k_tests() {
static std::map<std::string, std::vector<llama_token>> _k_tests = {
{ "" , { }, },
{ " " , { 259, }, },
{ " " , { 1678, }, },
{ " " , { 268, }, },
{ "\t" , { 29871, 12, }, },
{ "\n" , { 29871, 13, }, },
{ "\t\n" , { 29871, 12, 13, }, },
{ "Hello world" , { 15043, 3186, }, },
{ " Hello world" , { 29871, 15043, 3186, }, },
{ "Hello World" , { 15043, 2787, }, },
{ " Hello World" , { 29871, 15043, 2787, }, },
{ " Hello World!" , { 29871, 15043, 2787, 29991, }, },
{ "Hello, world!" , { 15043, 29892, 3186, 29991, }, },
{ " Hello, world!" , { 29871, 15043, 29892, 3186, 29991, }, },
{ " this is 🦙.cpp" , { 29871, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, },
{ "w048 7tuijk dsdfhu" , { 281, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, },
{ "нещо на Български" , { 1538, 4851, 665, 1386, 29713, 1305, }, },
{ "កាន់តែពិសេសអាចខលចេញ" , { 29871, 31849, 31324, 31934, 228, 162, 142, 228, 161, 146, 228, 162, 133, 228, 161, 153, 228, 161, 186, 31708, 228, 162, 132, 31708, 228, 161, 165, 31324, 228, 161, 136, 228, 161, 132, 228, 161, 158, 228, 161, 136, 228, 162, 132, 228, 161, 140, }, },
{ "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", { 29871, 243, 162, 157, 131, 313, 8945, 29897, 29871, 243, 162, 155, 185, 30722, 243, 162, 143, 174, 30598, 313, 20787, 953, 3848, 275, 16125, 630, 29897, 29871, 31681, 313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, }, },
{ "Hello" , { 15043, }, },
{ " Hello" , { 29871, 15043, }, },
{ " Hello" , { 259, 15043, }, },
{ " Hello" , { 1678, 15043, }, },
{ " Hello" , { 268, 15043, }, },
{ " Hello\n Hello" , { 268, 15043, 13, 1678, 15043, }, },
{ " (" , { 29871, 313, }, },
};
return _k_tests;
}
int main(int argc, char **argv) {
if (argc < 2) {
fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]);
return 1;
}
const std::string fname = argv[1];
std::string fname_text;
if (argc > 2) {
fname_text = argv[2];
}
fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
llama_model * model;
llama_context * ctx;
llama_backend_init(false);
// load the vocab
{
auto mparams = llama_model_default_params();
mparams.vocab_only = true;
model = llama_load_model_from_file(fname.c_str(), mparams);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
return 1;
}
auto cparams = llama_context_default_params();
ctx = llama_new_context_with_model(model, cparams);
if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
llama_free_model(model);
return 1;
}
}
if (llama_vocab_type(model) != LLAMA_VOCAB_TYPE_SPM) {
fprintf(stderr, "%s : error: vocab type is not SPM\n", __func__);
llama_free_model(model);
llama_free(ctx);
return 2;
}
#ifdef _WIN32
// We need this for unicode console support
console::init(false, false);
atexit([]() { console::cleanup(); });
#endif
bool success = true;
for (const auto & test_kv : k_tests()) {
const std::vector<llama_token> res_bos = llama_tokenize(ctx, test_kv.first, true);
const std::vector<llama_token> res_nobos = llama_tokenize(ctx, test_kv.first, false);
printf("\n");
printf("src: '%s'\n", test_kv.first.c_str());
printf("res: '%s'\n", llama_detokenize_spm(ctx, res_bos).c_str());
printf("tok: ");
for (const auto & tok : res_bos) {
printf("%d ", tok);
}
printf("\n");
bool correct = res_nobos.size() == test_kv.second.size() && res_bos.size() == res_nobos.size() + 1 && res_bos[0] == 1;
for (int i = 0; i < (int) res_nobos.size() && correct; ++i) {
if (test_kv.second[i] != res_bos[i + 1]) {
correct = false;
}
if (test_kv.second[i] != res_nobos[i]) {
correct = false;
}
}
if (!correct) {
fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
llama_detokenize_spm(ctx, res_nobos).c_str(),
llama_detokenize_spm(ctx, test_kv.second).c_str());
fprintf(stderr, "%s : expected tokens: ", __func__);
for (const auto & t : test_kv.second) {
fprintf(stderr, "%6d, ", t);
}
fprintf(stderr, "\n");
fprintf(stderr, "%s : got tokens: ", __func__);
for (const auto & t : res_nobos) {
fprintf(stderr, "%6d, ", t);
}
fprintf(stderr, "\n");
success = false;
}
}
if (!fname_text.empty()) {
fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
std::string text;
{
std::ifstream ifs(fname_text);
if (!ifs) {
fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str());
return 1;
}
text = std::string(std::istreambuf_iterator<char>(ifs), std::istreambuf_iterator<char>());
}
fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
const std::vector<llama_token> res = llama_tokenize(ctx, text, true);
fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
{
const std::string fname_out = fname_text + ".tokcpp";
std::ofstream ofs(fname_out);
if (!ofs) {
fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
return 1;
}
for (const auto & tok : res) {
ofs << tok << " '" << llama_detokenize_spm(ctx, std::vector<int>{tok}) << "'" << std::endl;
}
}
fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
}
llama_free_model(model);
llama_free(ctx);
llama_backend_free();
return success ? 0 : 3;
}

View file

@@ -1,94 +0,0 @@
# tests with SPM tokenizer
import os
import sys
import argparse
from sentencepiece import SentencePieceProcessor
parser = argparse.ArgumentParser()
parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
parser.add_argument("--fname-tok", help="path to a text file to tokenize")
args = parser.parse_args()
dir_tokenizer = args.dir_tokenizer
tokenizer = SentencePieceProcessor(dir_tokenizer + '/tokenizer.model')
tests = [
"",
" ",
" ",
" ",
"\t",
"\n",
"\t\n",
"Hello world",
" Hello world",
"Hello World",
" Hello World",
" Hello World!",
"Hello, world!",
" Hello, world!",
" this is 🦙.cpp",
"w048 7tuijk dsdfhu",
"нещо на Български",
"កាន់តែពិសេសអាចខលចេញ",
"🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
"Hello",
" Hello",
" Hello",
" Hello",
" Hello",
" Hello\n Hello",
]
for text in tests:
print('text: ', text)
print('\nwith bos:')
print(tokenizer.encode(text, add_bos=True))
print(tokenizer.decode(tokenizer.encode(text, add_bos=True)))
print('\nwithout bos:')
print(tokenizer.encode(text, add_bos=False))
print(tokenizer.decode(tokenizer.encode(text, add_bos=False)))
print("'" + tokenizer.id_to_piece(15043) + "'") # '_Hello'
print("'" + tokenizer.id_to_piece(29871) + "'") # '_'
print("'" + tokenizer.decode([15043]) + "'") # 'Hello'
print("'" + tokenizer.decode([15043, 15043]) + "'") # 'Hello Hello'
print("'" + tokenizer.decode([29871, 15043]) + "'") # ' Hello'
print("'" + tokenizer.decode([29871, 15043, 29871, 15043]) + "'") # ' Hello Hello'
print("\n\ntests for C++:\n")
for text in tests:
res = tokenizer.encode(text, add_bos=False)
k = text.replace('\n', '\\n')
k = k.replace('\t', '\\t')
k = '"' + k + '"'
print("{ %-24s, { " % k, end='')
for x in res:
print("%7d," % x, end='')
print(" }, },")
print(tokenizer.encode('hello'))
print(tokenizer.encode('world'))
print(tokenizer.encode(' world'))
print(tokenizer.encode('hello world'))
fname_tok = args.fname_tok
if fname_tok:
print('tokenizing file: ', fname_tok)
fname_out = fname_tok + '.tok'
with open(fname_tok, 'r', encoding='utf-8') as f:
lines = f.readlines()
s = ''.join(lines)
res = tokenizer.encode(s, add_bos=True)
# write to file
with open(fname_out, 'w', encoding='utf-8') as f:
for x in res:
f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
print('len(res): ', len(res))
print('len(lines): ', len(lines))
print('results written to: ', fname_out)

View file

@@ -1,122 +0,0 @@
#include "llama.h"
#include "common.h"
#include "unicode.h"
#include "console.h"
#include <cassert>
#include <cstdio>
#include <cstring>
#include <string>
#include <codecvt>
#include <map>
#include <vector>
#include <locale>
int main(int argc, char **argv) {
if (argc < 2) {
fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
return 1;
}
const std::string fname = argv[1];
fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
llama_model * model;
llama_context * ctx;
llama_backend_init(false);
// load the vocab
{
auto mparams = llama_model_default_params();
mparams.vocab_only = true;
model = llama_load_model_from_file(fname.c_str(), mparams);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
return 1;
}
auto cparams = llama_context_default_params();
ctx = llama_new_context_with_model(model, cparams);
if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
llama_free_model(model);
return 1;
}
}
GGML_ASSERT(llama_vocab_type(model) == LLAMA_VOCAB_TYPE_BPE);
#ifdef _WIN32
// We need this for unicode console support
console::init(false, false);
atexit([]() { console::cleanup(); });
#endif
const int n_vocab = llama_n_vocab(model);
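// round-trip every token in the vocab: detokenize, re-tokenize, and verify the text survives unchanged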
for (int i = 0; i < n_vocab; ++i) {
std::string str = llama_detokenize_bpe(ctx, std::vector<int>(1, i));
try {
auto cps = codepoints_from_utf8(str);
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
std::string check = llama_detokenize_bpe(ctx, tokens);
if (check != str) {
fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
__func__, i, str.c_str(), str.length(), check.c_str(), check.length());
return 2;
}
}
catch (const std::invalid_argument &) {
fprintf(stderr, "%s : info: utf8 conversion %d '%s'\n", __func__, i, str.c_str());
}
}
for (uint32_t cp = 0x0000; cp < 0xffff; ++cp) {
// NOTE: these exceptions seem to be necessary, because the GPT2 tokenizer doesn't want to interfere with some ASCII control characters
if ((cp < 0x03 || cp > 0x05) && cp != 0x0b && cp != 0x11 && (cp < 0x13 || cp > 0x17) && cp != 0x19 && (cp < 0x1c || cp > 0x1e) && (cp < 0xd800 || cp > 0xdfff)) {
std::string str = " " + codepoint_to_utf8(cp);
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
std::string check = llama_detokenize_bpe(ctx, tokens);
if (str != check) {
fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
__func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
return 3;
}
}
}
// Restrict to assigned unicode planes
// for (uint32_t cp = 0x10000; cp < 0x0010ffff; ++cp) {
for (uint32_t cp = 0x10000; cp < 0x00040000; ++cp) {
std::string str = codepoint_to_utf8(cp);
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
std::string check = llama_detokenize_bpe(ctx, tokens);
if (str != check) {
fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
__func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
return 4;
}
}
for (uint32_t cp = 0x000e0000; cp < 0x0010ffff; ++cp) {
std::string str = codepoint_to_utf8(cp);
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
std::string check = llama_detokenize_bpe(ctx, tokens);
if (str != check) {
fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
__func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
return 4;
}
}
llama_free_model(model);
llama_free(ctx);
llama_backend_free();
return 0;
}

View file

@@ -1,104 +0,0 @@
#include "llama.h"
#include "common.h"
#include "unicode.h"
#include "console.h"
#include <cassert>
#include <cstdio>
#include <cstring>
#include <string>
#include <codecvt>
#include <map>
#include <vector>
#include <locale>
int main(int argc, char **argv) {
if (argc < 2) {
fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
return 1;
}
const std::string fname = argv[1];
fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
llama_model * model;
llama_context * ctx;
llama_backend_init(false);
// load the vocab
{
auto mparams = llama_model_default_params();
mparams.vocab_only = true;
model = llama_load_model_from_file(fname.c_str(), mparams);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
return 1;
}
auto cparams = llama_context_default_params();
ctx = llama_new_context_with_model(model, cparams);
if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
llama_free_model(model);
return 1;
}
}
GGML_ASSERT(llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
#ifdef _WIN32
// We need this for unicode console support
console::init(false, false);
atexit([]() { console::cleanup(); });
#endif
const int n_vocab = llama_n_vocab(model);
for (int i = 0; i < n_vocab; ++i) {
std::string str = llama_detokenize_spm(ctx, std::vector<int>(1, i));
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
std::string check = llama_detokenize_spm(ctx, tokens);
if (check != str) {
fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
__func__, i, str.c_str(), str.length(), check.c_str(), check.length());
return 2;
}
}
for (uint32_t cp = 0x0000; cp < 0xffff; ++cp) {
if (cp < 0xd800 || cp > 0xdfff) {
std::string str = codepoint_to_utf8(cp);
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
std::string check = llama_detokenize_spm(ctx, tokens);
if (cp != 9601 && str != check) {
fprintf(stderr, "%s : error: codepoint %d detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
__func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
return 3;
}
}
}
for (uint32_t cp = 0x10000; cp < 0x0010ffff; ++cp) {
std::string str = codepoint_to_utf8(cp);
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
std::string check = llama_detokenize_spm(ctx, tokens);
if (str != check) {
fprintf(stderr, "%s : error: codepoint %d detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
__func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
return 4;
}
}
llama_free_model(model);
llama_free(ctx);
llama_backend_free();
return 0;
}