commit 0e5165b605

52 changed files with 0 additions and 4783 deletions

@@ -1,22 +0,0 @@
node('x86_runner1'){                          // Run on the x86 runner that has the latest vector qemu, the latest vector gcc and all the necessary libraries
    stage('Cleanup'){
        cleanWs()                             // Clean the previous CI build from the workspace
    }
    stage('checkout repo'){
        retry(5){                             // Retry if cloning fails for some reason
            checkout scm                      // Clone the repo onto the runner
        }
    }
    stage('Compiling llama.cpp'){
        sh'''#!/bin/bash
        make RISCV=1 RISCV_CROSS_COMPILE=1    # Compile llama.cpp for RISC-V
        '''
    }
    stage('Running llama.cpp'){
        sh'''#!/bin/bash
        module load gnu-bin2/0.1              # Load the latest versions of vector qemu and vector gcc
        qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./llama-cli -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt   # Run llama.cpp on the vector qemu-riscv64
        cat llama_log.txt                     # Print the results
        '''
    }
}

@@ -1,36 +0,0 @@
ARG UBUNTU_VERSION=22.04

# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1

# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

FROM ${BASE_CUDA_DEV_CONTAINER} AS build

# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all

RUN apt-get update && \
    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1

COPY requirements.txt requirements.txt
COPY requirements requirements

RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt

WORKDIR /app

COPY . .

# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable CUDA
ENV GGML_CUDA=1
# Enable cURL
ENV LLAMA_CURL=1

RUN make -j$(nproc)

ENTRYPOINT ["/app/.devops/tools.sh"]

@@ -1,50 +0,0 @@
ARG UBUNTU_VERSION=22.04

# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6

# Target the ROCm build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

FROM ${BASE_ROCM_DEV_CONTAINER} AS build

# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH=\
    gfx803 \
    gfx900 \
    gfx906 \
    gfx908 \
    gfx90a \
    gfx1010 \
    gfx1030 \
    gfx1100 \
    gfx1101 \
    gfx1102

COPY requirements.txt requirements.txt
COPY requirements requirements

RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt

WORKDIR /app

COPY . .

# Set the GPU architecture targets
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV GGML_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++

# Enable cURL
ENV LLAMA_CURL=1
RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev

RUN make -j$(nproc)

ENTRYPOINT ["/app/.devops/tools.sh"]

@@ -1,25 +0,0 @@
ARG UBUNTU_VERSION=22.04

FROM ubuntu:$UBUNTU_VERSION AS build

RUN apt-get update && \
    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1

COPY requirements.txt requirements.txt
COPY requirements requirements

RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt

WORKDIR /app

COPY . .

ENV LLAMA_CURL=1


RUN make -j$(nproc)

ENV LC_ALL=C.utf8

ENTRYPOINT ["/app/.devops/tools.sh"]

@@ -1,35 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the CUDA runtime image
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

FROM ${BASE_CUDA_DEV_CONTAINER} AS build

# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all

RUN apt-get update && \
    apt-get install -y build-essential git

WORKDIR /app

COPY . .

# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable CUDA
ENV GGML_CUDA=1

RUN make -j$(nproc) llama-cli

FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime

RUN apt-get update && \
    apt-get install -y libgomp1

COPY --from=build /app/llama-cli /llama-cli

ENTRYPOINT [ "/llama-cli" ]

@@ -1,28 +0,0 @@
ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04

FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build

ARG GGML_SYCL_F16=OFF
RUN apt-get update && \
    apt-get install -y git

WORKDIR /app

COPY . .

RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
        echo "GGML_SYCL_F16 is set" && \
        export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
    fi && \
    echo "Building with static libs" && \
    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \
    ${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \
    cmake --build build --config Release --target llama-cli

FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime

COPY --from=build /app/build/bin/llama-cli /llama-cli

ENV LC_ALL=C.utf8

ENTRYPOINT [ "/llama-cli" ]

@@ -1,45 +0,0 @@
ARG UBUNTU_VERSION=22.04

# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6

# Target the ROCm build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

FROM ${BASE_ROCM_DEV_CONTAINER} AS build

# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH=\
    gfx803 \
    gfx900 \
    gfx906 \
    gfx908 \
    gfx90a \
    gfx1010 \
    gfx1030 \
    gfx1100 \
    gfx1101 \
    gfx1102

COPY requirements.txt requirements.txt
COPY requirements requirements

RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt

WORKDIR /app

COPY . .

# Set the GPU architecture targets
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV GGML_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++

RUN make -j$(nproc) llama-cli

ENTRYPOINT [ "/app/llama-cli" ]

@@ -1,27 +0,0 @@
ARG UBUNTU_VERSION=jammy

FROM ubuntu:$UBUNTU_VERSION AS build

# Install build tools
RUN apt update && apt install -y git build-essential cmake wget libgomp1

# Install Vulkan SDK
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
    apt update -y && \
    apt-get install -y vulkan-sdk

# Build it
WORKDIR /app
COPY . .
RUN cmake -B build -DGGML_VULKAN=1 && \
    cmake --build build --config Release --target llama-cli

# Clean up
WORKDIR /
RUN cp /app/build/bin/llama-cli /llama-cli && \
    rm -rf /app

ENV LC_ALL=C.utf8

ENTRYPOINT [ "/llama-cli" ]

@@ -1,23 +0,0 @@
ARG UBUNTU_VERSION=22.04

FROM ubuntu:$UBUNTU_VERSION AS build

RUN apt-get update && \
    apt-get install -y build-essential git

WORKDIR /app

COPY . .

RUN make -j$(nproc) llama-cli

FROM ubuntu:$UBUNTU_VERSION AS runtime

RUN apt-get update && \
    apt-get install -y libgomp1

COPY --from=build /app/llama-cli /llama-cli

ENV LC_ALL=C.utf8

ENTRYPOINT [ "/llama-cli" ]

@@ -1,83 +0,0 @@
# SRPM for building from source and packaging an RPM for RPM-based distros.
# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
# Built and maintained by John Boero - boeroboy@gmail.com
# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal

# Notes for llama.cpp:
# 1. Tags are currently based on hash - which will not sort asciibetically.
#    We need to declare standard versioning if people want to sort latest releases.
# 2. Builds for CUDA/OpenCL support are separate, with different dependencies.
# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc. installed.
#    Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
#    It is up to the user to install the correct vendor-specific support.

Name:           llama.cpp-cuda
Version:        %( date "+%%Y%%m%%d" )
Release:        1%{?dist}
Summary:        CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
License:        MIT
Source0:        https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
BuildRequires:  coreutils make gcc-c++ git cuda-toolkit
Requires:       cuda-toolkit
URL:            https://github.com/ggerganov/llama.cpp

%define debug_package %{nil}
%define source_date_epoch_from_changelog 0

%description
CPU inference for Meta's Llama 2 models using default options.

%prep
%setup -n llama.cpp-master

%build
make -j GGML_CUDA=1

%install
mkdir -p %{buildroot}%{_bindir}/
cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple

mkdir -p %{buildroot}/usr/lib/systemd/system
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacuda.service
[Unit]
Description=Llama.cpp server, CPU only (no GPU support in this build).
After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target

[Service]
Type=simple
EnvironmentFile=/etc/sysconfig/llama
ExecStart=/usr/bin/llama-cuda-server $LLAMA_ARGS
ExecReload=/bin/kill -s HUP $MAINPID
Restart=never

[Install]
WantedBy=default.target
EOF

mkdir -p %{buildroot}/etc/sysconfig
%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
EOF

%clean
rm -rf %{buildroot}
rm -rf %{_builddir}/*

%files
%{_bindir}/llama-cuda-cli
%{_bindir}/llama-cuda-server
%{_bindir}/llama-cuda-simple
/usr/lib/systemd/system/llamacuda.service
%config /etc/sysconfig/llama

%pre

%post

%preun
%postun

%changelog

@@ -1,85 +0,0 @@
# SRPM for building from source and packaging an RPM for RPM-based distros.
# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
# Built and maintained by John Boero - boeroboy@gmail.com
# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal

# Notes for llama.cpp:
# 1. Tags are currently based on hash - which will not sort asciibetically.
#    We need to declare standard versioning if people want to sort latest releases.
#    In the meantime, YYYYMMDD format will be used.
# 2. Builds for CUDA/OpenCL support are separate, with different dependencies.
# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc. installed.
#    Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
#    It is up to the user to install the correct vendor-specific support.

Name:           llama.cpp
Version:        %( date "+%%Y%%m%%d" )
Release:        1%{?dist}
Summary:        CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
License:        MIT
Source0:        https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
BuildRequires:  coreutils make gcc-c++ git libstdc++-devel
Requires:       libstdc++
URL:            https://github.com/ggerganov/llama.cpp

%define debug_package %{nil}
%define source_date_epoch_from_changelog 0

%description
CPU inference for Meta's Llama 2 models using default options.
Models are not included in this package and must be downloaded separately.

%prep
%setup -n llama.cpp-master

%build
make -j

%install
mkdir -p %{buildroot}%{_bindir}/
cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
cp -p llama-server %{buildroot}%{_bindir}/llama-server
cp -p llama-simple %{buildroot}%{_bindir}/llama-simple

mkdir -p %{buildroot}/usr/lib/systemd/system
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llama.service
[Unit]
Description=Llama.cpp server, CPU only (no GPU support in this build).
After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target

[Service]
Type=simple
EnvironmentFile=/etc/sysconfig/llama
ExecStart=/usr/bin/llama-server $LLAMA_ARGS
ExecReload=/bin/kill -s HUP $MAINPID
Restart=never

[Install]
WantedBy=default.target
EOF

mkdir -p %{buildroot}/etc/sysconfig
%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
EOF

%clean
rm -rf %{buildroot}
rm -rf %{_builddir}/*

%files
%{_bindir}/llama-cli
%{_bindir}/llama-server
%{_bindir}/llama-simple
/usr/lib/systemd/system/llama.service
%config /etc/sysconfig/llama

%pre

%post

%preun
%postun

%changelog

@@ -1,39 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the CUDA runtime image
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

FROM ${BASE_CUDA_DEV_CONTAINER} AS build

# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all

RUN apt-get update && \
    apt-get install -y build-essential git libcurl4-openssl-dev

WORKDIR /app

COPY . .

# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable CUDA
ENV GGML_CUDA=1
# Enable cURL
ENV LLAMA_CURL=1

RUN make -j$(nproc) llama-server

FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime

RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev libgomp1 curl

COPY --from=build /app/llama-server /llama-server

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/llama-server" ]

@@ -1,32 +0,0 @@
ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04

FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build

ARG GGML_SYCL_F16=OFF
RUN apt-get update && \
    apt-get install -y git libcurl4-openssl-dev

WORKDIR /app

COPY . .

RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
        echo "GGML_SYCL_F16 is set" && \
        export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
    fi && \
    echo "Building with dynamic libs" && \
    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
    cmake --build build --config Release --target llama-server

FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime

RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev curl

COPY --from=build /app/build/bin/llama-server /llama-server

ENV LC_ALL=C.utf8

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/llama-server" ]

@@ -1,52 +0,0 @@
ARG UBUNTU_VERSION=22.04

# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6

# Target the ROCm build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

FROM ${BASE_ROCM_DEV_CONTAINER} AS build

# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH=\
    gfx803 \
    gfx900 \
    gfx906 \
    gfx908 \
    gfx90a \
    gfx1010 \
    gfx1030 \
    gfx1100 \
    gfx1101 \
    gfx1102

COPY requirements.txt requirements.txt
COPY requirements requirements

RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt

WORKDIR /app

COPY . .

# Set the GPU architecture targets
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV GGML_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++

# Enable cURL
ENV LLAMA_CURL=1
RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev curl

RUN make -j$(nproc) llama-server

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]

@@ -1,29 +0,0 @@
ARG UBUNTU_VERSION=jammy

FROM ubuntu:$UBUNTU_VERSION AS build

# Install build tools
RUN apt update && apt install -y git build-essential cmake wget

# Install Vulkan SDK and cURL
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
    apt update -y && \
    apt-get install -y vulkan-sdk libcurl4-openssl-dev curl

# Build it
WORKDIR /app
COPY . .
RUN cmake -B build -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
    cmake --build build --config Release --target llama-server

# Clean up
WORKDIR /
RUN cp /app/build/bin/llama-server /llama-server && \
    rm -rf /app

ENV LC_ALL=C.utf8

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/llama-server" ]

@@ -1,27 +0,0 @@
ARG UBUNTU_VERSION=22.04

FROM ubuntu:$UBUNTU_VERSION AS build

RUN apt-get update && \
    apt-get install -y build-essential git libcurl4-openssl-dev curl

WORKDIR /app

COPY . .

ENV LLAMA_CURL=1

RUN make -j$(nproc) llama-server

FROM ubuntu:$UBUNTU_VERSION AS runtime

RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev libgomp1

COPY --from=build /app/llama-server /llama-server

ENV LC_ALL=C.utf8

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/llama-server" ]

@@ -1,21 +0,0 @@
{
  perSystem =
    { config, lib, ... }:
    {
      apps =
        let
          inherit (config.packages) default;
          binaries = [
            "llama-cli"
            "llama-embedding"
            "llama-server"
            "llama-quantize"
          ];
          mkApp = name: {
            type = "app";
            program = "${default}/bin/${name}";
          };
        in
        lib.genAttrs binaries mkApp;
    };
}

@@ -1,13 +0,0 @@
{
  perSystem =
    { config, lib, ... }:
    {
      devShells =
        lib.concatMapAttrs
          (name: package: {
            ${name} = package.passthru.shell;
            ${name + "-extra"} = package.passthru.shell-extra;
          })
          config.packages;
    };
}

@@ -1,37 +0,0 @@
{
  lib,
  dockerTools,
  buildEnv,
  llama-cpp,
  interactive ? true,
  coreutils,
}:

# A tar that can be fed into `docker load`:
#
#   $ nix build .#llamaPackages.docker
#   $ docker load < result

# For details and variations cf.
# - https://nixos.org/manual/nixpkgs/unstable/#ssec-pkgs-dockerTools-buildLayeredImage
# - https://discourse.nixos.org/t/a-faster-dockertools-buildimage-prototype/16922
# - https://nixery.dev/

# Approximate (compressed) sizes, at the time of writing, are:
#
# .#llamaPackages.docker: 125M;
# .#llamaPackagesCuda.docker: 537M;
# .#legacyPackages.aarch64-linux.llamaPackagesXavier.docker: 415M.

dockerTools.buildLayeredImage {
  name = llama-cpp.pname;
  tag = "latest";

  contents =
    [ llama-cpp ]
    ++ lib.optionals interactive [
      coreutils
      dockerTools.binSh
      dockerTools.caCertificates
    ];
}

@@ -1,39 +0,0 @@
{ inputs, ... }:
{
  perSystem =
    {
      config,
      system,
      lib,
      pkgsCuda,
      ...
    }:
    {
      legacyPackages =
        let
          caps.llamaPackagesXavier = "7.2";
          caps.llamaPackagesOrin = "8.7";
          caps.llamaPackagesTX2 = "6.2";
          caps.llamaPackagesNano = "5.3";

          pkgsFor =
            cap:
            import inputs.nixpkgs {
              inherit system;
              config = {
                cudaSupport = true;
                cudaCapabilities = [ cap ];
                cudaEnableForwardCompat = false;
                inherit (pkgsCuda.config) allowUnfreePredicate;
              };
            };
        in
        builtins.mapAttrs (name: cap: (pkgsFor cap).callPackage ./scope.nix { }) caps;

      packages = lib.optionalAttrs (system == "aarch64-linux") {
        jetson-xavier = config.legacyPackages.llamaPackagesXavier.llama-cpp;
        jetson-orin = config.legacyPackages.llamaPackagesOrin.llama-cpp;
        jetson-nano = config.legacyPackages.llamaPackagesNano.llama-cpp;
      };
    };
}

@@ -1,47 +0,0 @@
{ inputs, ... }:
{
  # The _module.args definitions are passed on to modules as arguments. E.g.
  # the module `{ pkgs ... }: { /* config */ }` implicitly uses
  # `_module.args.pkgs` (defined in this case by flake-parts).
  perSystem =
    { system, ... }:
    {
      _module.args = {
        # Note: bringing up https://zimbatm.com/notes/1000-instances-of-nixpkgs
        # again, the below creates several nixpkgs instances which the
        # flake-centric CLI will be forced to evaluate e.g. on `nix flake show`.
        #
        # This is currently "slow" and "expensive", on a certain scale.
        # This also isn't "right" in that this hinders dependency injection at
        # the level of flake inputs. This might get removed in the foreseeable
        # future.
        #
        # Note that you can use these expressions without Nix
        # (`pkgs.callPackage ./devops/nix/scope.nix { }` is the entry point).

        pkgsCuda = import inputs.nixpkgs {
          inherit system;
          # Ensure dependencies use CUDA consistently (e.g. that openmpi, ucc,
          # and ucx are built with CUDA support)
          config.cudaSupport = true;
          config.allowUnfreePredicate =
            p:
            builtins.all
              (
                license:
                license.free
                || builtins.elem license.shortName [
                  "CUDA EULA"
                  "cuDNN EULA"
                ]
              )
              (p.meta.licenses or [ p.meta.license ]);
        };
        # Ensure dependencies use ROCm consistently
        pkgsRocm = import inputs.nixpkgs {
          inherit system;
          config.rocmSupport = true;
        };
      };
    };
}

@@ -1,331 +0,0 @@
{
  lib,
  glibc,
  config,
  stdenv,
  mkShell,
  runCommand,
  cmake,
  ninja,
  pkg-config,
  git,
  python3,
  mpi,
  blas,
  cudaPackages,
  darwin,
  rocmPackages,
  vulkan-headers,
  vulkan-loader,
  curl,
  shaderc,
  useBlas ? builtins.all (x: !x) [
    useCuda
    useMetalKit
    useRocm
    useVulkan
  ] && blas.meta.available,
  useCuda ? config.cudaSupport,
  useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin,
  useMpi ? false, # Increases the runtime closure size by ~700M
  useRocm ? config.rocmSupport,
  enableCurl ? true,
  useVulkan ? false,
  llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake

  # It's necessary to consistently use backendStdenv when building with CUDA support,
  # otherwise we get libstdc++ errors downstream.
  effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
  enableStatic ? effectiveStdenv.hostPlatform.isStatic,
  precompileMetalShaders ? false
}@inputs:

let
  inherit (lib)
    cmakeBool
    cmakeFeature
    optionals
    strings
    versionOlder
    ;

  stdenv = throw "Use effectiveStdenv instead";

  suffices =
    lib.optionals useBlas [ "BLAS" ]
    ++ lib.optionals useCuda [ "CUDA" ]
    ++ lib.optionals useMetalKit [ "MetalKit" ]
    ++ lib.optionals useMpi [ "MPI" ]
    ++ lib.optionals useRocm [ "ROCm" ]
    ++ lib.optionals useVulkan [ "Vulkan" ];

  pnameSuffix =
    strings.optionalString (suffices != [ ])
      "-${strings.concatMapStringsSep "-" strings.toLower suffices}";
  descriptionSuffix =
    strings.optionalString (suffices != [ ])
      ", accelerated with ${strings.concatStringsSep ", " suffices}";

  executableSuffix = effectiveStdenv.hostPlatform.extensions.executable;

  # TODO: package the Python in this repository in a Nix-like way.
  # It'd be nice to migrate to buildPythonPackage, as well as ensure this repo
  # is PEP 517-compatible, and ensure the correct .dist-info is generated.
  # https://peps.python.org/pep-0517/
  #
  # TODO: Package up each Python script or service appropriately, by making
  # them into "entrypoints"
  llama-python = python3.withPackages (
    ps: [
      ps.numpy
      ps.sentencepiece
    ]
  );

  # TODO(Green-Sky): find a better way to opt into the heavy ML Python runtime
  llama-python-extra = python3.withPackages (
    ps: [
      ps.numpy
      ps.sentencepiece
      ps.tiktoken
      ps.torchWithoutCuda
      ps.transformers

      # server bench
      ps.matplotlib

      # server tests
      ps.openai
      ps.behave
      ps.prometheus-client

      # for examples/pydantic-models-to-grammar-examples.py
      ps.docstring-parser
      ps.pydantic

      # for scripts/compare-llama-bench.py
      ps.gitpython
      ps.tabulate
    ]
  );

  xcrunHost = runCommand "xcrunHost" {} ''
    mkdir -p $out/bin
    ln -s /usr/bin/xcrun $out/bin
  '';

  # apple_sdk is supposed to choose sane defaults, no need to handle isAarch64
  # separately
  darwinBuildInputs =
    with darwin.apple_sdk.frameworks;
    [
      Accelerate
      CoreVideo
      CoreGraphics
    ]
    ++ optionals useMetalKit [ MetalKit ];

  cudaBuildInputs = with cudaPackages; [
    cuda_cccl.dev # <nv/target>

    # A temporary hack for reducing the closure size, remove once cudaPackages
    # have stopped using lndir: https://github.com/NixOS/nixpkgs/issues/271792
    cuda_cudart.dev
    cuda_cudart.lib
    cuda_cudart.static
    libcublas.dev
    libcublas.lib
    libcublas.static
  ];

  rocmBuildInputs = with rocmPackages; [
    clr
    hipblas
    rocblas
  ];

  vulkanBuildInputs = [
    vulkan-headers
    vulkan-loader
    shaderc
  ];
in

effectiveStdenv.mkDerivation (
  finalAttrs: {
    pname = "llama-cpp${pnameSuffix}";
    version = llamaVersion;

    # Note: none of the files discarded here are visible in the sandbox or
    # affect the output hash. This also means they can be modified without
    # triggering a rebuild.
    src = lib.cleanSourceWith {
      filter =
        name: type:
        let
          noneOf = builtins.all (x: !x);
          baseName = baseNameOf name;
        in
        noneOf [
          (lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
          (lib.hasSuffix ".md" name) # Ignore *.md changes when computing outPaths
          (lib.hasPrefix "." baseName) # Skip hidden files and directories
          (baseName == "flake.lock")
        ];
      src = lib.cleanSource ../../.;
    };

    postPatch = ''
      substituteInPlace ./ggml/src/ggml-metal.m \
        --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
      substituteInPlace ./ggml/src/ggml-metal.m \
        --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
    '';

    # With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015,
    # `default.metallib` may be compiled with the Metal compiler from Xcode,
    # and we need to escape the sandbox on macOS to access the Metal compiler.
    # `xcrun` is used to find the path of the Metal compiler, which is variable
    # and not on $PATH;
    # see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion.
    __noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders;

    nativeBuildInputs =
      [
        cmake
        ninja
        pkg-config
        git
      ]
      ++ optionals useCuda [
        cudaPackages.cuda_nvcc

        # TODO: Replace with autoAddDriverRunpath
        # once https://github.com/NixOS/nixpkgs/pull/275241 has been merged
        cudaPackages.autoAddOpenGLRunpathHook
      ]
      ++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [
        glibc.static
      ] ++ optionals (effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders) [
        xcrunHost
      ];

    buildInputs =
      optionals effectiveStdenv.isDarwin darwinBuildInputs
      ++ optionals useCuda cudaBuildInputs
      ++ optionals useMpi [ mpi ]
      ++ optionals useRocm rocmBuildInputs
      ++ optionals useBlas [ blas ]
      ++ optionals useVulkan vulkanBuildInputs
      ++ optionals enableCurl [ curl ];

    cmakeFlags =
      [
        (cmakeBool "LLAMA_BUILD_SERVER" true)
        (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
        (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
        (cmakeBool "LLAMA_CURL" enableCurl)
        (cmakeBool "GGML_NATIVE" false)
        (cmakeBool "GGML_BLAS" useBlas)
        (cmakeBool "GGML_CUDA" useCuda)
        (cmakeBool "GGML_HIPBLAS" useRocm)
        (cmakeBool "GGML_METAL" useMetalKit)
        (cmakeBool "GGML_VULKAN" useVulkan)
        (cmakeBool "GGML_STATIC" enableStatic)
      ]
      ++ optionals useCuda [
        (
          with cudaPackages.flags;
          cmakeFeature "CMAKE_CUDA_ARCHITECTURES" (
            builtins.concatStringsSep ";" (map dropDot cudaCapabilities)
          )
        )
      ]
      ++ optionals useRocm [
        (cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
        (cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets))
      ]
      ++ optionals useMetalKit [
        (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
        (cmakeBool "GGML_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
      ];

    # Environment variables needed for ROCm
    env = optionals useRocm {
      ROCM_PATH = "${rocmPackages.clr}";
      HIP_DEVICE_LIB_PATH = "${rocmPackages.rocm-device-libs}/amdgcn/bitcode";
    };

    # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
    # if they haven't been added yet.
    postInstall = ''
      mkdir -p $out/include
      cp $src/include/llama.h $out/include/
    '';

    # Define the shells here, but don't add in the inputsFrom to avoid recursion.
    passthru = {
      inherit
        useBlas
        useCuda
        useMetalKit
        useMpi
        useRocm
        useVulkan
        ;

      shell = mkShell {
        name = "shell-${finalAttrs.finalPackage.name}";
        description = "contains numpy and sentencepiece";
        buildInputs = [ llama-python ];
        inputsFrom = [ finalAttrs.finalPackage ];
        shellHook = ''
          addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib effectiveStdenv.cc.cc}/lib"
        '';
      };

      shell-extra = mkShell {
        name = "shell-extra-${finalAttrs.finalPackage.name}";
        description = "contains numpy, sentencepiece, torchWithoutCuda, and transformers";
        buildInputs = [ llama-python-extra ];
        inputsFrom = [ finalAttrs.finalPackage ];
      };
    };

    meta = {
      # Configurations we don't want even the CI to evaluate. Results in the
      # "unsupported platform" messages. This is mostly a no-op, because
      # cudaPackages would've refused to evaluate anyway.
      badPlatforms = optionals useCuda lib.platforms.darwin;

      # Configurations that are known to result in build failures. Can be
      # overridden by importing Nixpkgs with `allowBroken = true`.
      broken = (useMetalKit && !effectiveStdenv.isDarwin);

      description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
      homepage = "https://github.com/ggerganov/llama.cpp/";
      license = lib.licenses.mit;

      # Accommodates `nix run` and `lib.getExe`
      mainProgram = "llama-cli";

      # These people might respond, on a best-effort basis, if you ping them
      # in case of Nix-specific regressions or for reviewing Nix-specific PRs.
      # Consider adding yourself to this list if you want to ensure this flake
      # stays maintained and you're willing to invest your time. Do not add
      # other people without their consent. Consider removing people after
      # they've been unreachable for long periods of time.

      # Note that lib.maintainers is defined in Nixpkgs, but you may just add
      # an attrset following the same format as in
      # https://github.com/NixOS/nixpkgs/blob/f36a80e54da29775c78d7eff0e628c2b4e34d1d7/maintainers/maintainer-list.nix
      maintainers = with lib.maintainers; [
        philiptaron
        SomeoneSerge
      ];

      # Extend `badPlatforms` instead
      platforms = lib.platforms.all;
    };
  }
)

@@ -1,19 +0,0 @@
{
  lib,
  newScope,
  llamaVersion ? "0.0.0",
}:

# We're using `makeScope` instead of just writing out an attrset
# because it allows users to apply overlays later using `overrideScope'`.
# Cf. https://noogle.dev/f/lib/makeScope

lib.makeScope newScope (
  self: {
    inherit llamaVersion;
    llama-cpp = self.callPackage ./package.nix { };
    docker = self.callPackage ./docker.nix { };
    docker-min = self.callPackage ./docker.nix { interactive = false; };
    sif = self.callPackage ./sif.nix { };
  }
)

@@ -1,27 +0,0 @@
{
  lib,
  singularity-tools,
  llama-cpp,
  bashInteractive,
  interactive ? false,
}:

let
  optionalInt = cond: x: if cond then x else 0;
in
singularity-tools.buildImage rec {
  inherit (llama-cpp) name;
  contents = [ llama-cpp ] ++ lib.optionals interactive [ bashInteractive ];

  # These are excessive (but safe) for most variants. Building singularity
  # images requires superuser privileges, so we build them inside a VM in a
  # writable image of pre-determined size.
  #
  # ROCm is currently affected by https://github.com/NixOS/nixpkgs/issues/276846
  #
  # Expected image sizes:
  # - cpu/blas: 150M,
  # - cuda, all gencodes: 560M,
  diskSize = 4096 + optionalInt llama-cpp.useRocm 16384;
  memSize = diskSize;
}

@@ -1,41 +0,0 @@
#!/bin/bash
set -e

# Read the first argument into a variable
arg1="$1"

# Shift the arguments to remove the first one
shift

if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
    python3 ./convert_hf_to_gguf.py "$@"
elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
    ./llama-quantize "$@"
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
    ./llama-cli "$@"
elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
    echo "Converting PTH to GGML..."
    for i in `ls $1/$2/ggml-model-f16.bin*`; do
        if [ -f "${i/f16/q4_0}" ]; then
            echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
        else
            echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
            ./llama-quantize "$i" "${i/f16/q4_0}" q4_0
        fi
    done
elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
    ./llama-server "$@"
else
    echo "Unknown command: $arg1"
    echo "Available commands: "
    echo "  --run (-r): Run a model previously converted into ggml"
    echo "              ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
    echo "  --convert (-c): Convert a llama model into ggml"
    echo "              ex: --outtype f16 \"/models/7B/\" "
    echo "  --quantize (-q): Optimize with quantization process ggml"
    echo "              ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
    echo "  --all-in-one (-a): Execute --convert & --quantize"
    echo "              ex: \"/models/\" 7B"
    echo "  --server (-s): Run a model on the server"
    echo "              ex: -m /models/7B/ggml-model-q4_0.bin -c 2048 -ngl 43 -mg 1 --port 8080"
fi

.github/ISSUE_TEMPLATE/01-bug-low.yml (vendored)
@@ -1,50 +0,0 @@
name: Low Severity Bugs
description: Used to report low severity bugs in llama.cpp (e.g. cosmetic issues, non-critical UI glitches)
title: "Bug: "
labels: ["bug-unconfirmed", "low severity"]
body:
  - type: markdown
    attributes:
      value: |
        Thanks for taking the time to fill out this bug report!
        Please include information about your system, the steps to reproduce the bug,
        and the version of llama.cpp that you are using.
        If possible, please provide a minimal code example that reproduces the bug.
  - type: textarea
    id: what-happened
    attributes:
      label: What happened?
      description: Also tell us, what did you expect to happen?
      placeholder: Tell us what you see!
    validations:
      required: true
  - type: textarea
    id: version
    attributes:
      label: Name and Version
      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
      placeholder: |
        $./llama-cli --version
        version: 2999 (42b4109e)
        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
    validations:
      required: true
  - type: dropdown
    id: operating-system
    attributes:
      label: What operating system are you seeing the problem on?
      multiple: true
      options:
        - Linux
        - Mac
        - Windows
        - BSD
        - Other? (Please let us know in description)
    validations:
      required: false
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
      render: shell

.github/ISSUE_TEMPLATE/02-bug-medium.yml (vendored)
@@ -1,50 +0,0 @@
name: Medium Severity Bug
description: Used to report medium severity bugs in llama.cpp (e.g. malfunctioning features that leave the software generally still usable)
title: "Bug: "
labels: ["bug-unconfirmed", "medium severity"]
body:
  - type: markdown
    attributes:
      value: |
        Thanks for taking the time to fill out this bug report!
        Please include information about your system, the steps to reproduce the bug,
        and the version of llama.cpp that you are using.
        If possible, please provide a minimal code example that reproduces the bug.
  - type: textarea
    id: what-happened
    attributes:
      label: What happened?
      description: Also tell us, what did you expect to happen?
      placeholder: Tell us what you see!
    validations:
      required: true
  - type: textarea
    id: version
    attributes:
      label: Name and Version
      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
      placeholder: |
        $./llama-cli --version
        version: 2999 (42b4109e)
        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
    validations:
      required: true
  - type: dropdown
    id: operating-system
    attributes:
      label: What operating system are you seeing the problem on?
      multiple: true
      options:
        - Linux
        - Mac
        - Windows
        - BSD
        - Other? (Please let us know in description)
    validations:
      required: false
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
      render: shell

.github/ISSUE_TEMPLATE/03-bug-high.yml (vendored)
@@ -1,50 +0,0 @@
name: High Severity Bug
description: Used to report high severity bugs in llama.cpp (e.g. malfunctioning features hindering important common workflows)
title: "Bug: "
labels: ["bug-unconfirmed", "high severity"]
body:
  - type: markdown
    attributes:
      value: |
        Thanks for taking the time to fill out this bug report!
        Please include information about your system, the steps to reproduce the bug,
        and the version of llama.cpp that you are using.
        If possible, please provide a minimal code example that reproduces the bug.
  - type: textarea
    id: what-happened
    attributes:
      label: What happened?
      description: Also tell us, what did you expect to happen?
      placeholder: Tell us what you see!
    validations:
      required: true
  - type: textarea
    id: version
    attributes:
      label: Name and Version
      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
      placeholder: |
        $./llama-cli --version
        version: 2999 (42b4109e)
        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
    validations:
      required: true
  - type: dropdown
    id: operating-system
    attributes:
      label: What operating system are you seeing the problem on?
      multiple: true
      options:
        - Linux
        - Mac
        - Windows
        - BSD
        - Other? (Please let us know in description)
    validations:
      required: false
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
      render: shell

.github/ISSUE_TEMPLATE/04-bug-critical.yml (vendored)
@@ -1,50 +0,0 @@
name: Critical Severity Bug
description: Used to report critical severity bugs in llama.cpp (e.g. crashes, corruption, data loss)
title: "Bug: "
labels: ["bug-unconfirmed", "critical severity"]
body:
  - type: markdown
    attributes:
      value: |
        Thanks for taking the time to fill out this bug report!
        Please include information about your system, the steps to reproduce the bug,
        and the version of llama.cpp that you are using.
        If possible, please provide a minimal code example that reproduces the bug.
  - type: textarea
    id: what-happened
    attributes:
      label: What happened?
      description: Also tell us, what did you expect to happen?
      placeholder: Tell us what you see!
    validations:
      required: true
  - type: textarea
    id: version
    attributes:
      label: Name and Version
      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
      placeholder: |
        $./llama-cli --version
        version: 2999 (42b4109e)
        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
    validations:
      required: true
  - type: dropdown
    id: operating-system
    attributes:
      label: What operating system are you seeing the problem on?
      multiple: true
      options:
        - Linux
        - Mac
        - Windows
        - BSD
        - Other? (Please let us know in description)
    validations:
      required: false
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
      render: shell

.github/ISSUE_TEMPLATE/05-enhancement.yml (vendored)
@@ -1,51 +0,0 @@
name: Enhancement
description: Used to request enhancements for llama.cpp
title: "Feature Request: "
labels: ["enhancement"]
body:
  - type: markdown
    attributes:
      value: |
        [Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed need to be implemented.](https://github.com/ggerganov/llama.cpp/discussions/categories/ideas)

  - type: checkboxes
    id: prerequisites
    attributes:
      label: Prerequisites
      description: Please confirm the following before submitting your enhancement request.
      options:
        - label: I am running the latest code. Mention the version if possible as well.
          required: true
        - label: I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
          required: true
        - label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
          required: true
        - label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new and useful enhancement to share.
          required: true

  - type: textarea
    id: feature-description
    attributes:
      label: Feature Description
      description: Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
      placeholder: Detailed description of the enhancement
    validations:
      required: true

  - type: textarea
    id: motivation
    attributes:
      label: Motivation
      description: Please provide a detailed written description of the reasons why this feature is necessary and how it is useful to `llama.cpp` users.
      placeholder: Explanation of why this feature is needed and its benefits
    validations:
      required: true

  - type: textarea
    id: possible-implementation
    attributes:
      label: Possible Implementation
      description: If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
      placeholder: Detailed description of potential implementation
    validations:
      required: false

.github/ISSUE_TEMPLATE/06-research.yml (vendored)
@@ -1,52 +0,0 @@
name: Research
description: Track a new technical research area
title: "Research: "
labels: ["research 🔬"]
body:
  - type: markdown
    attributes:
      value: |
        Don't forget to check for any [duplicate research issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)

  - type: checkboxes
    id: research-stage
    attributes:
      label: Research Stage
      description: Track the general state of this research ticket
      options:
        - label: Background Research (Let's try to avoid reinventing the wheel)
        - label: Hypothesis Formed (How do you think this will work, and what will its effect be?)
        - label: Strategy / Implementation Forming
        - label: Analysis of results
        - label: Debrief / Documentation (So people in the future can learn from us)

  - type: textarea
    id: background
    attributes:
      label: Previous existing literature and research
      description: What's the current state of the art, and what's the motivation for this research?

  - type: textarea
    id: hypothesis
    attributes:
      label: Hypothesis
      description: How do you think this will work, and what will its effect be?

  - type: textarea
    id: implementation
    attributes:
      label: Implementation
      description: Got an approach? e.g. a PR ready to go?

  - type: textarea
    id: analysis
    attributes:
      label: Analysis
      description: How does the proposed implementation behave?

  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
      render: shell

28 .github/ISSUE_TEMPLATE/07-refactor.yml vendored

@@ -1,28 +0,0 @@
name: Refactor (Maintainers)
description: Used to track refactoring opportunities
title: "Refactor: "
labels: ["refactor"]
body:
  - type: markdown
    attributes:
      value: |
        Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
        Also you may want to check [Pull request refactor label as well](https://github.com/ggerganov/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.

  - type: textarea
    id: background-description
    attributes:
      label: Background Description
      description: Please provide a detailed written description of the pain points you are trying to solve.
      placeholder: Detailed description behind your motivation to request refactor
    validations:
      required: true

  - type: textarea
    id: possible-approaches
    attributes:
      label: Possible Refactor Approaches
      description: If you have some idea of possible approaches to solve this problem. You may want to make it a todo list.
      placeholder: Your idea of possible refactoring opportunity/approaches
    validations:
      required: false
11 .github/ISSUE_TEMPLATE/config.yml vendored

@@ -1,11 +0,0 @@
blank_issues_enabled: true
contact_links:
  - name: Got an idea?
    url: https://github.com/ggerganov/llama.cpp/discussions/categories/ideas
    about: Pop it there. It may then become an enhancement ticket.
  - name: Got a question?
    url: https://github.com/ggerganov/llama.cpp/discussions/categories/q-a
    about: Ask a question there!
  - name: Want to contribute?
    url: https://github.com/ggerganov/llama.cpp/wiki/contribute
    about: Head to the contribution guide page of the wiki for areas you can help with
91 .github/labeler.yml vendored

@@ -1,91 +0,0 @@
# https://github.com/actions/labeler
Kompute:
  - changed-files:
    - any-glob-to-any-file:
      - ggml/include/ggml-kompute.h
      - ggml/src/ggml-kompute.cpp
      - README-kompute.md
Apple Metal:
  - changed-files:
    - any-glob-to-any-file:
      - ggml/include/ggml-metal.h
      - ggml/src/ggml-metal.cpp
      - README-metal.md
SYCL:
  - changed-files:
    - any-glob-to-any-file:
      - ggml/include/ggml-sycl.h
      - ggml/src/ggml-sycl.cpp
      - ggml/src/ggml-sycl/**
      - docs/backend/SYCL.md
      - examples/sycl/**
Nvidia GPU:
  - changed-files:
    - any-glob-to-any-file:
      - ggml/include/ggml-cuda.h
      - ggml/src/ggml-cuda/**
Vulkan:
  - changed-files:
    - any-glob-to-any-file:
      - ggml/ggml_vk_generate_shaders.py
      - ggml/src/ggml-vulkan*
documentation:
  - changed-files:
    - any-glob-to-any-file:
      - docs/**
      - media/**
testing:
  - changed-files:
    - any-glob-to-any-file:
      - tests/**
build:
  - changed-files:
    - any-glob-to-any-file:
      - cmake/**
      - CMakeLists.txt
      - CMakePresets.json
examples:
  - changed-files:
    - any-glob-to-any-file: examples/**
devops:
  - changed-files:
    - any-glob-to-any-file:
      - .devops/**
      - .github/**
      - ci/**
python:
  - changed-files:
    - any-glob-to-any-file:
      - "**/*.py"
      - requirements/**
      - gguf-py/**
      - .flake8
script:
  - changed-files:
    - any-glob-to-any-file:
      - scripts/**
android:
  - changed-files:
    - any-glob-to-any-file:
      - examples/llama.android/**
server:
  - changed-files:
    - any-glob-to-any-file:
      - examples/server/**
ggml:
  - changed-files:
    - any-glob-to-any-file:
      - ggml/include/ggml*.h
      - ggml/src/ggml*.c
      - ggml/src/ggml*.cpp
      - ggml/src/ggml*.h
      - ggml-cuda/**
nix:
  - changed-files:
    - any-glob-to-any-file:
      - "**/*.nix"
      - .github/workflows/nix-*.yml
      - .devops/nix/nixpkgs-instances.nix
embedding:
  - changed-files:
    - any-glob-to-any-file: examples/embedding/
7 .github/pull_request_template.md vendored

@@ -1,7 +0,0 @@


- [x] I have read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md)
- Self-reported review complexity:
  - [ ] Low
  - [ ] Medium
  - [ ] High
310 .github/workflows/bench.yml vendored

@@ -1,310 +0,0 @@
# Benchmark
name: Benchmark

on:
  workflow_dispatch:
    inputs:
      gpu-series:
        description: 'Azure GPU series to run with'
        required: true
        type: choice
        options:
          - Standard_NC4as_T4_v3
          - Standard_NC24ads_A100_v4
          - Standard_NC80adis_H100_v5
      sha:
        description: 'Commit SHA1 to build'
        required: false
        type: string
      duration:
        description: 'Duration of the bench'
        type: string
        default: 10m

  push:
    branches:
      - master
    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
  pull_request_target:
    types: [opened, synchronize, reopened]
    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
  schedule:
    - cron: '04 2 * * *'

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }}
  cancel-in-progress: true

jobs:
  bench-server-baseline:
    runs-on: Standard_NC4as_T4_v3
    env:
      RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it
      N_USERS: 8
      DURATION: 10m

    strategy:
      matrix:
        model: [phi-2]
        ftype: [q4_0, q8_0, f16]
        include:
          - model: phi-2
            ftype: q4_0
            pr_comment_enabled: "true"

    if: |
      inputs.gpu-series == 'Standard_NC4as_T4_v3'
      || (
        github.event_name == 'schedule'
        && github.ref_name == 'master'
        && github.repository_owner == 'ggerganov'
      )
      || github.event_name == 'pull_request_target'
      || (
        github.event_name == 'push'
        && github.event.ref == 'refs/heads/master'
        && github.repository_owner == 'ggerganov'
      )
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

      - name: Install python env
        id: pipenv
        run: |
          cd examples/server/bench
          python3 -m venv venv
          source venv/bin/activate
          pip install -r requirements.txt

      - name: Prometheus
        id: install_prometheus
        run: |
          wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
          tar xzf prometheus*.tar.gz --strip-components=1
          ./prometheus --config.file=examples/server/bench/prometheus.yml &
          while ! nc -z localhost 9090; do
            sleep 0.1
          done

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version: '1.21'

      - name: Install k6 and xk6-sse
        id: k6_installation
        run: |
          cd examples/server/bench
          go install go.k6.io/xk6/cmd/xk6@latest
          xk6 build master \
            --with github.com/phymbert/xk6-sse

      - name: Build
        id: cmake_build
        run: |
          set -eux
          cmake -B build \
              -DGGML_NATIVE=OFF \
              -DLLAMA_BUILD_SERVER=ON \
              -DLLAMA_CURL=ON \
              -DLLAMA_CUBLAS=ON \
              -DCUDAToolkit_ROOT=/usr/local/cuda \
              -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
              -DCMAKE_CUDA_ARCHITECTURES=75 \
              -DLLAMA_FATAL_WARNINGS=OFF \
              -DLLAMA_ALL_WARNINGS=OFF \
              -DCMAKE_BUILD_TYPE=Release;
          cmake --build build --config Release -j $(nproc) --target llama-server

      - name: Download the dataset
        id: download_dataset
        run: |
          cd examples/server/bench
          wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

      - name: Server bench
        id: server_bench
        run: |
          set -eux

          cd examples/server/bench
          source venv/bin/activate
          python bench.py \
              --runner-label ${{ env.RUNNER_LABEL }} \
              --name ${{ github.job }} \
              --branch ${{ github.head_ref || github.ref_name }} \
              --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
              --scenario script.js \
              --duration ${{ github.event.inputs.duration || env.DURATION }} \
              --hf-repo ggml-org/models \
              --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
              --model-path-prefix /models \
              --parallel ${{ env.N_USERS }} \
              -ngl 33 \
              --batch-size 2048 \
              --ubatch-size 256 \
              --ctx-size 16384 \
              --n-prompts 1000 \
              --max-prompt-tokens 1024 \
              --max-tokens 2048

          cat results.github.env >> $GITHUB_ENV

          # Remove dataset as we do not want it in the artefact
          rm ShareGPT_V3_unfiltered_cleaned_split.json

      - uses: actions/upload-artifact@v4
        with:
          name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          compression-level: 9
          path: |
            examples/server/bench/*.jpg
            examples/server/bench/*.json
            examples/server/bench/*.log

      - name: Commit status
        uses: Sibz/github-status-action@v1
        with:
          authToken: ${{secrets.GITHUB_TOKEN}}
          sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
          context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          description: |
            ${{ env.BENCH_RESULTS }}
          state: 'success'

      - name: Upload benchmark images
        uses: devicons/public-upload-to-imgur@v2.2.2
        continue-on-error: true # Important as it looks unstable: 503
        id: imgur_step
        with:
          client_id: ${{secrets.IMGUR_CLIENT_ID}}
          path: |
            examples/server/bench/prompt_tokens_seconds.jpg
            examples/server/bench/predicted_tokens_seconds.jpg
            examples/server/bench/kv_cache_usage_ratio.jpg
            examples/server/bench/requests_processing.jpg

      - name: Extract mermaid
        id: set_mermaid
        run: |
          set -eux

          cd examples/server/bench
          PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
          echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
          echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

          PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid)
          echo "PREDICTED_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
          echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

          KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid)
          echo "KV_CACHE_USAGE_RATIO<<EOF" >> $GITHUB_ENV
          echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

          REQUESTS_PROCESSING=$(cat requests_processing.mermaid)
          echo "REQUESTS_PROCESSING<<EOF" >> $GITHUB_ENV
          echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

      - name: Extract image url
        id: extract_image_url
        continue-on-error: true
        run: |
          set -eux

          echo "IMAGE_O=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV
          echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV
          echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV
          echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV

      - name: Comment PR
        uses: mshick/add-pr-comment@v2
        id: comment_pr
        if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
        with:
          message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          message: |
            <p align="center">

            📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀

            </p>

            <details>

            <summary>Expand details for performance related PR only</summary>

            - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
            - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
            - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
            - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
            - ${{ env.BENCH_GRAPH_XLABEL }}


            <p align="center">

            <img width="100%" height="100%" src="${{ env.IMAGE_O }}" alt="prompt_tokens_seconds" />

            <details>

            <summary>More</summary>

            ```mermaid
            ${{ env.PROMPT_TOKENS_SECONDS }}
            ```

            </details>

            <img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds"/>

            <details>
            <summary>More</summary>

            ```mermaid
            ${{ env.PREDICTED_TOKENS_SECONDS }}
            ```

            </details>

            </p>

            <details>

            <summary>Details</summary>

            <p align="center">

            <img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />

            <details>
            <summary>More</summary>

            ```mermaid
            ${{ env.KV_CACHE_USAGE_RATIO }}
            ```

            </details>

            <img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing"/>

            <details>
            <summary>More</summary>

            ```mermaid
            ${{ env.REQUESTS_PROCESSING }}
            ```

            </details>

            </p>
            </details>
            </details>
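For orientation, the heavy lifting in the deleted workflow is done by `examples/server/bench/bench.py`. A minimal local sketch of the same bench step, assuming that script, a built `llama-server`, and the same phi-2 model/ftype values from the matrix (the `local`/`manual-bench` labels are illustrative):

```bash
# Local approximation of the "Server bench" step above (not the authoritative CI setup).
cd examples/server/bench
python3 -m venv venv && source venv/bin/activate
pip install -r requirements.txt
python bench.py \
    --runner-label local \
    --name manual-bench \
    --branch "$(git rev-parse --abbrev-ref HEAD)" \
    --commit "$(git rev-parse HEAD)" \
    --scenario script.js \
    --duration 10m \
    --hf-repo ggml-org/models \
    --hf-file phi-2/ggml-model-q4_0.gguf \
    --model-path-prefix /models \
    --parallel 8 \
    -ngl 33 --batch-size 2048 --ubatch-size 256 --ctx-size 16384 \
    --n-prompts 1000 --max-prompt-tokens 1024 --max-tokens 2048
```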
1315 .github/workflows/build.yml vendored

File diff suppressed because it is too large.
23 .github/workflows/close-issue.yml vendored

@@ -1,23 +0,0 @@
name: Close inactive issues
on:
  schedule:
    - cron: "42 0 * * *"

jobs:
  close-issues:
    runs-on: ubuntu-latest
    permissions:
      issues: write
      pull-requests: write
    steps:
      - uses: actions/stale@v5
        with:
          exempt-issue-labels: "refactor,help wanted,good first issue,research,bug"
          days-before-issue-stale: 30
          days-before-issue-close: 14
          stale-issue-label: "stale"
          close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale."
          days-before-pr-stale: -1
          days-before-pr-close: -1
          operations-per-run: 10000
          repo-token: ${{ secrets.GITHUB_TOKEN }}
116 .github/workflows/docker.yml vendored

@@ -1,116 +0,0 @@
# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.

# GitHub recommends pinning actions to a commit SHA.
# To get a newer version, you will need to update the SHA.
# You can also reference a tag or branch, but the action may change without warning.

name: Publish Docker image

on:
  #pull_request:
  push:
    branches:
      - master
    paths: ['.github/workflows/docker.yml', '.devops/*.Dockerfile', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

jobs:
  push_to_registry:
    name: Push Docker image to Docker Hub
    #if: github.event.pull_request.draft == false

    runs-on: ubuntu-latest
    env:
      COMMIT_SHA: ${{ github.sha }}
    strategy:
      matrix:
        config:
          - { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
          - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
          - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
          - { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          # Note: the full-rocm image is failing due to a "no space left on device" error. It is disabled for now to allow the workflow to complete.
          #- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
          - { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
    steps:
      - name: Check out the repo
        uses: actions/checkout@v4

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v2

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2

      - name: Log in to Docker Hub
        uses: docker/login-action@v2
        with:
          registry: ghcr.io
          username: ${{ github.repository_owner }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example
      - name: Free Disk Space (Ubuntu)
        uses: jlumbroso/free-disk-space@main
        with:
          # this might remove tools that are actually needed,
          # if set to "true" but frees about 6 GB
          tool-cache: false

          # all of these default to true, but feel free to set to
          # "false" if necessary for your workflow
          android: true
          dotnet: true
          haskell: true
          large-packages: true
          docker-images: true
          swap-storage: true

      - name: Determine tag name
        id: tag
        shell: bash
        run: |
          BUILD_NUMBER="$(git rev-list --count HEAD)"
          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
          else
            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
          fi

      - name: Downcase github.repository_owner
        run: |
          echo "repository_owner_lowercase=${GITHUB_REPOSITORY_OWNER@L}" >> $GITHUB_ENV
        env:
          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'

      - name: Build and push Docker image (versioned)
        if: github.event_name == 'push'
        uses: docker/build-push-action@v4
        with:
          context: .
          push: true
          platforms: ${{ matrix.config.platforms }}
          tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
          file: ${{ matrix.config.dockerfile }}

      - name: Build and push Docker image (tagged)
        uses: docker/build-push-action@v4
        with:
          context: .
          push: ${{ github.event_name == 'push' }}
          platforms: ${{ matrix.config.platforms }}
          tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
          file: ${{ matrix.config.dockerfile }}
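For context, the matrix above publishes one image per `tag` value under the owner's GHCR namespace, with an additional `-b<build>` (or `-<sha>`) suffix from the tag step. A hedged sketch of pulling the results, assuming the upstream `ggerganov` owner (substitute the actual lowercased repository owner):

```bash
# Illustrative pulls; tag names come from the matrix above, the registry path from the push steps.
docker pull ghcr.io/ggerganov/llama.cpp:light        # CLI-only image, latest build
docker pull ghcr.io/ggerganov/llama.cpp:server-cuda  # CUDA-enabled server image
# A pinned build would use the "<tag>-b<build number>" form produced by the "Determine tag name" step.
```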
27 .github/workflows/editorconfig.yml vendored

@@ -1,27 +0,0 @@
name: EditorConfig Checker

on:
  workflow_dispatch: # allows manual triggering
    inputs:
      create_release:
        description: 'Create new release'
        required: true
        type: boolean
  push:
    branches:
      - master
  pull_request:
    branches:
      - master

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

jobs:
  editorconfig:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: editorconfig-checker/action-editorconfig-checker@main
      - run: editorconfig-checker
44 .github/workflows/gguf-publish.yml vendored

@@ -1,44 +0,0 @@
# This workflow will upload a Python Package using Twine when a GGUF release is created
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries

# See `gguf-py/README.md` for how to make a release.

# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.

name: Upload Python Package

on:
  workflow_dispatch:
  push:
    # Pattern matched against refs/tags
    tags:
      - 'gguf-v*' # Push events to every version tag

jobs:
  deploy:

    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.9.x'
      - name: Install dependencies
        run: |
          cd gguf-py
          python -m pip install poetry
          poetry install

      - name: Build package
        run: cd gguf-py && poetry build
      - name: Publish package
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          password: ${{ secrets.PYPI_API_TOKEN }}
          packages-dir: gguf-py/dist
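The publish job only wraps a Poetry build of `gguf-py`; a rough local dry run of the same packaging steps (everything except the PyPI upload, which the action handles with the repository token) would look like this:

```bash
# Build the gguf-py package locally, mirroring the "Install dependencies" and "Build package" steps.
cd gguf-py
python -m pip install poetry
poetry install
poetry build          # wheels and sdist land in gguf-py/dist, the same directory the workflow publishes
```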
17 .github/workflows/labeler.yml vendored

@@ -1,17 +0,0 @@
name: "Pull Request Labeler"
on:
  - pull_request_target

jobs:
  labeler:
    permissions:
      contents: read
      pull-requests: write
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          repository: "ggerganov/llama.cpp"
      - uses: actions/labeler@v5
        with:
          configuration-path: '.github/labeler.yml'
65 .github/workflows/nix-ci-aarch64.yml vendored

@@ -1,65 +0,0 @@
name: Nix aarch64 builds

on:
  workflow_dispatch: # allows manual triggering
  schedule:
    # Rebuild daily rather than on every push because QEMU is expensive (e.g.
    # 1.5h instead of minutes with the cold cache).
    #
    # randint(0, 59), randint(0, 23)
    - cron: '26 12 * * *'
  # But also rebuild if we touched any of the Nix expressions:
  push:
    branches:
      - master
    paths: ['**/*.nix', 'flake.lock']
  pull_request:
    types: [opened, synchronize, reopened]
    paths: ['**/*.nix', 'flake.lock']

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

jobs:
  nix-build-aarch64:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Install QEMU
        # Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654
        run: |
          sudo apt-get update
          sudo apt-get install -y qemu-user-static qemu-system-aarch64
          sudo usermod -a -G kvm $USER
      - name: Install Nix
        uses: DeterminateSystems/nix-installer-action@v9
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          extra-conf: |
            extra-platforms = aarch64-linux
            extra-system-features = nixos-test kvm
            extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
            extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
      - uses: DeterminateSystems/magic-nix-cache-action@v2
        with:
          upstream-cache: https://${{ matrix.cachixName }}.cachix.org
      - name: Set-up cachix to push the results to
        uses: cachix/cachix-action@v13
        with:
          authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
          name: llama-cpp
      - name: Show all output paths
        run: >
          nix run github:nix-community/nix-eval-jobs
          -- --gc-roots-dir gcroot
          --flake
          ".#packages.aarch64-linux"
      - name: Build
        run: >
          nix run github:Mic92/nix-fast-build
          -- --skip-cached --no-nom
          --systems aarch64-linux
          --flake
          ".#checks.aarch64-linux"
72 .github/workflows/nix-ci.yml vendored

@@ -1,72 +0,0 @@
name: Nix CI

on:
  workflow_dispatch: # allows manual triggering
  push:
    branches:
      - master
  pull_request:
    types: [opened, synchronize, reopened]

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

jobs:
  nix-eval:
    strategy:
      fail-fast: false
      matrix:
        os: [ ubuntu-latest, macos-latest ]
    runs-on: ${{ matrix.os }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Install Nix
        uses: DeterminateSystems/nix-installer-action@v9
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          extra-conf: |
            extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
            extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
      - uses: DeterminateSystems/magic-nix-cache-action@v2
        with:
          upstream-cache: https://${{ matrix.cachixName }}.cachix.org
      - name: List all flake outputs
        run: nix flake show --all-systems
      - name: Show all output paths
        run: >
          nix run github:nix-community/nix-eval-jobs
          -- --gc-roots-dir gcroot
          --flake
          ".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)"
  nix-build:
    strategy:
      fail-fast: false
      matrix:
        os: [ ubuntu-latest, macos-latest ]
    runs-on: ${{ matrix.os }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Install Nix
        uses: DeterminateSystems/nix-installer-action@v9
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          extra-conf: |
            extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
            extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
      - uses: DeterminateSystems/magic-nix-cache-action@v2
        with:
          upstream-cache: https://${{ matrix.cachixName }}.cachix.org
      - name: Set-up cachix to push the results to
        uses: cachix/cachix-action@v13
        with:
          authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
          name: llama-cpp
      - name: Build
        run: >
          nix run github:Mic92/nix-fast-build
          -- --skip-cached --no-nom
          --flake
          ".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)"
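Both Nix jobs boil down to two commands that can also be run locally; a rough equivalent, assuming a Nix install with flakes enabled (without the workflow's extra substituters, everything simply builds from source):

```bash
# Local approximation of the nix-eval and nix-build jobs above.
nix flake show --all-systems
nix run github:Mic92/nix-fast-build -- --skip-cached --no-nom \
    --flake ".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)"
```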
22 .github/workflows/nix-flake-update.yml vendored

@@ -1,22 +0,0 @@
name: update-flake-lock
on:
  workflow_dispatch:
  schedule:
    - cron: '0 0 * * 0' # runs weekly on Sunday at 00:00

jobs:
  lockfile:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Install Nix
        uses: DeterminateSystems/nix-installer-action@main
      - name: Update flake.lock
        uses: DeterminateSystems/update-flake-lock@main
        with:
          pr-title: "nix: update flake.lock"
          pr-labels: |
            nix
          pr-reviewers: philiptaron,SomeoneSerge
          token: ${{ secrets.FLAKE_TOKEN }}
36 .github/workflows/nix-publish-flake.yml vendored

@@ -1,36 +0,0 @@
# Make the flake discoverable on https://flakestry.dev and https://flakehub.com/flakes
name: "Publish a flake to flakestry & flakehub"
on:
  push:
    tags:
      - "*"
  workflow_dispatch:
    inputs:
      tag:
        description: "The existing tag to publish"
        type: "string"
        required: true
jobs:
  flakestry-publish:
    runs-on: ubuntu-latest
    permissions:
      id-token: "write"
      contents: "read"
    steps:
      - uses: flakestry/flakestry-publish@main
        with:
          version: "${{ inputs.tag || github.ref_name }}"
  flakehub-publish:
    runs-on: "ubuntu-latest"
    permissions:
      id-token: "write"
      contents: "read"
    steps:
      - uses: "actions/checkout@v4"
        with:
          ref: "${{ (inputs.tag != null) && format('refs/tags/{0}', inputs.tag) || '' }}"
      - uses: "DeterminateSystems/nix-installer-action@main"
      - uses: "DeterminateSystems/flakehub-push@main"
        with:
          visibility: "public"
          tag: "${{ inputs.tag }}"
35 .github/workflows/python-check-requirements.yml vendored

@@ -1,35 +0,0 @@
name: Python check requirements.txt

on:
  push:
    paths:
      - '.github/workflows/python-check-requirements.yml'
      - 'scripts/check-requirements.sh'
      - 'convert*.py'
      - 'requirements.txt'
      - 'requirements/*.txt'
  pull_request:
    paths:
      - '.github/workflows/python-check-requirements.yml'
      - 'scripts/check-requirements.sh'
      - 'convert*.py'
      - 'requirements.txt'
      - 'requirements/*.txt'

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

jobs:
  python-check-requirements:
    runs-on: ubuntu-latest
    name: check-requirements
    steps:
      - name: Check out source repository
        uses: actions/checkout@v4
      - name: Set up Python environment
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: Run check-requirements.sh script
        run: bash scripts/check-requirements.sh
23 .github/workflows/python-lint.yml vendored

@@ -1,23 +0,0 @@
name: flake8 Lint

on: [push, pull_request]

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

jobs:
  flake8-lint:
    runs-on: ubuntu-latest
    name: Lint
    steps:
      - name: Check out source repository
        uses: actions/checkout@v4
      - name: Set up Python environment
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: flake8 Lint
        uses: py-actions/flake8@v2
        with:
          plugins: "flake8-no-print"
38 .github/workflows/python-type-check.yml vendored

@@ -1,38 +0,0 @@
name: Python Type-Check

on:
  push:
    paths:
      - '.github/workflows/python-type-check.yml'
      - '**.py'
      - '**/requirements*.txt'
  pull_request:
    paths:
      - '.github/workflows/python-type-check.yml'
      - '**.py'
      - '**/requirements*.txt'

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

jobs:
  python-type-check:
    runs-on: ubuntu-latest
    name: pyright type-check
    steps:
      - name: Check out source repository
        uses: actions/checkout@v4
      - name: Set up Python environment
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: Install Python dependencies
        # TODO: use a venv
        run: pip install -r requirements/requirements-all.txt
      - name: Type-check with Pyright
        uses: jakebailey/pyright-action@v2
        with:
          version: 1.1.370
          level: warning
          warnings: true
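To reproduce the type-check locally, a minimal sketch is to install the project requirements and run the pip-distributed pyright CLI pinned to the same version the workflow uses (assumption: the pip wrapper behaves like the action; the action's `level`/`warnings` knobs are applied on top of plain `pyright` output):

```bash
# Local approximation of the pyright job above.
pip install -r requirements/requirements-all.txt
pip install pyright==1.1.370   # same version the workflow pins
pyright                        # type-check the repository with the checked-in pyright config
```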
183 .github/workflows/server.yml vendored

@@ -1,183 +0,0 @@
# Server build and tests
name: Server

on:
  workflow_dispatch: # allows manual triggering
    inputs:
      sha:
        description: 'Commit SHA1 to build'
        required: false
        type: string
      slow_tests:
        description: 'Run slow tests'
        required: true
        type: boolean
  push:
    branches:
      - master
    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
  pull_request:
    types: [opened, synchronize, reopened]
    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  server:
    runs-on: ubuntu-latest

    strategy:
      matrix:
        sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
        build_type: [RelWithDebInfo]
        include:
          - build_type: Release
            sanitizer: ""
      fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken

    steps:
      - name: Dependencies
        id: depends
        run: |
          sudo apt-get update
          sudo apt-get -y install \
            build-essential \
            xxd \
            git \
            cmake \
            curl \
            wget \
            language-pack-en \
            libcurl4-openssl-dev

      - name: Clone
        id: checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

      - name: Python setup
        id: setup_python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Tests dependencies
        id: test_dependencies
        run: |
          pip install -r examples/server/tests/requirements.txt

      - name: Verify server deps
        id: verify_server_deps
        run: |
          git config --global --add safe.directory $(realpath .)
          cd examples/server
          git ls-files --others --modified
          git status
          ./deps.sh
          git status
          not_ignored_files="$(git ls-files --others --modified)"
          echo "Modified files: ${not_ignored_files}"
          if [ -n "${not_ignored_files}" ]; then
            echo "Repository is dirty or server deps are not built as expected"
            echo "${not_ignored_files}"
            exit 1
          fi

      - name: Build (no OpenMP)
        id: cmake_build_no_openmp
        if: ${{ matrix.sanitizer == 'THREAD' }}
        run: |
          cmake -B build \
            -DGGML_NATIVE=OFF \
            -DLLAMA_BUILD_SERVER=ON \
            -DLLAMA_CURL=ON \
            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
            -DGGML_OPENMP=OFF ;
          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server

      - name: Build
        id: cmake_build
        if: ${{ matrix.sanitizer != 'THREAD' }}
        run: |
          cmake -B build \
            -DGGML_NATIVE=OFF \
            -DLLAMA_BUILD_SERVER=ON \
            -DLLAMA_CURL=ON \
            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server

      - name: Tests
        id: server_integration_tests
        run: |
          cd examples/server/tests
          PORT=8888 ./tests.sh

      - name: Slow tests
        id: server_integration_tests_slow
        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
        run: |
          cd examples/server/tests
          PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow


  server-windows:
    runs-on: windows-2019

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

      - name: libCURL
        id: get_libcurl
        env:
          CURL_VERSION: 8.6.0_6
        run: |
          curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-win64-mingw.zip"
          mkdir $env:RUNNER_TEMP/libcurl
          tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl

      - name: Build
        id: cmake_build
        run: |
          cmake -B build -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server

      - name: Python setup
        id: setup_python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Tests dependencies
        id: test_dependencies
        run: |
          pip install -r examples/server/tests/requirements.txt

      - name: Copy Libcurl
        id: prepare_libcurl
        run: |
          cp $env:RUNNER_TEMP/libcurl/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll

      - name: Tests
        id: server_integration_tests
        if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
        run: |
          cd examples/server/tests
          behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp

      - name: Slow tests
        id: server_integration_tests_slow
        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
        run: |
          cd examples/server/tests
          behave.exe --stop --no-skipped --no-capture --tags slow
29 ci/README.md

@@ -1,29 +0,0 @@
# CI

In addition to [Github Actions](https://github.com/ggerganov/llama.cpp/actions) `llama.cpp` uses a custom CI framework:

https://github.com/ggml-org/ci

It monitors the `master` branch for new commits and runs the
[ci/run.sh](https://github.com/ggerganov/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us
to execute heavier workloads compared to just using Github Actions. Also with time, the cloud instances will be scaled
to cover various hardware architectures, including GPU and Apple Silicon instances.

Collaborators can optionally trigger the CI run by adding the `ggml-ci` keyword to their commit message.
Only the branches of this repo are monitored for this keyword.

It is a good practice, before publishing changes to execute the full CI locally on your machine:

```bash
mkdir tmp

# CPU-only build
bash ./ci/run.sh ./tmp/results ./tmp/mnt

# with CUDA support
GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt

# with SYCL support
source /opt/intel/oneapi/setvars.sh
GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
```
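As the README notes, the custom CI only runs for commits on branches of this repo that carry the `ggml-ci` keyword; a trivial example of opting a commit in (the commit message itself is illustrative):

```bash
# Request a run of the heavier ggml-org/ci pipeline for this commit.
git commit -am "server : fix request handling (ggml-ci)"
git push
```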
775 ci/run.sh

@@ -1,775 +0,0 @@
#/bin/bash
#
# sample usage:
#
# mkdir tmp
#
# # CPU-only build
# bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# # with CUDA support
# GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# # with SYCL support
# GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#

if [ -z "$2" ]; then
    echo "usage: $0 <output-dir> <mnt-dir>"
    exit 1
fi

mkdir -p "$1"
mkdir -p "$2"

OUT=$(realpath "$1")
MNT=$(realpath "$2")

rm -f "$OUT/*.log"
rm -f "$OUT/*.exit"
rm -f "$OUT/*.md"

sd=`dirname $0`
cd $sd/../
SRC=`pwd`

CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON"

if [ ! -z ${GG_BUILD_METAL} ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
fi

if [ ! -z ${GG_BUILD_CUDA} ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=1"
fi

if [ ! -z ${GG_BUILD_SYCL} ]; then
    if [ -z ${ONEAPI_ROOT} ]; then
        echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:"
        echo "source /opt/intel/oneapi/setvars.sh"
        exit 1
    fi

    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
fi

## helpers

# download a file if it does not exist or if it is outdated
function gg_wget {
    local out=$1
    local url=$2

    local cwd=`pwd`

    mkdir -p $out
    cd $out

    # should not re-download if file is the same
    wget -nv -N $url

    cd $cwd
}

function gg_printf {
    printf -- "$@" >> $OUT/README.md
}

function gg_run {
    ci=$1

    set -o pipefail
    set -x

    gg_run_$ci | tee $OUT/$ci.log
    cur=$?
    echo "$cur" > $OUT/$ci.exit

    set +x
    set +o pipefail

    gg_sum_$ci

    ret=$((ret | cur))
}

## ci
|
|
||||||
|
|
||||||
# ctest_debug
|
|
||||||
|
|
||||||
function gg_run_ctest_debug {
|
|
||||||
cd ${SRC}
|
|
||||||
|
|
||||||
rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug
|
|
||||||
|
|
||||||
set -e
|
|
||||||
|
|
||||||
# Check cmake, make and ctest are installed
|
|
||||||
gg_check_build_requirements
|
|
||||||
|
|
||||||
(time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
|
||||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
|
||||||
|
|
||||||
(time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
|
||||||
|
|
||||||
set +e
|
|
||||||
}
|
|
||||||
|
|
||||||
function gg_sum_ctest_debug {
|
|
||||||
gg_printf '### %s\n\n' "${ci}"
|
|
||||||
|
|
||||||
gg_printf 'Runs ctest in debug mode\n'
|
|
||||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
|
||||||
gg_printf '```\n'
|
|
||||||
gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
|
|
||||||
gg_printf '```\n'
|
|
||||||
gg_printf '\n'
|
|
||||||
}
|
|
||||||
|
|
||||||
# ctest_release
|
|
||||||
|
|
||||||
function gg_run_ctest_release {
|
|
||||||
cd ${SRC}
|
|
||||||
|
|
||||||
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
|
|
||||||
|
|
||||||
set -e
|
|
||||||
|
|
||||||
# Check cmake, make and ctest are installed
|
|
||||||
gg_check_build_requirements
|
|
||||||
|
|
||||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
|
||||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
|
||||||
|
|
||||||
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
|
||||||
(time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
|
||||||
else
|
|
||||||
(time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
|
||||||
fi
|
|
||||||
|
|
||||||
set +e
|
|
||||||
}
|
|
||||||
|
|
||||||
function gg_sum_ctest_release {
|
|
||||||
gg_printf '### %s\n\n' "${ci}"
|
|
||||||
|
|
||||||
gg_printf 'Runs ctest in release mode\n'
|
|
||||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
|
||||||
gg_printf '```\n'
|
|
||||||
gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
|
|
||||||
gg_printf '```\n'
|
|
||||||
}
|
|
||||||
|
|
||||||
# test_scripts_debug
|
|
||||||
|
|
||||||
function gg_run_test_scripts_debug {
|
|
||||||
cd ${SRC}
|
|
||||||
|
|
||||||
set -e
|
|
||||||
|
|
||||||
(cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
|
|
||||||
(cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
|
|
||||||
|
|
||||||
set +e
|
|
||||||
}
|
|
||||||
|
|
||||||
function gg_sum_test_scripts_debug {
|
|
||||||
gg_printf '### %s\n\n' "${ci}"
|
|
||||||
|
|
||||||
gg_printf 'Runs test scripts in debug mode\n'
|
|
||||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
|
||||||
gg_printf '```\n'
|
|
||||||
gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)"
|
|
||||||
gg_printf '```\n'
|
|
||||||
gg_printf '\n'
|
|
||||||
}
|
|
||||||
|
|
||||||
# test_scripts_release
|
|
||||||
|
|
||||||
function gg_run_test_scripts_release {
|
|
||||||
cd ${SRC}
|
|
||||||
|
|
||||||
set -e
|
|
||||||
|
|
||||||
(cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
|
|
||||||
(cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
|
|
||||||
|
|
||||||
set +e
|
|
||||||
}
|
|
||||||
|
|
||||||
function gg_sum_test_scripts_release {
|
|
||||||
gg_printf '### %s\n\n' "${ci}"
|
|
||||||
|
|
||||||
gg_printf 'Runs test scripts in release mode\n'
|
|
||||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
|
||||||
gg_printf '```\n'
|
|
||||||
gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)"
|
|
||||||
gg_printf '```\n'
|
|
||||||
gg_printf '\n'
|
|
||||||
}
|
|
||||||
|
|
||||||
function gg_get_model {
|
|
||||||
local gguf_0="$MNT/models/pythia/1.4B/ggml-model-f16.gguf"
|
|
||||||
local gguf_1="$MNT/models/pythia/2.8B/ggml-model-f16.gguf"
|
|
||||||
local gguf_2="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
|
|
||||||
if [[ -s $gguf_0 ]]; then
|
|
||||||
echo -n "$gguf_0"
|
|
||||||
elif [[ -s $gguf_1 ]]; then
|
|
||||||
echo -n "$gguf_1"
|
|
||||||
elif [[ -s $gguf_2 ]]; then
|
|
||||||
echo -n "$gguf_2"
|
|
||||||
else
|
|
||||||
echo >&2 "No model found. Can't run gg_run_ctest_with_model."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
function gg_run_ctest_with_model_debug {
|
|
||||||
cd ${SRC}
|
|
||||||
|
|
||||||
local model; model=$(gg_get_model)
|
|
||||||
cd build-ci-debug
|
|
||||||
set -e
|
|
||||||
(LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
|
||||||
set +e
|
|
||||||
cd ..
|
|
||||||
}
|
|
||||||
|
|
||||||
function gg_run_ctest_with_model_release {
|
|
||||||
cd ${SRC}
|
|
||||||
|
|
||||||
local model; model=$(gg_get_model)
|
|
||||||
cd build-ci-release
|
|
||||||
set -e
|
|
||||||
(LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
|
||||||
set +e
|
|
||||||
cd ..
|
|
||||||
}

function gg_sum_ctest_with_model_debug {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'Runs ctest with model files in debug mode\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '```\n'
    gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
    gg_printf '```\n'
}

function gg_sum_ctest_with_model_release {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'Runs ctest with model files in release mode\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '```\n'
    gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
    gg_printf '```\n'
}

# open_llama_7b_v2
# requires: GG_BUILD_CUDA
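# Full pipeline on a CUDA build: download the HF checkpoint, convert it to an
# f16 GGUF with convert_legacy_llama.py, quantize it to a range of types
# (q8_0 .. q6_k), then run short text generation, wikitext-2 perplexity,
# imatrix and save-load-state checks, failing any quantization whose
# perplexity exceeds 20.0 (see check_ppl below).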
function gg_run_open_llama_7b_v2 {
    cd ${SRC}

    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/config.json
    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/tokenizer.model
    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/tokenizer_config.json
    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/special_tokens_map.json
    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/pytorch_model.bin.index.json
    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00001-of-00002.bin
    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin
    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/generation_config.json

    gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/

    path_models="../models-mnt/open-llama/7B-v2"
    path_wiki="../models-mnt/wikitext/wikitext-2-raw"

    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release

    set -e

    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log

    python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
    model_q6_k="${path_models}/ggml-model-q6_k.gguf"

    wiki_test="${path_wiki}/wiki.test.raw"

    ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
    ./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
    ./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
    ./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
    ./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
    ./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
    ./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
    ./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
    ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
    ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k

    (time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
    (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
    (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
    (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
    (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
    (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
    (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
    (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
    (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
    (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
    (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

    (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

    (time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
    (time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
    (time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
    (time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
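
    # check_ppl treats the last floating-point number on the captured "[1]"
    # perplexity line as the ppl value and fails the quantization (return 20)
    # if it is above 20.0.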
    function check_ppl {
        qnt="$1"
        ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)

        if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
            printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
            return 20
        fi

        printf ' - %s @ %s OK\n' "$qnt" "$ppl"
        return 0
    }

    check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log

    cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log

    set +e
}

function gg_sum_open_llama_7b_v2 {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'OpenLLaMA 7B-v2:\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
    gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
    gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
    gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
    gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
    gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
    gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
    gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
    gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
    gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
}

# pythia_1.4b
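# Smaller variant used when GG_BUILD_CUDA is not set (see main below): Pythia
# 1.4B is converted with convert_hf_to_gguf.py, generations are limited to 64
# tokens and perplexity is measured only on the first 60 lines of
# wiki.test.raw (wiki.test-60.raw).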
function gg_run_pythia_1_4b {
    cd ${SRC}

    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/config.json
    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer.json
    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer_config.json
    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/special_tokens_map.json
    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/resolve/main/pytorch_model.bin

    gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
    head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw

    path_models="../models-mnt/pythia/1.4B"
    path_wiki="../models-mnt/wikitext/wikitext-2-raw"

    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release

    set -e

    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log

    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
    model_q6_k="${path_models}/ggml-model-q6_k.gguf"

    wiki_test_60="${path_wiki}/wiki.test-60.raw"

    ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
    ./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
    ./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
    ./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
    ./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
    ./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
    ./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
    ./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
    ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
    ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k

    (time ./bin/llama-cli --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
    (time ./bin/llama-cli --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
    (time ./bin/llama-cli --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
    (time ./bin/llama-cli --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
    (time ./bin/llama-cli --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
    (time ./bin/llama-cli --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
    (time ./bin/llama-cli --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
    (time ./bin/llama-cli --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
    (time ./bin/llama-cli --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
    (time ./bin/llama-cli --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
    (time ./bin/llama-cli --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

    (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

    (time ./bin/llama-save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
    (time ./bin/llama-save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

    function check_ppl {
        qnt="$1"
        ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)

        if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
            printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
            return 20
        fi

        printf ' - %s @ %s OK\n' "$qnt" "$ppl"
        return 0
    }

    check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    #check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
    check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log

    cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log

    set +e
}

function gg_sum_pythia_1_4b {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'Pythia 1.4B:\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
    gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
    gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
    gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
    gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
    gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
    gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
    gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
    gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
    gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
}

# pythia_2_8b
# requires: GG_BUILD_CUDA
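# CUDA variant of the Pythia job: the larger 2.8B checkpoint with full GPU
# offload (-ngl 999), 256-token generations and perplexity over 4 chunks of the
# full wiki.test.raw, mirroring the OpenLLaMA 7B-v2 pipeline above.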
function gg_run_pythia_2_8b {
    cd ${SRC}

    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/config.json
    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer.json
    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer_config.json
    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/special_tokens_map.json
    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/resolve/main/pytorch_model.bin

    gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/

    path_models="../models-mnt/pythia/2.8B"
    path_wiki="../models-mnt/wikitext/wikitext-2-raw"

    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release

    set -e

    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log

    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
    model_q6_k="${path_models}/ggml-model-q6_k.gguf"

    wiki_test="${path_wiki}/wiki.test.raw"

    ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
    ./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
    ./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
    ./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
    ./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
    ./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
    ./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
    ./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
    ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
    ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k

    (time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
    (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
    (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
    (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
    (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
    (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
    (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
    (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
    (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
    (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
    (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

    (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

    (time ./bin/llama-save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
    (time ./bin/llama-save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
    (time ./bin/llama-save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
    (time ./bin/llama-save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

    function check_ppl {
        qnt="$1"
        ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)

        if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
            printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
            return 20
        fi

        printf ' - %s @ %s OK\n' "$qnt" "$ppl"
        return 0
    }

    check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    #check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
    check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log

    cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log

    set +e
}

function gg_sum_pythia_2_8b {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'Pythia 2.8B:\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
    gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
    gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
    gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
    gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
    gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
    gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
    gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
    gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
    gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
}

# bge-small
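# Embedding smoke test: convert BAAI/bge-small-en-v1.5 (a BERT-style model) to
# GGUF, quantize it to q8_0 and run llama-embedding on a short prompt with both
# the f16 and q8_0 variants.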
function gg_run_embd_bge_small {
    cd ${SRC}

    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer_config.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/special_tokens_map.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/pytorch_model.bin
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/sentence_bert_config.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/vocab.txt
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/modules.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json

    gg_wget models-mnt/bge-small/1_Pooling https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/1_Pooling/config.json

    path_models="../models-mnt/bge-small"

    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release

    set -e

    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log

    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"

    ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0

    (time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
    (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log

    set +e
}
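
# check_build_requirements
# Only prints a warning via gg_printf when cmake, make or ctest is missing on
# the runner; it does not abort the run.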
function gg_check_build_requirements {
    if ! command -v cmake &> /dev/null; then
        gg_printf 'cmake not found, please install'
    fi

    if ! command -v make &> /dev/null; then
        gg_printf 'make not found, please install'
    fi

    if ! command -v ctest &> /dev/null; then
        gg_printf 'ctest not found, please install'
    fi
}

function gg_sum_embd_bge_small {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'BGE Small (BERT):\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
}

## main
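
# Unless GG_BUILD_LOW_PERF is set, expose the shared model cache as
# ./models-mnt inside the source tree and prepare a throwaway python3 venv
# with the conversion requirements plus an editable gguf-py install.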
if [ -z ${GG_BUILD_LOW_PERF} ]; then
    # Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt
    rm -rf ${SRC}/models-mnt
    mnt_models=${MNT}/models
    mkdir -p ${mnt_models}
    ln -sfn ${mnt_models} ${SRC}/models-mnt

    # Create a fresh python3 venv and enter it
    python3 -m venv "$MNT/venv"
    source "$MNT/venv/bin/activate"

    pip install -r ${SRC}/requirements.txt --disable-pip-version-check
    pip install --editable gguf-py --disable-pip-version-check
fi
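
# gg_run <name> (defined earlier in this script) invokes gg_run_<name> and
# records the stage's exit status (the gg_sum_* functions above read it from
# $OUT/${ci}.exit); it presumably also folds that status into $ret, so the
# `test $ret -eq 0 &&` guards skip the remaining stages after a failure.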
ret=0

test $ret -eq 0 && gg_run ctest_debug
test $ret -eq 0 && gg_run ctest_release

if [ -z ${GG_BUILD_LOW_PERF} ]; then
    test $ret -eq 0 && gg_run embd_bge_small

    if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
        test $ret -eq 0 && gg_run test_scripts_debug
        test $ret -eq 0 && gg_run test_scripts_release
    fi

    if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
        if [ -z ${GG_BUILD_CUDA} ]; then
            test $ret -eq 0 && gg_run pythia_1_4b
        else
            test $ret -eq 0 && gg_run pythia_2_8b
            #test $ret -eq 0 && gg_run open_llama_7b_v2
        fi

        test $ret -eq 0 && gg_run ctest_with_model_debug
        test $ret -eq 0 && gg_run ctest_with_model_release
    fi
fi

exit $ret