Add YX simple filter for llama-server

parent f8ec8877b7
commit fcb2bb1222

18 changed files with 120 additions and 1698 deletions

310  .github/workflows/bench.yml  (vendored)
@@ -1,310 +0,0 @@
# Benchmark
name: Benchmark

on:
  workflow_dispatch:
    inputs:
      gpu-series:
        description: 'Azure GPU series to run with'
        required: true
        type: choice
        options:
          - Standard_NC4as_T4_v3
          - Standard_NC24ads_A100_v4
          - Standard_NC80adis_H100_v5
      sha:
        description: 'Commit SHA1 to build'
        required: false
        type: string
      duration:
        description: 'Duration of the bench'
        type: string
        default: 10m

  push:
    branches:
      - master
    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
  pull_request_target:
    types: [opened, synchronize, reopened]
    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
  schedule:
    - cron: '04 2 * * *'

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }}
  cancel-in-progress: true

jobs:
  bench-server-baseline:
    runs-on: Standard_NC4as_T4_v3
    env:
      RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it
      N_USERS: 8
      DURATION: 10m

    strategy:
      matrix:
        model: [phi-2]
        ftype: [q4_0, q8_0, f16]
        include:
          - model: phi-2
            ftype: q4_0
            pr_comment_enabled: "true"

    if: |
      inputs.gpu-series == 'Standard_NC4as_T4_v3'
      || (
        github.event_name == 'schedule'
        && github.ref_name == 'master'
        && github.repository_owner == 'ggerganov'
      )
      || github.event_name == 'pull_request_target'
      || (
        github.event_name == 'push'
        && github.event.ref == 'refs/heads/master'
        && github.repository_owner == 'ggerganov'
      )
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

      - name: Install python env
        id: pipenv
        run: |
          cd examples/server/bench
          python3 -m venv venv
          source venv/bin/activate
          pip install -r requirements.txt

      - name: Prometheus
        id: install_prometheus
        run: |
          wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
          tar xzf prometheus*.tar.gz --strip-components=1
          ./prometheus --config.file=examples/server/bench/prometheus.yml &
          while ! nc -z localhost 9090; do
            sleep 0.1
          done

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version: '1.21'

      - name: Install k6 and xk6-sse
        id: k6_installation
        run: |
          cd examples/server/bench
          go install go.k6.io/xk6/cmd/xk6@latest
          xk6 build master \
            --with github.com/phymbert/xk6-sse

      - name: Build
        id: cmake_build
        run: |
          set -eux
          cmake -B build \
            -DLLAMA_NATIVE=OFF \
            -DLLAMA_BUILD_SERVER=ON \
            -DLLAMA_CURL=ON \
            -DLLAMA_CUBLAS=ON \
            -DCUDAToolkit_ROOT=/usr/local/cuda \
            -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
            -DCMAKE_CUDA_ARCHITECTURES=75 \
            -DLLAMA_FATAL_WARNINGS=OFF \
            -DLLAMA_ALL_WARNINGS=OFF \
            -DCMAKE_BUILD_TYPE=Release;
          cmake --build build --config Release -j $(nproc) --target llama-server

      - name: Download the dataset
        id: download_dataset
        run: |
          cd examples/server/bench
          wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

      - name: Server bench
        id: server_bench
        run: |
          set -eux

          cd examples/server/bench
          source venv/bin/activate
          python bench.py \
            --runner-label ${{ env.RUNNER_LABEL }} \
            --name ${{ github.job }} \
            --branch ${{ github.head_ref || github.ref_name }} \
            --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
            --scenario script.js \
            --duration ${{ github.event.inputs.duration || env.DURATION }} \
            --hf-repo ggml-org/models \
            --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
            --model-path-prefix /models \
            --parallel ${{ env.N_USERS }} \
            -ngl 33 \
            --batch-size 2048 \
            --ubatch-size 256 \
            --ctx-size 16384 \
            --n-prompts 1000 \
            --max-prompt-tokens 1024 \
            --max-tokens 2048

          cat results.github.env >> $GITHUB_ENV

          # Remove dataset as we do not want it in the artefact
          rm ShareGPT_V3_unfiltered_cleaned_split.json

      - uses: actions/upload-artifact@v4
        with:
          name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          compression-level: 9
          path: |
            examples/server/bench/*.jpg
            examples/server/bench/*.json
            examples/server/bench/*.log

      - name: Commit status
        uses: Sibz/github-status-action@v1
        with:
          authToken: ${{secrets.GITHUB_TOKEN}}
          sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
          context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          description: |
            ${{ env.BENCH_RESULTS }}
          state: 'success'

      - name: Upload benchmark images
        uses: devicons/public-upload-to-imgur@v2.2.2
        continue-on-error: true # Important as it looks unstable: 503
        id: imgur_step
        with:
          client_id: ${{secrets.IMGUR_CLIENT_ID}}
          path: |
            examples/server/bench/prompt_tokens_seconds.jpg
            examples/server/bench/predicted_tokens_seconds.jpg
            examples/server/bench/kv_cache_usage_ratio.jpg
            examples/server/bench/requests_processing.jpg

      - name: Extract mermaid
        id: set_mermaid
        run: |
          set -eux

          cd examples/server/bench
          PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
          echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
          echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

          PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid)
          echo "PREDICTED_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
          echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

          KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid)
          echo "KV_CACHE_USAGE_RATIO<<EOF" >> $GITHUB_ENV
          echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

          REQUESTS_PROCESSING=$(cat requests_processing.mermaid)
          echo "REQUESTS_PROCESSING<<EOF" >> $GITHUB_ENV
          echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

      - name: Extract image url
        id: extract_image_url
        continue-on-error: true
        run: |
          set -eux

          echo "IMAGE_O=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV
          echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV
          echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV
          echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV

      - name: Comment PR
        uses: mshick/add-pr-comment@v2
        id: comment_pr
        if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
        with:
          message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          message: |
            <p align="center">

            📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀

            </p>

            <details>

            <summary>Expand details for performance related PR only</summary>

            - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
            - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
            - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
            - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
            - ${{ env.BENCH_GRAPH_XLABEL }}


            <p align="center">

            <img width="100%" height="100%" src="${{ env.IMAGE_O }}" alt="prompt_tokens_seconds" />

            <details>

            <summary>More</summary>

            ```mermaid
            ${{ env.PROMPT_TOKENS_SECONDS }}
            ```

            </details>

            <img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds"/>

            <details>
            <summary>More</summary>

            ```mermaid
            ${{ env.PREDICTED_TOKENS_SECONDS }}
            ```

            </details>

            </p>

            <details>

            <summary>Details</summary>

            <p align="center">

            <img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />

            <details>
            <summary>More</summary>

            ```mermaid
            ${{ env.KV_CACHE_USAGE_RATIO }}
            ```

            </details>

            <img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing"/>

            <details>
            <summary>More</summary>

            ```mermaid
            ${{ env.REQUESTS_PROCESSING }}
            ```

            </details>

            </p>
            </details>
            </details>

696  .github/workflows/build.yml  (vendored)
@@ -25,662 +25,6 @@ env:
  GGML_N_THREADS: 1

jobs:
  macOS-latest-cmake-arm64:
    runs-on: macos-14

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Dependencies
        id: depends
        continue-on-error: true
        run: |
          brew update

      - name: Build
        id: cmake_build
        run: |
          sysctl -a
          mkdir build
          cd build
          cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

      - name: Test
        id: cmake_test
        run: |
          cd build
          ctest -L 'main|curl' --verbose --timeout 900

      - name: Determine tag name
        id: tag
        shell: bash
        run: |
          BUILD_NUMBER="$(git rev-list --count HEAD)"
          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
          else
            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
          fi

      - name: Pack artifacts
        id: pack_artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        run: |
          cp LICENSE ./build/bin/
          zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*

      - name: Upload artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        uses: actions/upload-artifact@v4
        with:
          path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
          name: llama-bin-macos-arm64.zip

  macOS-latest-cmake-x64:
    runs-on: macos-12

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Dependencies
        id: depends
        continue-on-error: true
        run: |
          brew update

      - name: Build
        id: cmake_build
        run: |
          sysctl -a
          # Metal is disabled due to intermittent failures with Github runners not having a GPU:
          # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
          cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

      - name: Test
        id: cmake_test
        run: |
          cd build
          ctest -L main --verbose --timeout 900

      - name: Determine tag name
        id: tag
        shell: bash
        run: |
          BUILD_NUMBER="$(git rev-list --count HEAD)"
          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
          else
            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
          fi

      - name: Pack artifacts
        id: pack_artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        run: |
          cp LICENSE ./build/bin/
          zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*

      - name: Upload artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        uses: actions/upload-artifact@v4
        with:
          path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
          name: llama-bin-macos-x64.zip

  ubuntu-focal-make:
    runs-on: ubuntu-20.04
    env:
      LLAMA_NODE_AVAILABLE: true
      LLAMA_PYTHON_AVAILABLE: true

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4

      - name: Dependencies
        id: depends
        run: |
          sudo apt-get update
          sudo apt-get install build-essential gcc-8

      - uses: actions/setup-node@v4
        with:
          node-version: "20"

      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"

      - name: Build
        id: make_build
        env:
          LLAMA_FATAL_WARNINGS: 1
        run: |
          CC=gcc-8 make -j $(nproc)

      - name: Test
        id: make_test
        run: |
          CC=gcc-8 make tests -j $(nproc)
          make test -j $(nproc)

  ubuntu-focal-make-curl:
    runs-on: ubuntu-20.04

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4

      - name: Dependencies
        id: depends
        run: |
          sudo apt-get update
          sudo apt-get install build-essential gcc-8 libcurl4-openssl-dev

      - name: Build
        id: make_build
        env:
          LLAMA_FATAL_WARNINGS: 1
          LLAMA_CURL: 1
        run: |
          CC=gcc-8 make -j $(nproc)

  ubuntu-latest-cmake:
    runs-on: ubuntu-latest

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Dependencies
        id: depends
        run: |
          sudo apt-get update
          sudo apt-get install build-essential libcurl4-openssl-dev

      - name: Build
        id: cmake_build
        run: |
          mkdir build
          cd build
          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON
          cmake --build . --config Release -j $(nproc)

      - name: Test
        id: cmake_test
        run: |
          cd build
          ctest -L 'main|curl' --verbose --timeout 900

      - name: Test llama2c conversion
        id: llama2c_test
        run: |
          cd build
          echo "Fetch tokenizer"
          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
          echo "Fetch llama2c model"
          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
          ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
          ./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256

      - name: Determine tag name
        id: tag
        shell: bash
        run: |
          BUILD_NUMBER="$(git rev-list --count HEAD)"
          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
          else
            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
          fi

      - name: Pack artifacts
        id: pack_artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        run: |
          cp LICENSE ./build/bin/
          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/*

      - name: Upload artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        uses: actions/upload-artifact@v4
        with:
          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip
          name: llama-bin-ubuntu-x64.zip

  ubuntu-latest-cmake-sanitizer:
    runs-on: ubuntu-latest

    continue-on-error: true

    strategy:
      matrix:
        sanitizer: [ADDRESS, THREAD, UNDEFINED]
        build_type: [Debug, Release]

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4

      - name: Dependencies
        id: depends
        run: |
          sudo apt-get update
          sudo apt-get install build-essential

      - name: Build
        id: cmake_build
        if: ${{ matrix.sanitizer != 'THREAD' }}
        run: |
          mkdir build
          cd build
          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
          cmake --build . --config ${{ matrix.build_type }} -j $(nproc)

      - name: Build (no OpenMP)
        id: cmake_build_no_openmp
        if: ${{ matrix.sanitizer == 'THREAD' }}
        run: |
          mkdir build
          cd build
          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DLLAMA_OPENMP=OFF
          cmake --build . --config ${{ matrix.build_type }} -j $(nproc)

      - name: Test
        id: cmake_test
        run: |
          cd build
          ctest -L main --verbose --timeout 900

  ubuntu-latest-cmake-rpc:
    runs-on: ubuntu-latest

    continue-on-error: true

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4

      - name: Dependencies
        id: depends
        run: |
          sudo apt-get update
          sudo apt-get install build-essential

      - name: Build
        id: cmake_build
        run: |
          mkdir build
          cd build
          cmake -DLLAMA_RPC=ON ..
          cmake --build . --config Release -j $(nproc)

      - name: Test
        id: cmake_test
        run: |
          cd build
          ctest -L main --verbose

  ubuntu-22-cmake-vulkan:
    runs-on: ubuntu-22.04

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4

      - name: Dependencies
        id: depends
        run: |
          sudo apt-get update
          sudo apt-get install build-essential libvulkan-dev

      - name: Build
        id: cmake_build
        run: |
          mkdir build
          cd build
          cmake -DLLAMA_VULKAN=ON ..
          cmake --build . --config Release -j $(nproc)

  ubuntu-22-cmake-hip:
    runs-on: ubuntu-22.04
    container: rocm/dev-ubuntu-22.04:6.0.2

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v3

      - name: Dependencies
        id: depends
        run: |
          sudo apt-get update
          sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev

      - name: Build with native CMake HIP support
        id: cmake_build
        run: |
          cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DLLAMA_HIPBLAS=ON
          cmake --build build --config Release -j $(nproc)

      - name: Build with legacy HIP support
        id: cmake_build_legacy_hip
        run: |
          cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DLLAMA_HIPBLAS=ON
          cmake --build build2 --config Release -j $(nproc)

  ubuntu-22-cmake-sycl:
    runs-on: ubuntu-22.04

    continue-on-error: true

    steps:
      - uses: actions/checkout@v2

      - name: add oneAPI to apt
        shell: bash
        run: |
          cd /tmp
          wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
          sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
          rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
          sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"

      - name: install oneAPI dpcpp compiler
        shell: bash
        run: |
          sudo apt update
          sudo apt install intel-oneapi-compiler-dpcpp-cpp

      - name: install oneAPI MKL library
        shell: bash
        run: |
          sudo apt install intel-oneapi-mkl-devel

      - name: Clone
        id: checkout
        uses: actions/checkout@v4

      - name: Build
        id: cmake_build
        run: |
          source /opt/intel/oneapi/setvars.sh
          mkdir build
          cd build
          cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
          cmake --build . --config Release -j $(nproc)

  ubuntu-22-cmake-sycl-fp16:
    runs-on: ubuntu-22.04

    continue-on-error: true

    steps:
      - uses: actions/checkout@v2

      - name: add oneAPI to apt
        shell: bash
        run: |
          cd /tmp
          wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
          sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
          rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
          sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"

      - name: install oneAPI dpcpp compiler
        shell: bash
        run: |
          sudo apt update
          sudo apt install intel-oneapi-compiler-dpcpp-cpp

      - name: install oneAPI MKL library
        shell: bash
        run: |
          sudo apt install intel-oneapi-mkl-devel

      - name: Clone
        id: checkout
        uses: actions/checkout@v4

      - name: Build
        id: cmake_build
        run: |
          source /opt/intel/oneapi/setvars.sh
          mkdir build
          cd build
          cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON ..
          cmake --build . --config Release -j $(nproc)

  # TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
  # how to debug it.
  # ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
  macOS-latest-make:
    runs-on: macos-latest

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4

      - name: Dependencies
        id: depends
        continue-on-error: true
        run: |
          brew update

      - name: Build
        id: make_build
        env:
          LLAMA_FATAL_WARNINGS: 1
        run: |
          LLAMA_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu)

      - name: Test
        id: make_test
        run: |
          LLAMA_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu)
          LLAMA_NO_METAL=1 make test -j $(sysctl -n hw.logicalcpu)

  # TODO: build with LLAMA_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
  # how to debug it.
  # ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
  # would be great if we fix these
  macOS-latest-cmake:
    runs-on: macos-latest

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4

      - name: Dependencies
        id: depends
        continue-on-error: true
        run: |
          brew update

      - name: Build
        id: cmake_build
        run: |
          sysctl -a
          mkdir build
          cd build
          cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF ..
          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

      - name: Test
        id: cmake_test
        run: |
          cd build
          ctest -L main --verbose --timeout 900

  macOS-latest-cmake-ios:
    runs-on: macos-latest

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v1

      - name: Dependencies
        id: depends
        continue-on-error: true
        run: |
          brew update

      - name: Build
        id: cmake_build
        run: |
          sysctl -a
          mkdir build
          cd build
          cmake -G Xcode .. \
            -DLLAMA_METAL_EMBED_LIBRARY=ON \
            -DLLAMA_BUILD_EXAMPLES=OFF \
            -DLLAMA_BUILD_TESTS=OFF \
            -DLLAMA_BUILD_SERVER=OFF \
            -DCMAKE_SYSTEM_NAME=iOS \
            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

  macOS-latest-cmake-tvos:
    runs-on: macos-latest

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v1

      - name: Dependencies
        id: depends
        continue-on-error: true
        run: |
          brew update

      - name: Build
        id: cmake_build
        run: |
          sysctl -a
          mkdir build
          cd build
          cmake -G Xcode .. \
            -DLLAMA_METAL_EMBED_LIBRARY=ON \
            -DLLAMA_BUILD_EXAMPLES=OFF \
            -DLLAMA_BUILD_TESTS=OFF \
            -DLLAMA_BUILD_SERVER=OFF \
            -DCMAKE_SYSTEM_NAME=tvOS \
            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

  macOS-latest-swift:
    runs-on: macos-latest

    strategy:
      matrix:
        destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS']

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v1

      - name: Dependencies
        id: depends
        continue-on-error: true
        run: |
          brew update

      - name: xcodebuild for swift package
        id: xcodebuild
        run: |
          xcodebuild -scheme llama -destination "${{ matrix.destination }}"

      - name: Build Swift Example
        id: make_build_swift_example
        run: |
          make swift

  windows-msys2:
    runs-on: windows-latest

    strategy:
      fail-fast: false
      matrix:
        include:
          - { sys: UCRT64, env: ucrt-x86_64, build: Release }
          - { sys: CLANG64, env: clang-x86_64, build: Release }

    steps:
      - name: Clone
        uses: actions/checkout@v4

      - name: Setup ${{ matrix.sys }}
        uses: msys2/setup-msys2@v2
        with:
          update: true
          msystem: ${{matrix.sys}}
          install: >-
            base-devel
            mingw-w64-${{matrix.env}}-toolchain
            mingw-w64-${{matrix.env}}-cmake
            mingw-w64-${{matrix.env}}-openblas

      - name: Build using make
        shell: msys2 {0}
        run: |
          make -j $(nproc)

      - name: Clean after building using make
        shell: msys2 {0}
        run: |
          make clean

      - name: Build using make w/ OpenBLAS
        shell: msys2 {0}
        run: |
          make LLAMA_OPENBLAS=1 -j $(nproc)

      - name: Build using CMake
        shell: msys2 {0}
        run: |
          cmake -B build
          cmake --build build --config ${{ matrix.build }} -j $(nproc)

      - name: Clean after building using CMake
        shell: msys2 {0}
        run: |
          rm -rf build

      - name: Build using CMake w/ OpenBLAS
        shell: msys2 {0}
        run: |
          cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
          cmake --build build --config ${{ matrix.build }} -j $(nproc)

  windows-latest-cmake:
    runs-on: windows-2019

@@ -990,40 +334,6 @@ jobs:
          cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DLLAMA_HIPBLAS=ON
          cmake --build build --config Release

  ios-xcode-build:
    runs-on: macos-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Build Xcode project
        run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build

  android-build:
    runs-on: ubuntu-latest

    steps:
      - name: Clone
        uses: actions/checkout@v4

      - name: Set up JDK
        uses: actions/setup-java@v3
        with:
          java-version: 17
          distribution: zulu

      - name: Setup Android SDK
        uses: android-actions/setup-android@v3
        with:
          log-accepted-android-sdk-licenses: false

      - name: Build
        run: |
          cd examples/llama.android

          ./gradlew build --no-daemon

# freeBSD-latest:
# runs-on: macos-12
# steps:

@@ -1047,14 +357,8 @@ jobs:
    runs-on: ubuntu-latest

    needs:
      - ubuntu-focal-make
      - ubuntu-latest-cmake
      - macOS-latest-make
      - macOS-latest-cmake
      - windows-latest-cmake
      - windows-latest-cmake-cuda
      - macOS-latest-cmake-arm64
      - macOS-latest-cmake-x64

    steps:
      - name: Clone

23  .github/workflows/close-issue.yml  (vendored)
@@ -1,23 +0,0 @@
name: Close inactive issues
on:
  schedule:
    - cron: "42 0 * * *"

jobs:
  close-issues:
    runs-on: ubuntu-latest
    permissions:
      issues: write
      pull-requests: write
    steps:
      - uses: actions/stale@v5
        with:
          exempt-issue-labels: "refactor,help wanted,good first issue,research,bug"
          days-before-issue-stale: 30
          days-before-issue-close: 14
          stale-issue-label: "stale"
          close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale."
          days-before-pr-stale: -1
          days-before-pr-close: -1
          operations-per-run: 10000
          repo-token: ${{ secrets.GITHUB_TOKEN }}

40  .github/workflows/code-coverage.yml  (vendored)
@@ -1,40 +0,0 @@
name: Code Coverage
on: [push, pull_request]

env:
  GGML_NLOOP: 3
  GGML_N_THREADS: 1

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

jobs:
  run:
    runs-on: ubuntu-20.04
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Dependencies
        run: |
          sudo apt-get update
          sudo apt-get install build-essential gcc-8 lcov

      - name: Build
        run: CC=gcc-8 make -j LLAMA_CODE_COVERAGE=1 tests

      - name: Run tests
        run: CC=gcc-8 make test

      - name: Generate coverage report
        run: |
          make coverage
          make lcov-report

      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v3
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
        with:
          files: lcov-report/coverage.info

117  .github/workflows/docker.yml  (vendored)
@@ -1,117 +0,0 @@
# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.

# GitHub recommends pinning actions to a commit SHA.
# To get a newer version, you will need to update the SHA.
# You can also reference a tag or branch, but the action may change without warning.

name: Publish Docker image

on:
  pull_request:
  push:
    branches:
      - master

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

jobs:
  push_to_registry:
    name: Push Docker image to Docker Hub
    if: github.event.pull_request.draft == false

    runs-on: ubuntu-latest
    env:
      COMMIT_SHA: ${{ github.sha }}
    strategy:
      matrix:
        config:
          - { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          # NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
          # have disabled them for now until the reason why
          # is understood.
          - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
          - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
          - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
          - { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
          - { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
    steps:
      - name: Check out the repo
        uses: actions/checkout@v4

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v2

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2

      - name: Log in to Docker Hub
        uses: docker/login-action@v2
        with:
          registry: ghcr.io
          username: ${{ github.repository_owner }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example
      - name: Free Disk Space (Ubuntu)
        uses: jlumbroso/free-disk-space@main
        with:
          # this might remove tools that are actually needed,
          # if set to "true" but frees about 6 GB
          tool-cache: false

          # all of these default to true, but feel free to set to
          # "false" if necessary for your workflow
          android: true
          dotnet: true
          haskell: true
          large-packages: true
          docker-images: true
          swap-storage: true

      - name: Determine tag name
        id: tag
        shell: bash
        run: |
          BUILD_NUMBER="$(git rev-list --count HEAD)"
          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
          else
            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
          fi

      - name: Downcase github.repository_owner
        run: |
          echo "repository_owner_lowercase=${GITHUB_REPOSITORY_OWNER@L}" >> $GITHUB_ENV
        env:
          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'

      - name: Build and push Docker image (versioned)
        if: github.event_name == 'push'
        uses: docker/build-push-action@v4
        with:
          context: .
          push: true
          platforms: ${{ matrix.config.platforms }}
          tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
          file: ${{ matrix.config.dockerfile }}

      - name: Build and push Docker image (tagged)
        uses: docker/build-push-action@v4
        with:
          context: .
          push: ${{ github.event_name == 'push' }}
          platforms: ${{ matrix.config.platforms }}
          tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
          file: ${{ matrix.config.dockerfile }}

27  .github/workflows/editorconfig.yml  (vendored)
@@ -1,27 +0,0 @@
name: EditorConfig Checker

on:
  workflow_dispatch: # allows manual triggering
    inputs:
      create_release:
        description: 'Create new release'
        required: true
        type: boolean
  push:
    branches:
      - master
  pull_request:
    branches:
      - master

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

jobs:
  editorconfig:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: editorconfig-checker/action-editorconfig-checker@main
      - run: editorconfig-checker

44  .github/workflows/gguf-publish.yml  (vendored)
@@ -1,44 +0,0 @@
# This workflow will upload a Python Package using Twine when a GGUF release is created
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries

# See `gguf-py/README.md` for how to make a release.

# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.

name: Upload Python Package

on:
  workflow_dispatch:
  push:
    # Pattern matched against refs/tags
    tags:
      - 'gguf-v*' # Push events to every version tag


jobs:
  deploy:

    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.9.x'
      - name: Install dependencies
        run: |
          cd gguf-py
          python -m pip install poetry
          poetry install

      - name: Build package
        run: cd gguf-py && poetry build
      - name: Publish package
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          password: ${{ secrets.PYPI_API_TOKEN }}
          packages-dir: gguf-py/dist

17  .github/workflows/labeler.yml  (vendored)
@@ -1,17 +0,0 @@
name: "Pull Request Labeler"
on:
  - pull_request_target

jobs:
  labeler:
    permissions:
      contents: read
      pull-requests: write
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          repository: "ggerganov/llama.cpp"
      - uses: actions/labeler@v5
        with:
          configuration-path: '.github/labeler.yml'

65  .github/workflows/nix-ci-aarch64.yml  (vendored)
@@ -1,65 +0,0 @@
name: Nix aarch64 builds

on:
  workflow_dispatch: # allows manual triggering
  schedule:
    # Rebuild daily rather than on every push because QEMU is expensive (e.g.
    # 1.5h instead of minutes with the cold cache).
    #
    # randint(0, 59), randint(0, 23)
    - cron: '26 12 * * *'
  # But also rebuild if we touched any of the Nix expressions:
  push:
    branches:
      - master
    paths: ['**/*.nix', 'flake.lock']
  pull_request:
    types: [opened, synchronize, reopened]
    paths: ['**/*.nix', 'flake.lock']

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

jobs:
  nix-build-aarch64:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Install QEMU
        # Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654
        run: |
          sudo apt-get update
          sudo apt-get install -y qemu-user-static qemu-system-aarch64
          sudo usermod -a -G kvm $USER
      - name: Install Nix
        uses: DeterminateSystems/nix-installer-action@v9
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          extra-conf: |
            extra-platforms = aarch64-linux
            extra-system-features = nixos-test kvm
            extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
            extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
      - uses: DeterminateSystems/magic-nix-cache-action@v2
        with:
          upstream-cache: https://${{ matrix.cachixName }}.cachix.org
      - name: Set-up cachix to push the results to
        uses: cachix/cachix-action@v13
        with:
          authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
          name: llama-cpp
      - name: Show all output paths
        run: >
          nix run github:nix-community/nix-eval-jobs
          -- --gc-roots-dir gcroot
          --flake
          ".#packages.aarch64-linux"
      - name: Build
        run: >
          nix run github:Mic92/nix-fast-build
          -- --skip-cached --no-nom
          --systems aarch64-linux
          --flake
          ".#checks.aarch64-linux"

72  .github/workflows/nix-ci.yml  (vendored)
@@ -1,72 +0,0 @@
name: Nix CI

on:
  workflow_dispatch: # allows manual triggering
  push:
    branches:
      - master
  pull_request:
    types: [opened, synchronize, reopened]

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

jobs:
  nix-eval:
    strategy:
      fail-fast: false
      matrix:
        os: [ ubuntu-latest, macos-latest ]
    runs-on: ${{ matrix.os }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Install Nix
        uses: DeterminateSystems/nix-installer-action@v9
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          extra-conf: |
            extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
            extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
      - uses: DeterminateSystems/magic-nix-cache-action@v2
        with:
          upstream-cache: https://${{ matrix.cachixName }}.cachix.org
      - name: List all flake outputs
        run: nix flake show --all-systems
      - name: Show all output paths
        run: >
          nix run github:nix-community/nix-eval-jobs
          -- --gc-roots-dir gcroot
          --flake
          ".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)"
  nix-build:
    strategy:
      fail-fast: false
      matrix:
        os: [ ubuntu-latest, macos-latest ]
    runs-on: ${{ matrix.os }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Install Nix
        uses: DeterminateSystems/nix-installer-action@v9
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          extra-conf: |
            extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
            extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
      - uses: DeterminateSystems/magic-nix-cache-action@v2
        with:
          upstream-cache: https://${{ matrix.cachixName }}.cachix.org
      - name: Set-up cachix to push the results to
        uses: cachix/cachix-action@v13
        with:
          authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
          name: llama-cpp
      - name: Build
        run: >
          nix run github:Mic92/nix-fast-build
          -- --skip-cached --no-nom
          --flake
          ".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)"

22  .github/workflows/nix-flake-update.yml  (vendored)
@@ -1,22 +0,0 @@
name: update-flake-lock
on:
  workflow_dispatch:
  schedule:
    - cron: '0 0 * * 0' # runs weekly on Sunday at 00:00

jobs:
  lockfile:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Install Nix
        uses: DeterminateSystems/nix-installer-action@main
      - name: Update flake.lock
        uses: DeterminateSystems/update-flake-lock@main
        with:
          pr-title: "nix: update flake.lock"
          pr-labels: |
            nix
          pr-reviewers: philiptaron,SomeoneSerge
          token: ${{ secrets.FLAKE_TOKEN }}

36  .github/workflows/nix-publish-flake.yml  (vendored)
@@ -1,36 +0,0 @@
# Make the flake discoverable on https://flakestry.dev and https://flakehub.com/flakes
name: "Publish a flake to flakestry & flakehub"
on:
  push:
    tags:
      - "*"
  workflow_dispatch:
    inputs:
      tag:
        description: "The existing tag to publish"
        type: "string"
        required: true
jobs:
  flakestry-publish:
    runs-on: ubuntu-latest
    permissions:
      id-token: "write"
      contents: "read"
    steps:
      - uses: flakestry/flakestry-publish@main
        with:
          version: "${{ inputs.tag || github.ref_name }}"
  flakehub-publish:
    runs-on: "ubuntu-latest"
    permissions:
      id-token: "write"
      contents: "read"
    steps:
      - uses: "actions/checkout@v4"
        with:
          ref: "${{ (inputs.tag != null) && format('refs/tags/{0}', inputs.tag) || '' }}"
      - uses: "DeterminateSystems/nix-installer-action@main"
      - uses: "DeterminateSystems/flakehub-push@main"
        with:
          visibility: "public"
          tag: "${{ inputs.tag }}"

35  .github/workflows/python-check-requirements.yml  (vendored)
@@ -1,35 +0,0 @@
name: Python check requirements.txt

on:
  push:
    paths:
      - '.github/workflows/python-check-requirements.yml'
      - 'scripts/check-requirements.sh'
      - 'convert*.py'
      - 'requirements.txt'
      - 'requirements/*.txt'
  pull_request:
    paths:
      - '.github/workflows/python-check-requirements.yml'
      - 'scripts/check-requirements.sh'
      - 'convert*.py'
      - 'requirements.txt'
      - 'requirements/*.txt'

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

jobs:
  python-check-requirements:
    runs-on: ubuntu-latest
    name: check-requirements
    steps:
      - name: Check out source repository
        uses: actions/checkout@v4
      - name: Set up Python environment
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: Run check-requirements.sh script
        run: bash scripts/check-requirements.sh

23  .github/workflows/python-lint.yml  (vendored)
@@ -1,23 +0,0 @@
name: flake8 Lint

on: [push, pull_request]

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

jobs:
  flake8-lint:
    runs-on: ubuntu-latest
    name: Lint
    steps:
      - name: Check out source repository
        uses: actions/checkout@v4
      - name: Set up Python environment
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: flake8 Lint
        uses: py-actions/flake8@v2
        with:
          plugins: "flake8-no-print"

169 .github/workflows/server.yml vendored
@ -1,169 +0,0 @@
# Server build and tests
name: Server

on:
  workflow_dispatch: # allows manual triggering
    inputs:
      sha:
        description: 'Commit SHA1 to build'
        required: false
        type: string
      slow_tests:
        description: 'Run slow tests'
        required: true
        type: boolean
  push:
    branches:
      - master
    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
  pull_request:
    types: [opened, synchronize, reopened]
    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  server:
    runs-on: ubuntu-latest

    strategy:
      matrix:
        sanitizer: [ADDRESS, THREAD, UNDEFINED]
        build_type: [RelWithDebInfo]
        include:
          - build_type: Release
            sanitizer: ""
      fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken

    steps:
      - name: Dependencies
        id: depends
        run: |
          sudo apt-get update
          sudo apt-get -y install \
            build-essential \
            xxd \
            git \
            cmake \
            curl \
            wget \
            language-pack-en \
            libcurl4-openssl-dev

      - name: Clone
        id: checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

      - name: Python setup
        id: setup_python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Tests dependencies
        id: test_dependencies
        run: |
          pip install -r examples/server/tests/requirements.txt

      - name: Verify server deps
        id: verify_server_deps
        run: |
          git config --global --add safe.directory $(realpath .)
          cd examples/server
          git ls-files --others --modified
          git status
          ./deps.sh
          git status
          not_ignored_files="$(git ls-files --others --modified)"
          echo "Modified files: ${not_ignored_files}"
          if [ -n "${not_ignored_files}" ]; then
            echo "Repository is dirty or server deps are not built as expected"
            echo "${not_ignored_files}"
            exit 1
          fi

      - name: Build
        id: cmake_build
        run: |
          cmake -B build \
            -DLLAMA_NATIVE=OFF \
            -DLLAMA_BUILD_SERVER=ON \
            -DLLAMA_CURL=ON \
            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server

      - name: Tests
        id: server_integration_tests
        run: |
          cd examples/server/tests
          PORT=8888 ./tests.sh

      - name: Slow tests
        id: server_integration_tests_slow
        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
        run: |
          cd examples/server/tests
          PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow


  server-windows:
    runs-on: windows-2019

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

      - name: libCURL
        id: get_libcurl
        env:
          CURL_VERSION: 8.6.0_6
        run: |
          curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-win64-mingw.zip"
          mkdir $env:RUNNER_TEMP/libcurl
          tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl

      - name: Build
        id: cmake_build
        run: |
          cmake -B build -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server

      - name: Python setup
        id: setup_python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Tests dependencies
        id: test_dependencies
        run: |
          pip install -r examples/server/tests/requirements.txt

      - name: Copy Libcurl
        id: prepare_libcurl
        run: |
          cp $env:RUNNER_TEMP/libcurl/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll

      - name: Tests
        id: server_integration_tests
        if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
        run: |
          cd examples/server/tests
          behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp

      - name: Slow tests
        id: server_integration_tests_slow
        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
        run: |
          cd examples/server/tests
          behave.exe --stop --no-skipped --no-capture --tags slow
@ -4,6 +4,7 @@ option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
 set(TARGET_SRCS
     server.cpp
+    stoplist.cpp
     utils.hpp
     httplib.h
 )
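The build change above is needed because utils.hpp (further down in this commit) only declares the filter's static stoplist and the global stopped_filter instance; exactly one translation unit in the llama-server target has to define them, and that is what the new stoplist.cpp provides. A minimal sketch of that declaration/definition split, with names taken from the commit but otherwise illustrative:

    // Header side (what utils.hpp declares):
    #include <set>

    class SWordsFilter {
        static std::set<const char *> stoplist;   // declaration only
    public:
        SWordsFilter() {}
    };
    extern SWordsFilter stopped_filter;           // declaration only

    // Source side (what stoplist.cpp defines, once, inside the llama-server target):
    std::set<const char *> SWordsFilter::stoplist = { "<|im_end|>" };
    SWordsFilter stopped_filter;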
10 examples/server/stoplist.cpp Normal file
@ -0,0 +1,10 @@
#include "utils.hpp"

std::set<const char *> SWordsFilter::stoplist = {
    "<|endoftext|>",
    "<|im_end|>",
    "<|startoftext|>",
    "<|im_start|>"
};

SWordsFilter stopped_filter;
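The new translation unit seeds the filter with the usual ChatML/GPT special tokens and instantiates the global stopped_filter, whose constructor (see the class added to utils.hpp below) loads any extra stopwords before main() runs. One property worth keeping in mind: std::set<const char *> orders and deduplicates by pointer value, not by string contents, so entries read from a file can textually duplicate the defaults; that is harmless here, it only means the same text may be scanned twice. An illustrative, self-contained snippet, not part of the commit:

    #include <cstdio>
    #include <set>

    int main() {
        std::set<const char *> list = { "<|im_end|>" };
        char dup[] = "<|im_end|>";             // same text, different address
        list.insert(dup);
        std::printf("%zu\n", list.size());     // prints 2, not 1: the set keys on pointers
        return 0;
    }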
@ -8,9 +8,11 @@
 #include "json.hpp"

 #include <string>
+#include <cstdlib>
 #include <vector>
 #include <sstream>
 #include <random>
+#include <set>

 #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"

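The two added includes cover getenv() (<cstdlib>) and the stoplist container (<set>). The filter class added further down also uses std::map, the C string routines and stdio; in this tree they are presumably reached transitively through the existing headers, but if a stricter toolchain complains, the candidates to add would be the following (my assumption, not lines from the commit):

    #include <map>       // SWordsFilter::scache
    #include <cstring>   // strstr, strlen, strdup
    #include <cstdio>    // fopen, fgets, fclose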
@ -430,7 +432,107 @@ static json oaicompat_completion_params_parse(
     return llama_params;
 }

-static json format_final_response_oaicompat(const json & request, json result, const std::string & completion_id, bool streaming = false) {
+class SWordsFilter {
+    std::map<std::string, std::string> scache;
+    static std::set<const char *> stoplist;
+    static size_t strcmpn(const char * a, const char * b, bool & nostop) {
+        nostop = false;
+        int k = 0;
+        while (*b) {
+            if (*a) {
+                if (*a == *b) {
+                    k++;
+                    a++;
+                    nostop = false;
+                } else {
+                    nostop = true;
+                }
+            }
+            b++;
+        }
+        return k;
+    }
+    static std::string replace_all(
+        const std::string & content, const std::string & from, const std::string & to
+    ) {
+        std::string ret;
+        size_t pos = 0;
+        size_t last = 0;
+        while ((pos = content.find(from, last)) != std::string::npos) {
+            ret += content.substr(last, pos - last);
+            ret += to;
+            last = pos + from.size();
+        }
+        ret += content.substr(last);
+        return ret;
+    }
+public:
+    static void yx_simple_filter_init() {
+        char * fname;
+        fname = getenv("LLAMA_CPP_SERVER_STOPWORDS");
+        do {
+            if (fname != NULL) {
+                FILE * f = fopen(fname, "r");
+                if (f == NULL) {
+                    LOG_WARNING("failed to open stopword file", {{"file", fname}});
+                    break;
+                }
+                char buf[1024];
+                while (fgets(buf, 1024, f)) {
+                    buf[strlen(buf)-1] = 0;
+                    stoplist.insert(strdup(buf));
+                }
+                fclose(f);
+            }
+        } while (false);
+        LOG_INFO("initialized stopwords filter module by Y.X.",
+            {{"stoplist_size", stoplist.size()},
+             {"file", fname == NULL ? "default" : fname},}
+        );
+    }
+    void yx_simple_filter(std::string & content, const std::string & uid) {
+        if (content.size() == 0 || stoplist.size() == 0) {
+            return;
+        }
+        if (scache.find(uid) != scache.end()) {
+            content = scache[uid] + content;
+            scache[uid] = "";
+        }
+        bool cache = false;
+        bool g_nostop = true;
+        size_t max_allow = 0x7fffffff;
+        for (const auto * s : stoplist) {
+            const char * cont = content.c_str();
+            if (strstr(cont, s)) {
+                content = replace_all(content, s, "");
+                LOG_INFO("hit stopword", {{"stopword", s}});
+            }
+        }
+        for (const auto * s : stoplist) {
+            bool nostop;
+            const char * cont = content.c_str();
+            auto k = strcmpn(s, cont, nostop);
+            if (k > 0) {
+                g_nostop = g_nostop && nostop;
+                cache = true;
+            }
+            max_allow = std::min(max_allow, strlen(cont) - k);
+        }
+        if (cache && !g_nostop) {
+            scache[uid] = content.substr(max_allow);
+            content = content.substr(0, max_allow);
+            const char * ctx2 = scache[uid].c_str();
+            LOG_INFO("cache stopword", {{"content", ctx2}});
+        }
+    }
+    SWordsFilter() {
+        yx_simple_filter_init();
+    }
+};
+extern SWordsFilter stopped_filter;
+
+static json format_final_response_oaicompat(const json & request, const json & result, const std::string & completion_id, bool streaming = false) {
     bool stopped_word = result.count("stopped_word") != 0;
     bool stopped_eos = json_value(result, "stopped_eos", false);
     int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
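The filter works on the decoded "content" string rather than on token ids. The initializer run from the constructor only extends the built-in stoplist with whatever file LLAMA_CPP_SERVER_STOPWORDS points at; yx_simple_filter() then removes every stop sequence found in a chunk via replace_all(), and strcmpn() estimates how many trailing characters could be the start of a stop sequence, so that an ambiguous tail can be parked in scache under the caller-supplied id and re-prepended to the next chunk. Note that stoplist and scache are not guarded by any lock, so concurrent completions sharing the single stopped_filter rely on the server's existing serialization. A minimal usage sketch; it assumes it is built inside examples/server, linked with stoplist.cpp and the objects that define the logging globals used by LOG_INFO, and run with LLAMA_CPP_SERVER_STOPWORDS unset (the completion id is invented):

    #include "utils.hpp"

    #include <cassert>
    #include <string>

    int main() {
        // A stop sequence that arrives inside a single chunk is cut out in place.
        std::string chunk = "Hello<|im_end|> world";
        stopped_filter.yx_simple_filter(chunk, "cmpl-demo-1");
        assert(chunk == "Hello world");
        return 0;
    }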
@ -441,6 +543,8 @@ static json format_final_response_oaicompat(const json & request, json result, c
     if (stopped_word || stopped_eos) {
         finish_reason = "stop";
     }
+    // Add stopwords filter
+    stopped_filter.yx_simple_filter(content, completion_id);

     json choices =
         streaming ? json::array({json{{"finish_reason", finish_reason},
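With this hook, the final OpenAI-compatible response is filtered right before the choices array is built, against either the four defaults from stoplist.cpp or whatever list LLAMA_CPP_SERVER_STOPWORDS names. A configuration example; the file name is hypothetical, and the contents are shown as a string literal only to keep the examples in one language. Each entry must sit on its own newline-terminated, non-empty line, because the loader unconditionally chops the last character of every line it reads:

    // Contents of a hypothetical stopwords.txt passed via LLAMA_CPP_SERVER_STOPWORDS:
    static const char * example_stopwords_txt =
        "<|endoftext|>\n"
        "<|im_end|>\n"
        "### User:\n";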
@ -479,7 +583,7 @@ static json format_final_response_oaicompat(const json & request, json result, c
 }

 // return value is vector as there is one case where we might need to generate two responses
-static std::vector<json> format_partial_response_oaicompat(json result, const std::string & completion_id) {
+static std::vector<json> format_partial_response_oaicompat(const json & result, const std::string & completion_id) {
     if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
         return std::vector<json>({result});
     }
@ -500,6 +604,9 @@ static std::vector<json> format_partial_response_oaicompat(json result, const st
         finish_reason = "length";
     }

+    // Add stopwords filter
+    stopped_filter.yx_simple_filter(content, completion_id);
+
     std::time_t t = std::time(0);

     json choices;
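In the streaming path the same call runs once per emitted chunk, keyed by completion_id, which is what the scache tail-caching is for: when a chunk ends in the middle of a possible stop sequence, the ambiguous tail is withheld and glued onto the next chunk before filtering. A sketch of that behaviour under the same assumptions as the earlier snippet (ids and strings are invented; the surviving whitespace simply reflects what surrounded the removed token):

    #include "utils.hpp"

    #include <string>

    int main() {
        // "<|im_end|>" arrives split across two streamed chunks of one completion.
        std::string part1 = "partial <|im_";
        stopped_filter.yx_simple_filter(part1, "cmpl-demo-2");
        // part1 is now "partial "; the suspicious tail "<|im_" was cached for this id.

        std::string part2 = "end|> tail";
        stopped_filter.yx_simple_filter(part2, "cmpl-demo-2");
        // The cached tail is prepended, the reassembled "<|im_end|>" is removed,
        // and part2 comes back as " tail".
        return 0;
    }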