diff --git a/.github/ISSUE_TEMPLATE/bug.md b/.github/ISSUE_TEMPLATE/bug.md
deleted file mode 100644
index 49812832c..000000000
--- a/.github/ISSUE_TEMPLATE/bug.md
+++ /dev/null
@@ -1,11 +0,0 @@
----
-name: Bug template
-about: Used to report bugs in llama.cpp
-labels: ["bug-unconfirmed"]
-assignees: ''
-
----
-
-Please include information about your system, the steps to reproduce the bug, and the version of llama.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug.
-
-If the bug concerns the server, please try to reproduce it first using the [server test scenario framework](https://github.com/ggerganov/llama.cpp/tree/master/examples/server/tests).
diff --git a/.github/ISSUE_TEMPLATE/enhancement.md b/.github/ISSUE_TEMPLATE/enhancement.md
deleted file mode 100644
index dcffda750..000000000
--- a/.github/ISSUE_TEMPLATE/enhancement.md
+++ /dev/null
@@ -1,28 +0,0 @@
----
-name: Enhancement template
-about: Used to request enhancements for llama.cpp
-labels: ["enhancement"]
-assignees: ''
-
----
-
-# Prerequisites
-
-Please answer the following questions for yourself before submitting an issue.
-
-- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
-- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
-- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
-- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share.
-
-# Feature Description
-
-Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
-
-# Motivation
-
-Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
-
-# Possible Implementation
-
-If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 66ad85938..5ff5b119a 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -1,949 +1,31 @@
-name: CI
+#
+# This source file is part of the Stanford Biodesign Digital Health Group open-source organization
+#
+# SPDX-FileCopyrightText: 2022 Stanford University and the project authors (see CONTRIBUTORS.md)
+#
+# SPDX-License-Identifier: MIT
+#
+
+name: Build XCArchive
on:
- workflow_dispatch: # allows manual triggering
- inputs:
- create_release:
- description: 'Create new release'
- required: true
- type: boolean
- push:
- branches:
- - master
- paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
pull_request:
- types: [opened, synchronize, reopened]
- paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
-
-env:
- BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
- GGML_NLOOP: 3
- GGML_N_THREADS: 1
+ workflow_dispatch:
+ workflow_call:
+ inputs:
+ version:
+ description: 'The version number of the framework embedded in the XCArchives.'
+ type: string
+ required: true
jobs:
- ubuntu-focal-make:
- runs-on: ubuntu-20.04
-
- steps:
- - name: Clone
- id: checkout
- uses: actions/checkout@v3
-
- - name: Dependencies
- id: depends
- run: |
- sudo apt-get update
- sudo apt-get install build-essential gcc-8
-
- - name: Build
- id: make_build
- env:
- LLAMA_FATAL_WARNINGS: 1
- run: |
- CC=gcc-8 make -j $(nproc)
-
- - name: Test
- id: make_test
- run: |
- CC=gcc-8 make tests -j $(nproc)
- make test -j $(nproc)
-
- ubuntu-latest-cmake:
- runs-on: ubuntu-latest
-
- steps:
- - name: Clone
- id: checkout
- uses: actions/checkout@v3
-
- - name: Dependencies
- id: depends
- run: |
- sudo apt-get update
- sudo apt-get install build-essential
-
- - name: Build
- id: cmake_build
- run: |
- mkdir build
- cd build
- cmake .. -DLLAMA_FATAL_WARNINGS=ON
- cmake --build . --config Release -j $(nproc)
-
- - name: Test
- id: cmake_test
- run: |
- cd build
- ctest -L main --verbose --timeout 900
-
- ubuntu-latest-cmake-sanitizer:
- runs-on: ubuntu-latest
-
- continue-on-error: true
-
- strategy:
- matrix:
- sanitizer: [ADDRESS, THREAD, UNDEFINED]
- build_type: [Debug, Release]
-
- steps:
- - name: Clone
- id: checkout
- uses: actions/checkout@v3
-
- - name: Dependencies
- id: depends
- run: |
- sudo apt-get update
- sudo apt-get install build-essential
-
- - name: Build
- id: cmake_build
- run: |
- mkdir build
- cd build
- cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
- cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
-
- - name: Test
- id: cmake_test
- run: |
- cd build
- ctest -L main --verbose --timeout 900
-
- ubuntu-latest-cmake-mpi:
- runs-on: ubuntu-latest
-
- continue-on-error: true
-
- strategy:
- matrix:
- mpi_library: [mpich, libopenmpi-dev]
-
- steps:
- - name: Clone
- id: checkout
- uses: actions/checkout@v3
-
- - name: Dependencies
- id: depends
- run: |
- sudo apt-get update
- sudo apt-get install build-essential ${{ matrix.mpi_library }}
-
- - name: Build
- id: cmake_build
- run: |
- mkdir build
- cd build
- cmake -DLLAMA_MPI=ON ..
- cmake --build . --config Release -j $(nproc)
-
- - name: Test
- id: cmake_test
- run: |
- cd build
- ctest -L main --verbose
-
- ubuntu-22-cmake-sycl:
- runs-on: ubuntu-22.04
-
- continue-on-error: true
-
- steps:
- - uses: actions/checkout@v2
-
- - name: add oneAPI to apt
- shell: bash
- run: |
- cd /tmp
- wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
- sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
- rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
- sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
-
- - name: install oneAPI dpcpp compiler
- shell: bash
- run: |
- sudo apt update
- sudo apt install intel-oneapi-compiler-dpcpp-cpp
-
- - name: install oneAPI MKL library
- shell: bash
- run: |
- sudo apt install intel-oneapi-mkl-devel
-
- - name: Clone
- id: checkout
- uses: actions/checkout@v3
-
- - name: Build
- id: cmake_build
- run: |
- source /opt/intel/oneapi/setvars.sh
- mkdir build
- cd build
- cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
- cmake --build . --config Release -j $(nproc)
-
- ubuntu-22-cmake-sycl-fp16:
- runs-on: ubuntu-22.04
-
- continue-on-error: true
-
- steps:
- - uses: actions/checkout@v2
-
- - name: add oneAPI to apt
- shell: bash
- run: |
- cd /tmp
- wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
- sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
- rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
- sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
-
- - name: install oneAPI dpcpp compiler
- shell: bash
- run: |
- sudo apt update
- sudo apt install intel-oneapi-compiler-dpcpp-cpp
-
- - name: install oneAPI MKL library
- shell: bash
- run: |
- sudo apt install intel-oneapi-mkl-devel
-
- - name: Clone
- id: checkout
- uses: actions/checkout@v3
-
- - name: Build
- id: cmake_build
- run: |
- source /opt/intel/oneapi/setvars.sh
- mkdir build
- cd build
- cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON ..
- cmake --build . --config Release -j $(nproc)
-
- # TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
- # how to debug it.
- # ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
- macOS-latest-make:
- runs-on: macos-latest
-
- steps:
- - name: Clone
- id: checkout
- uses: actions/checkout@v3
-
- - name: Dependencies
- id: depends
- continue-on-error: true
- run: |
- brew update
-
- - name: Build
- id: make_build
- env:
- LLAMA_FATAL_WARNINGS: 1
- run: |
- LLAMA_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu)
-
- - name: Test
- id: make_test
- run: |
- LLAMA_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu)
- LLAMA_NO_METAL=1 make test -j $(sysctl -n hw.logicalcpu)
-
- # TODO: build with LLAMA_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
- # how to debug it.
- # ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
- # would be great if we fix these
- macOS-latest-cmake:
- runs-on: macos-latest
-
- steps:
- - name: Clone
- id: checkout
- uses: actions/checkout@v3
-
- - name: Dependencies
- id: depends
- continue-on-error: true
- run: |
- brew update
-
- - name: Build
- id: cmake_build
- run: |
- sysctl -a
- mkdir build
- cd build
- cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF ..
- cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
-
- - name: Test
- id: cmake_test
- run: |
- cd build
- ctest -L main --verbose --timeout 900
-
- macOS-latest-cmake-ios:
- runs-on: macos-latest
-
- steps:
- - name: Clone
- id: checkout
- uses: actions/checkout@v1
-
- - name: Dependencies
- id: depends
- continue-on-error: true
- run: |
- brew update
-
- - name: Build
- id: cmake_build
- run: |
- sysctl -a
- mkdir build
- cd build
- cmake -G Xcode .. \
- -DLLAMA_BUILD_EXAMPLES=OFF \
- -DLLAMA_BUILD_TESTS=OFF \
- -DLLAMA_BUILD_SERVER=OFF \
- -DCMAKE_SYSTEM_NAME=iOS \
- -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
- cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
-
- macOS-latest-cmake-tvos:
- runs-on: macos-latest
-
- steps:
- - name: Clone
- id: checkout
- uses: actions/checkout@v1
-
- - name: Dependencies
- id: depends
- continue-on-error: true
- run: |
- brew update
-
- - name: Build
- id: cmake_build
- run: |
- sysctl -a
- mkdir build
- cd build
- cmake -G Xcode .. \
- -DLLAMA_BUILD_EXAMPLES=OFF \
- -DLLAMA_BUILD_TESTS=OFF \
- -DLLAMA_BUILD_SERVER=OFF \
- -DCMAKE_SYSTEM_NAME=tvOS \
- -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
- cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
-
- macOS-latest-swift:
- runs-on: macos-latest
-
- strategy:
- matrix:
- destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS']
-
- steps:
- - name: Clone
- id: checkout
- uses: actions/checkout@v1
-
- - name: Dependencies
- id: depends
- continue-on-error: true
- run: |
- brew update
-
- - name: xcodebuild for swift package
- id: xcodebuild
- run: |
- xcodebuild -scheme llama -destination "${{ matrix.destination }}"
-
- - name: Build Swift Example
- id: make_build_swift_example
- run: |
- make swift
-
- windows-latest-cmake:
- runs-on: windows-latest
-
- env:
- OPENBLAS_VERSION: 0.3.23
- OPENCL_VERSION: 2023.04.17
- CLBLAST_VERSION: 1.6.0
- SDE_VERSION: 9.33.0-2024-01-07
- VULKAN_VERSION: 1.3.261.1
-
- strategy:
- matrix:
- include:
- - build: 'noavx'
- defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
- - build: 'avx2'
- defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
- - build: 'avx'
- defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
- - build: 'avx512'
- defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
- - build: 'clblast'
- defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
- - build: 'openblas'
- defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
- - build: 'kompute'
- defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
- - build: 'vulkan'
- defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
-
- steps:
- - name: Clone
- id: checkout
- uses: actions/checkout@v3
- with:
- fetch-depth: 0
-
- - name: Clone Kompute submodule
- id: clone_kompute
- if: ${{ matrix.build == 'kompute' }}
- run: |
- git submodule update --init kompute
-
- - name: Download OpenCL SDK
- id: get_opencl
- if: ${{ matrix.build == 'clblast' }}
- run: |
- curl.exe -o $env:RUNNER_TEMP/opencl.zip -L "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/v${env:OPENCL_VERSION}/OpenCL-SDK-v${env:OPENCL_VERSION}-Win-x64.zip"
- mkdir $env:RUNNER_TEMP/opencl
- tar.exe -xvf $env:RUNNER_TEMP/opencl.zip --strip-components=1 -C $env:RUNNER_TEMP/opencl
-
- - name: Download CLBlast
- id: get_clblast
- if: ${{ matrix.build == 'clblast' }}
- run: |
- curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
- curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
- 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/clblast.7z
- rename-item $env:RUNNER_TEMP/CLBlast-${env:CLBLAST_VERSION}-windows-x64 clblast
- foreach ($f in (gci -Recurse -Path "$env:RUNNER_TEMP/clblast" -Filter '*.cmake')) {
- $txt = Get-Content -Path $f -Raw
- $txt.Replace('C:/vcpkg/packages/opencl_x64-windows/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8
- }
-
- - name: Download OpenBLAS
- id: get_openblas
- if: ${{ matrix.build == 'openblas' }}
- run: |
- curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
- curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
- mkdir $env:RUNNER_TEMP/openblas
- tar.exe -xvf $env:RUNNER_TEMP/openblas.zip -C $env:RUNNER_TEMP/openblas
- $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
- $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
- $lib = $(join-path $msvc 'bin\Hostx64\x64\lib.exe')
- & $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll
-
- - name: Install Vulkan SDK
- id: get_vulkan
- if: ${{ matrix.build == 'kompute' || matrix.build == 'vulkan' }}
- run: |
- curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
- & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
- Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
- Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
-
- - name: Build
- id: cmake_build
- run: |
- mkdir build
- cd build
- cmake .. ${{ matrix.defines }}
- cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
-
- - name: Add clblast.dll
- id: add_clblast_dll
- if: ${{ matrix.build == 'clblast' }}
- run: |
- cp $env:RUNNER_TEMP/clblast/lib/clblast.dll ./build/bin/Release
- cp $env:RUNNER_TEMP/CLBlast.LICENSE.txt ./build/bin/Release/CLBlast-${env:CLBLAST_VERSION}.txt
-
- - name: Add libopenblas.dll
- id: add_libopenblas_dll
- if: ${{ matrix.build == 'openblas' }}
- run: |
- cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
- cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
-
- - name: Check AVX512F support
- id: check_avx512f
- if: ${{ matrix.build == 'avx512' }}
- continue-on-error: true
- run: |
- cd build
- $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
- $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
- $cl = $(join-path $msvc 'bin\Hostx64\x64\cl.exe')
- echo 'int main(void){unsigned int a[4];__cpuid(a,7);return !(a[1]&65536);}' >> avx512f.c
- & $cl /O2 /GS- /kernel avx512f.c /link /nodefaultlib /entry:main
- .\avx512f.exe && echo "AVX512F: YES" && ( echo HAS_AVX512F=1 >> $env:GITHUB_ENV ) || echo "AVX512F: NO"
-
- - name: Test
- id: cmake_test
- # not all machines have native AVX-512
- if: ${{ matrix.build != 'clblast' && matrix.build != 'kompute' && matrix.build != 'vulkan' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }}
- run: |
- cd build
- ctest -L main -C Release --verbose --timeout 900
-
- - name: Test (Intel SDE)
- id: cmake_test_sde
- if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
- run: |
- curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
- # for some weird reason windows tar doesn't like sde tar.xz
- 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz
- 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
- $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
- cd build
- & $sde -future -- ctest -L main -C Release --verbose --timeout 900
-
- - name: Determine tag name
- id: tag
- shell: bash
- run: |
- BUILD_NUMBER="$(git rev-list --count HEAD)"
- SHORT_HASH="$(git rev-parse --short=7 HEAD)"
- if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
- echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
- else
- SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
- echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
- fi
-
- - name: Pack artifacts
- id: pack_artifacts
- if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
- run: |
- Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
- 7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
-
- - name: Upload artifacts
- if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
- uses: actions/upload-artifact@v3
- with:
- path: |
- llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip
-
- windows-latest-cmake-cublas:
- runs-on: windows-latest
-
- strategy:
- matrix:
- cuda: ['12.2.0', '11.7.1']
- build: ['cublas']
-
- steps:
- - name: Clone
- id: checkout
- uses: actions/checkout@v3
- with:
- fetch-depth: 0
-
- - uses: Jimver/cuda-toolkit@v0.2.11
- id: cuda-toolkit
- with:
- cuda: ${{ matrix.cuda }}
- method: 'network'
- sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
-
- - name: Build
- id: cmake_build
- run: |
- mkdir build
- cd build
- cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON
- cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
-
- - name: Determine tag name
- id: tag
- shell: bash
- run: |
- BUILD_NUMBER="$(git rev-list --count HEAD)"
- SHORT_HASH="$(git rev-parse --short=7 HEAD)"
- if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
- echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
- else
- SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
- echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
- fi
-
- - name: Pack artifacts
- id: pack_artifacts
- if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
- run: |
- 7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
-
- - name: Upload artifacts
- if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
- uses: actions/upload-artifact@v3
- with:
- path: |
- llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
-
- - name: Copy and pack Cuda runtime
- run: |
- echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
- $dst='.\build\bin\cudart\'
- robocopy "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
- 7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\*
-
- - name: Upload Cuda runtime
- if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
- uses: actions/upload-artifact@v3
- with:
- path: |
- cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
-
- windows-latest-cmake-sycl:
- runs-on: windows-latest
- defaults:
- run:
- shell: bash
-
- env:
- WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/62641e01-1e8d-4ace-91d6-ae03f7f8a71f/w_BaseKit_p_2024.0.0.49563_offline.exe
- WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel
-
-
- steps:
- - name: Clone
- id: checkout
- uses: actions/checkout@v3
- with:
- fetch-depth: 0
-
- - name: Install
- run: scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
-
- - name: Build
- id: cmake_build
- run: examples/sycl/win-build-sycl.bat
-
- ios-xcode-build:
- runs-on: macos-latest
-
- steps:
- - name: Checkout code
- uses: actions/checkout@v3
-
- - name: Build Xcode project
- run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
-
- android-build:
- runs-on: ubuntu-latest
-
- steps:
- - name: Clone
- uses: actions/checkout@v3
-
- - name: Set up JDK
- uses: actions/setup-java@v3
- with:
- java-version: 17
- distribution: zulu
-
- - name: Setup Android SDK
- uses: android-actions/setup-android@v3
- with:
- log-accepted-android-sdk-licenses: false
-
- - name: Build
- run: |
- cd examples/llama.android
-
- ./gradlew build --no-daemon
-
-# freeBSD-latest:
-# runs-on: macos-12
-# steps:
-# - name: Clone
-# uses: actions/checkout@v3
-#
-# - name: Build
-# uses: cross-platform-actions/action@v0.19.0
-# with:
-# operating_system: freebsd
-# version: '13.2'
-# hypervisor: 'qemu'
-# run: |
-# sudo pkg update
-# sudo pkg install -y gmake automake autoconf pkgconf llvm15 clinfo clover opencl clblast openblas
-# gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j `sysctl -n hw.ncpu`
-
- release:
- if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-
- runs-on: ubuntu-latest
-
- needs:
- - ubuntu-focal-make
- - ubuntu-latest-cmake
- - macOS-latest-make
- - macOS-latest-cmake
- - windows-latest-cmake
- - windows-latest-cmake-cublas
-
- steps:
- - name: Clone
- id: checkout
- uses: actions/checkout@v3
- with:
- fetch-depth: 0
-
- - name: Determine tag name
- id: tag
- shell: bash
- run: |
- BUILD_NUMBER="$(git rev-list --count HEAD)"
- SHORT_HASH="$(git rev-parse --short=7 HEAD)"
- if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
- echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
- else
- SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
- echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
- fi
-
- - name: Download artifacts
- id: download-artifact
- uses: actions/download-artifact@v3
-
- - name: Create release
- id: create_release
- uses: anzz1/action-create-release@v1
- env:
- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- with:
- tag_name: ${{ steps.tag.outputs.name }}
-
- - name: Upload release
- id: upload_release
- uses: actions/github-script@v3
- with:
- github-token: ${{secrets.GITHUB_TOKEN}}
- script: |
- const path = require('path');
- const fs = require('fs');
- const release_id = '${{ steps.create_release.outputs.id }}';
- for (let file of await fs.readdirSync('./artifact')) {
- if (path.extname(file) === '.zip') {
- console.log('uploadReleaseAsset', file);
- await github.repos.uploadReleaseAsset({
- owner: context.repo.owner,
- repo: context.repo.repo,
- release_id: release_id,
- name: file,
- data: await fs.readFileSync(`./artifact/${file}`)
- });
- }
- }
-
-# ubuntu-latest-gcc:
-# runs-on: ubuntu-latest
-#
-# strategy:
-# matrix:
-# build: [Debug, Release]
-#
-# steps:
-# - name: Clone
-# uses: actions/checkout@v3
-#
-# - name: Dependencies
-# run: |
-# sudo apt-get update
-# sudo apt-get install build-essential
-# sudo apt-get install cmake
-#
-# - name: Configure
-# run: cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }}
-#
-# - name: Build
-# run: |
-# make
-#
-# ubuntu-latest-clang:
-# runs-on: ubuntu-latest
-#
-# strategy:
-# matrix:
-# build: [Debug, Release]
-#
-# steps:
-# - name: Clone
-# uses: actions/checkout@v3
-#
-# - name: Dependencies
-# run: |
-# sudo apt-get update
-# sudo apt-get install build-essential
-# sudo apt-get install cmake
-#
-# - name: Configure
-# run: cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang
-#
-# - name: Build
-# run: |
-# make
-#
-# ubuntu-latest-gcc-sanitized:
-# runs-on: ubuntu-latest
-#
-# strategy:
-# matrix:
-# sanitizer: [ADDRESS, THREAD, UNDEFINED]
-#
-# steps:
-# - name: Clone
-# uses: actions/checkout@v3
-#
-# - name: Dependencies
-# run: |
-# sudo apt-get update
-# sudo apt-get install build-essential
-# sudo apt-get install cmake
-#
-# - name: Configure
-# run: cmake . -DCMAKE_BUILD_TYPE=Debug -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON
-#
-# - name: Build
-# run: |
-# make
-#
-# windows:
-# runs-on: windows-latest
-#
-# strategy:
-# matrix:
-# build: [Release]
-# arch: [Win32, x64]
-# include:
-# - arch: Win32
-# s2arc: x86
-# - arch: x64
-# s2arc: x64
-#
-# steps:
-# - name: Clone
-# uses: actions/checkout@v3
-#
-# - name: Add msbuild to PATH
-# uses: microsoft/setup-msbuild@v1
-#
-# - name: Configure
-# run: >
-# cmake -S . -B ./build -A ${{ matrix.arch }}
-# -DCMAKE_BUILD_TYPE=${{ matrix.build }}
-#
-# - name: Build
-# run: |
-# cd ./build
-# msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
-#
-# - name: Upload binaries
-# uses: actions/upload-artifact@v1
-# with:
-# name: llama-bin-${{ matrix.arch }}
-# path: build/bin/${{ matrix.build }}
-#
-# windows-blas:
-# runs-on: windows-latest
-#
-# strategy:
-# matrix:
-# build: [Release]
-# arch: [Win32, x64]
-# blas: [ON]
-# include:
-# - arch: Win32
-# obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x86.zip
-# s2arc: x86
-# - arch: x64
-# obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x64.zip
-# s2arc: x64
-#
-# steps:
-# - name: Clone
-# uses: actions/checkout@v3
-#
-# - name: Add msbuild to PATH
-# uses: microsoft/setup-msbuild@v1
-#
-# - name: Fetch OpenBLAS
-# if: matrix.blas == 'ON'
-# run: |
-# C:/msys64/usr/bin/wget.exe -qO blas.zip ${{ matrix.obzip }}
-# 7z x blas.zip -oblas -y
-# copy blas/include/cblas.h .
-# copy blas/include/openblas_config.h .
-# echo "blasdir=$env:GITHUB_WORKSPACE/blas" >> $env:GITHUB_ENV
-#
-# - name: Configure
-# run: >
-# cmake -S . -B ./build -A ${{ matrix.arch }}
-# -DCMAKE_BUILD_TYPE=${{ matrix.build }}
-# -DLLAMA_SUPPORT_OPENBLAS=${{ matrix.blas }}
-# -DCMAKE_LIBRARY_PATH="$env:blasdir/lib"
-#
-# - name: Build
-# run: |
-# cd ./build
-# msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
-#
-# - name: Copy libopenblas.dll
-# if: matrix.blas == 'ON'
-# run: copy "$env:blasdir/bin/libopenblas.dll" build/bin/${{ matrix.build }}
-#
-# - name: Upload binaries
-# if: matrix.blas == 'ON'
-# uses: actions/upload-artifact@v1
-# with:
-# name: llama-blas-bin-${{ matrix.arch }}
-# path: build/bin/${{ matrix.build }}
-#
-# emscripten:
-# runs-on: ubuntu-latest
-#
-# strategy:
-# matrix:
-# build: [Release]
-#
-# steps:
-# - name: Clone
-# uses: actions/checkout@v3
-#
-# - name: Dependencies
-# run: |
-# wget -q https://github.com/emscripten-core/emsdk/archive/master.tar.gz
-# tar -xvf master.tar.gz
-# emsdk-master/emsdk update
-# emsdk-master/emsdk install latest
-# emsdk-master/emsdk activate latest
-#
-# - name: Configure
-# run: echo "tmp"
-#
-# - name: Build
-# run: |
-# pushd emsdk-master
-# source ./emsdk_env.sh
-# popd
-# emcmake cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }}
-# make
+ build-xcarchive:
+ uses: StanfordBDHG/.github/.github/workflows/archive.yml@v2
+ with:
+ workspaceFile: 'llama.xcworkspace'
+ xcArchiveName: 'llama'
+ scheme: 'llama'
+ sdk: '["iphoneos", "iphonesimulator", "macosx", "xros", "xrsimulator"]'
+ version: ${{ inputs.version }}
+ configuration: 'Release'
+ runsonlabels: '["macOS", "self-hosted"]'
\ No newline at end of file
diff --git a/.github/workflows/code-coverage.yml b/.github/workflows/code-coverage.yml
deleted file mode 100644
index 392db8a08..000000000
--- a/.github/workflows/code-coverage.yml
+++ /dev/null
@@ -1,36 +0,0 @@
-name: Code Coverage
-on: [push, pull_request]
-
-env:
- GGML_NLOOP: 3
- GGML_N_THREADS: 1
-
-jobs:
- run:
- runs-on: ubuntu-20.04
- steps:
- - name: Checkout
- uses: actions/checkout@v3
-
- - name: Dependencies
- run: |
- sudo apt-get update
- sudo apt-get install build-essential gcc-8 lcov
-
- - name: Build
- run: CC=gcc-8 make -j LLAMA_CODE_COVERAGE=1 tests
-
- - name: Run tests
- run: CC=gcc-8 make test
-
- - name: Generate coverage report
- run: |
- make coverage
- make lcov-report
-
- - name: Upload coverage to Codecov
- uses: codecov/codecov-action@v3
- env:
- CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
- with:
- files: lcov-report/coverage.info
diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
deleted file mode 100644
index 94f9161fc..000000000
--- a/.github/workflows/docker.yml
+++ /dev/null
@@ -1,107 +0,0 @@
-# This workflow uses actions that are not certified by GitHub.
-# They are provided by a third-party and are governed by
-# separate terms of service, privacy policy, and support
-# documentation.
-
-# GitHub recommends pinning actions to a commit SHA.
-# To get a newer version, you will need to update the SHA.
-# You can also reference a tag or branch, but the action may change without warning.
-
-name: Publish Docker image
-
-on:
- pull_request:
- push:
- branches:
- - master
-
-jobs:
- push_to_registry:
- name: Push Docker image to Docker Hub
- if: github.event.pull_request.draft == false
-
- runs-on: ubuntu-latest
- env:
- COMMIT_SHA: ${{ github.sha }}
- strategy:
- matrix:
- config:
- - { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- - { tag: "server", dockerfile: ".devops/server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- # NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
- # have disabled them for now until the reason why
- # is understood.
- - { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" }
- - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
- - { tag: "server-cuda", dockerfile: ".devops/server-cuda.Dockerfile", platforms: "linux/amd64" }
- - { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- - { tag: "server-rocm", dockerfile: ".devops/server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- - { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
- - { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
- steps:
- - name: Check out the repo
- uses: actions/checkout@v3
-
- - name: Set up QEMU
- uses: docker/setup-qemu-action@v2
-
- - name: Set up Docker Buildx
- uses: docker/setup-buildx-action@v2
-
- - name: Log in to Docker Hub
- uses: docker/login-action@v2
- with:
- registry: ghcr.io
- username: ${{ github.repository_owner }}
- password: ${{ secrets.GITHUB_TOKEN }}
-
- # https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example
- - name: Free Disk Space (Ubuntu)
- uses: jlumbroso/free-disk-space@main
- with:
- # this might remove tools that are actually needed,
- # if set to "true" but frees about 6 GB
- tool-cache: false
-
- # all of these default to true, but feel free to set to
- # "false" if necessary for your workflow
- android: true
- dotnet: true
- haskell: true
- large-packages: true
- docker-images: true
- swap-storage: true
-
- - name: Determine tag name
- id: tag
- shell: bash
- run: |
- BUILD_NUMBER="$(git rev-list --count HEAD)"
- SHORT_HASH="$(git rev-parse --short=7 HEAD)"
- if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
- echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
- else
- SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
- echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
- fi
-
- - name: Build and push Docker image (versioned)
- if: github.event_name == 'push'
- uses: docker/build-push-action@v4
- with:
- context: .
- push: true
- platforms: ${{ matrix.config.platforms }}
- tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
- file: ${{ matrix.config.dockerfile }}
-
- - name: Build and push Docker image (tagged)
- uses: docker/build-push-action@v4
- with:
- context: .
- push: ${{ github.event_name == 'push' }}
- platforms: ${{ matrix.config.platforms }}
- tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
- file: ${{ matrix.config.dockerfile }}
diff --git a/.github/workflows/editorconfig.yml b/.github/workflows/editorconfig.yml
deleted file mode 100644
index 0e0993cd4..000000000
--- a/.github/workflows/editorconfig.yml
+++ /dev/null
@@ -1,23 +0,0 @@
-name: EditorConfig Checker
-
-on:
- workflow_dispatch: # allows manual triggering
- inputs:
- create_release:
- description: 'Create new release'
- required: true
- type: boolean
- push:
- branches:
- - master
- pull_request:
- branches:
- - master
-
-jobs:
- editorconfig:
- runs-on: ubuntu-latest
- steps:
- - uses: actions/checkout@v3
- - uses: editorconfig-checker/action-editorconfig-checker@main
- - run: editorconfig-checker
diff --git a/.github/workflows/gguf-publish.yml b/.github/workflows/gguf-publish.yml
deleted file mode 100644
index 57db17512..000000000
--- a/.github/workflows/gguf-publish.yml
+++ /dev/null
@@ -1,44 +0,0 @@
-# This workflow will upload a Python Package using Twine when a GGUF release is created
-# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
-
-# See `gguf-py/README.md` for how to make a release.
-
-# This workflow uses actions that are not certified by GitHub.
-# They are provided by a third-party and are governed by
-# separate terms of service, privacy policy, and support
-# documentation.
-
-name: Upload Python Package
-
-on:
- workflow_dispatch:
- push:
- # Pattern matched against refs/tags
- tags:
- - 'gguf-v*' # Push events to every version tag
-
-
-jobs:
- deploy:
-
- runs-on: ubuntu-latest
-
- steps:
- - uses: actions/checkout@v3
- - name: Set up Python
- uses: actions/setup-python@v2
- with:
- python-version: '3.9.x'
- - name: Install dependencies
- run: |
- cd gguf-py
- python -m pip install poetry
- poetry install
-
- - name: Build package
- run: cd gguf-py && poetry build
- - name: Publish package
- uses: pypa/gh-action-pypi-publish@release/v1
- with:
- password: ${{ secrets.PYPI_API_TOKEN }}
- packages-dir: gguf-py/dist
diff --git a/.github/workflows/nix-ci-aarch64.yml b/.github/workflows/nix-ci-aarch64.yml
deleted file mode 100644
index 8d0a3fd7f..000000000
--- a/.github/workflows/nix-ci-aarch64.yml
+++ /dev/null
@@ -1,61 +0,0 @@
-name: Nix aarch64 builds
-
-on:
- workflow_dispatch: # allows manual triggering
- schedule:
- # Rebuild daily rather than on every push because QEMU is expensive (e.g.
- # 1.5h instead of minutes with the cold cache).
- #
- # randint(0, 59), randint(0, 23)
- - cron: '26 12 * * *'
- # But also rebuild if we touched any of the Nix expressions:
- push:
- branches:
- - master
- paths: ['**/*.nix', 'flake.lock']
- pull_request:
- types: [opened, synchronize, reopened]
- paths: ['**/*.nix', 'flake.lock']
-
-jobs:
- nix-build-aarch64:
- runs-on: ubuntu-latest
- steps:
- - name: Checkout repository
- uses: actions/checkout@v4
- - name: Install QEMU
- # Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654
- run: |
- sudo apt-get update
- sudo apt-get install -y qemu-user-static qemu-system-aarch64
- sudo usermod -a -G kvm $USER
- - name: Install Nix
- uses: DeterminateSystems/nix-installer-action@v9
- with:
- github-token: ${{ secrets.GITHUB_TOKEN }}
- extra-conf: |
- extra-platforms = aarch64-linux
- extra-system-features = nixos-test kvm
- extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
- extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
- - uses: DeterminateSystems/magic-nix-cache-action@v2
- with:
- upstream-cache: https://${{ matrix.cachixName }}.cachix.org
- - name: Set-up cachix to push the results to
- uses: cachix/cachix-action@v13
- with:
- authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
- name: llama-cpp
- - name: Show all output paths
- run: >
- nix run github:nix-community/nix-eval-jobs
- -- --gc-roots-dir gcroot
- --flake
- ".#packages.aarch64-linux"
- - name: Build
- run: >
- nix run github:Mic92/nix-fast-build
- -- --skip-cached --no-nom
- --systems aarch64-linux
- --flake
- ".#checks.aarch64-linux"
diff --git a/.github/workflows/nix-ci.yml b/.github/workflows/nix-ci.yml
deleted file mode 100644
index 01c5a9d5a..000000000
--- a/.github/workflows/nix-ci.yml
+++ /dev/null
@@ -1,68 +0,0 @@
-name: Nix CI
-
-on:
- workflow_dispatch: # allows manual triggering
- push:
- branches:
- - master
- pull_request:
- types: [opened, synchronize, reopened]
-
-jobs:
- nix-eval:
- strategy:
- fail-fast: false
- matrix:
- os: [ ubuntu-latest, macos-latest ]
- runs-on: ${{ matrix.os }}
- steps:
- - name: Checkout repository
- uses: actions/checkout@v4
- - name: Install Nix
- uses: DeterminateSystems/nix-installer-action@v9
- with:
- github-token: ${{ secrets.GITHUB_TOKEN }}
- extra-conf: |
- extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
- extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
- - uses: DeterminateSystems/magic-nix-cache-action@v2
- with:
- upstream-cache: https://${{ matrix.cachixName }}.cachix.org
- - name: List all flake outputs
- run: nix flake show --all-systems
- - name: Show all output paths
- run: >
- nix run github:nix-community/nix-eval-jobs
- -- --gc-roots-dir gcroot
- --flake
- ".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)"
- nix-build:
- strategy:
- fail-fast: false
- matrix:
- os: [ ubuntu-latest, macos-latest ]
- runs-on: ${{ matrix.os }}
- steps:
- - name: Checkout repository
- uses: actions/checkout@v4
- - name: Install Nix
- uses: DeterminateSystems/nix-installer-action@v9
- with:
- github-token: ${{ secrets.GITHUB_TOKEN }}
- extra-conf: |
- extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
- extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
- - uses: DeterminateSystems/magic-nix-cache-action@v2
- with:
- upstream-cache: https://${{ matrix.cachixName }}.cachix.org
- - name: Set-up cachix to push the results to
- uses: cachix/cachix-action@v13
- with:
- authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
- name: llama-cpp
- - name: Build
- run: >
- nix run github:Mic92/nix-fast-build
- -- --skip-cached --no-nom
- --flake
- ".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)"
diff --git a/.github/workflows/nix-flake-update.yml b/.github/workflows/nix-flake-update.yml
deleted file mode 100644
index 3a6a96e26..000000000
--- a/.github/workflows/nix-flake-update.yml
+++ /dev/null
@@ -1,22 +0,0 @@
-name: update-flake-lock
-on:
- workflow_dispatch:
- schedule:
- - cron: '0 0 * * 0' # runs weekly on Sunday at 00:00
-
-jobs:
- lockfile:
- runs-on: ubuntu-latest
- steps:
- - name: Checkout repository
- uses: actions/checkout@v4
- - name: Install Nix
- uses: DeterminateSystems/nix-installer-action@main
- - name: Update flake.lock
- uses: DeterminateSystems/update-flake-lock@main
- with:
- pr-title: "nix: update flake.lock"
- pr-labels: |
- nix
- pr-reviewers: philiptaron,SomeoneSerge
- token: ${{ secrets.FLAKE_TOKEN }}
diff --git a/.github/workflows/nix-publish-flake.yml b/.github/workflows/nix-publish-flake.yml
deleted file mode 100644
index 2c3c1ebda..000000000
--- a/.github/workflows/nix-publish-flake.yml
+++ /dev/null
@@ -1,36 +0,0 @@
-# Make the flake discoverable on https://flakestry.dev and https://flakehub.com/flakes
-name: "Publish a flake to flakestry & flakehub"
-on:
- push:
- tags:
- - "*"
- workflow_dispatch:
- inputs:
- tag:
- description: "The existing tag to publish"
- type: "string"
- required: true
-jobs:
- flakestry-publish:
- runs-on: ubuntu-latest
- permissions:
- id-token: "write"
- contents: "read"
- steps:
- - uses: flakestry/flakestry-publish@main
- with:
- version: "${{ inputs.tag || github.ref_name }}"
- flakehub-publish:
- runs-on: "ubuntu-latest"
- permissions:
- id-token: "write"
- contents: "read"
- steps:
- - uses: "actions/checkout@v4"
- with:
- ref: "${{ (inputs.tag != null) && format('refs/tags/{0}', inputs.tag) || '' }}"
- - uses: "DeterminateSystems/nix-installer-action@main"
- - uses: "DeterminateSystems/flakehub-push@main"
- with:
- visibility: "public"
- tag: "${{ inputs.tag }}"
diff --git a/.github/workflows/python-check-requirements.yml b/.github/workflows/python-check-requirements.yml
deleted file mode 100644
index 92e1108b3..000000000
--- a/.github/workflows/python-check-requirements.yml
+++ /dev/null
@@ -1,29 +0,0 @@
-name: Python check requirements.txt
-
-on:
- push:
- paths:
- - 'scripts/check-requirements.sh'
- - 'convert*.py'
- - 'requirements.txt'
- - 'requirements/*.txt'
- pull_request:
- paths:
- - 'scripts/check-requirements.sh'
- - 'convert*.py'
- - 'requirements.txt'
- - 'requirements/*.txt'
-
-jobs:
- python-check-requirements:
- runs-on: ubuntu-latest
- name: check-requirements
- steps:
- - name: Check out source repository
- uses: actions/checkout@v3
- - name: Set up Python environment
- uses: actions/setup-python@v4
- with:
- python-version: "3.11"
- - name: Run check-requirements.sh script
- run: bash scripts/check-requirements.sh nocleanup
diff --git a/.github/workflows/python-lint.yml b/.github/workflows/python-lint.yml
deleted file mode 100644
index ea0a05ea1..000000000
--- a/.github/workflows/python-lint.yml
+++ /dev/null
@@ -1,20 +0,0 @@
-name: flake8 Lint
-
-on: [push, pull_request]
-
-jobs:
- flake8-lint:
- runs-on: ubuntu-latest
- name: Lint
- steps:
- - name: Check out source repository
- uses: actions/checkout@v3
- - name: Set up Python environment
- uses: actions/setup-python@v4
- with:
- python-version: "3.11"
- - name: flake8 Lint
- uses: py-actions/flake8@v2
- with:
- ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503"
- exclude: "examples/*,examples/*/**,*/**/__init__.py"
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 000000000..620b018e1
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,32 @@
+#
+# This source file is part of the Stanford Biodesign Digital Health Group open-source organization
+#
+# SPDX-FileCopyrightText: 2022 Stanford University and the project authors (see CONTRIBUTORS.md)
+#
+# SPDX-License-Identifier: MIT
+#
+
+name: Create XCFramework and Release
+
+on:
+ workflow_dispatch:
+ inputs:
+ version:
+ description: 'The version number of the framework embedded in the XCArchives. This version number is also used as the release tag.'
+ type: string
+ required: true
+
+jobs:
+ create-xcframework-and-release-workflow:
+ uses: StanfordBDHG/.github/.github/workflows/xcframework.yml@v2
+ with:
+ workspaceFile: llama.xcworkspace
+ xcFrameworkName: llama
+ scheme: llama
+ sdk: '["iphoneos", "iphonesimulator", "macosx", "xros", "xrsimulator"]'
+ version: ${{ inputs.version }}
+ configuration: Release
+ runsonlabels: '["macOS", "self-hosted"]'
+ user: PaulsAutomationBot
+ secrets:
+ access-token: ${{ secrets.PERSONAL_ACCESS_TOKEN }}
diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml
deleted file mode 100644
index 0b6f6669b..000000000
--- a/.github/workflows/server.yml
+++ /dev/null
@@ -1,83 +0,0 @@
-# Server build and tests
-name: Server
-
-on:
- workflow_dispatch: # allows manual triggering
- push:
- branches:
- - master
- paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*']
- pull_request:
- types: [opened, synchronize, reopened]
- paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*']
-
-jobs:
- server:
- runs-on: ubuntu-latest
-
- strategy:
- matrix:
- sanitizer: [ADDRESS, THREAD, UNDEFINED]
- build_type: [Debug, Release]
- include:
- - build_type: Release
- sanitizer: ""
- exclude:
- - build_type: Release
- sanitizer: ADDRESS
- - build_type: Release
- sanitizer: THREAD
- - build_type: Release
- sanitizer: UNDEFINED
-
- container:
- image: ubuntu:latest
- ports:
- - 8888
- options: --cpus 4
-
- steps:
- - name: Clone
- id: checkout
- uses: actions/checkout@v3
-
- - name: Dependencies
- id: depends
- run: |
- apt-get update
- apt-get -y install \
- build-essential \
- git \
- cmake \
- python3-pip \
- wget \
- psmisc
-
- - name: Build
- id: cmake_build
- run: |
- mkdir build
- cd build
- cmake .. \
- -DLLAMA_NATIVE=OFF \
- -DLLAMA_BUILD_SERVER=ON \
- -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
- -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
- cmake --build . --config ${{ matrix.build_type }} -j $(nproc) --target server
-
- - name: Tests dependencies
- id: test_dependencies
- run: |
- pip install -r examples/server/tests/requirements.txt
-
- - name: Download models
- id: download_models
- run: |
- cd examples/server/tests
- ../../../scripts/hf.sh --repo ggml-org/models --file tinyllamas/stories260K.gguf
-
- - name: Tests
- id: server_integration_test
- run: |
- cd examples/server/tests
- PORT=8888 ./tests.sh
diff --git a/.github/workflows/tidy-post.yml b/.github/workflows/tidy-post.yml
deleted file mode 100644
index 03652760c..000000000
--- a/.github/workflows/tidy-post.yml
+++ /dev/null
@@ -1,20 +0,0 @@
-name: clang-tidy review post comments
-
-on:
- workflow_dispatch:
- workflows: ["clang-tidy-review"]
- types:
- - completed
-
-jobs:
- build:
- runs-on: ubuntu-latest
-
- steps:
- - uses: ZedThree/clang-tidy-review/post@v0.13.0
- # lgtm_comment_body, max_comments, and annotations need to be set on the posting workflow in a split setup
- with:
- # adjust options as necessary
- lgtm_comment_body: ''
- annotations: false
- max_comments: 25
diff --git a/.github/workflows/tidy-review.yml b/.github/workflows/tidy-review.yml
deleted file mode 100644
index a4bc8d976..000000000
--- a/.github/workflows/tidy-review.yml
+++ /dev/null
@@ -1,23 +0,0 @@
-name: clang-tidy-review
-
-on:
- pull_request:
- branches:
- - master
-
-jobs:
- clang-tidy-review:
- runs-on: ubuntu-latest
-
- steps:
- - uses: actions/checkout@v3
-
- - uses: ZedThree/clang-tidy-review@v0.13.0
- id: review
- with:
- lgtm_comment_body: ''
- build_dir: build
- cmake_command: cmake . -B build -DCMAKE_EXPORT_COMPILE_COMMANDS=on
- split_workflow: true
-
- - uses: ZedThree/clang-tidy-review/upload@v0.13.0
diff --git a/.github/workflows/zig-build.yml b/.github/workflows/zig-build.yml
deleted file mode 100644
index 68a698ab9..000000000
--- a/.github/workflows/zig-build.yml
+++ /dev/null
@@ -1,25 +0,0 @@
-name: Zig CI
-
-on:
- pull_request:
- push:
- branches:
- - master
-
-jobs:
- build:
- strategy:
- fail-fast: false
- matrix:
- runs-on: [ubuntu-latest, macos-latest, windows-latest]
- runs-on: ${{ matrix.runs-on }}
- steps:
- - uses: actions/checkout@v3
- with:
- submodules: recursive
- fetch-depth: 0
- - uses: goto-bus-stop/setup-zig@v2
- with:
- version: 0.11.0
- - name: Build Summary
- run: zig build --summary all -freference-trace
diff --git a/.gitignore b/.gitignore
index 62b6b8b1a..7791e9a25 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,7 +11,6 @@
*.gcda
*.dot
*.bat
-*.metallib
.DS_Store
.build/
.cache/
@@ -92,3 +91,25 @@ examples/jeopardy/results.txt
poetry.lock
poetry.toml
nppBackup
+
+.swiftpm
+.build
+*.pyc
+.DS_Store
+# Xcode
+build/
+*.pbxuser
+!default.pbxuser
+*.mode1v3
+!default.mode1v3
+*.mode2v3
+!default.mode2v3
+*.perspectivev3
+!default.perspectivev3
+xcuserdata
+*.xccheckout
+*.moved-aside
+DerivedData
+*.hmap
+*.ipa
+*.xcuserstate
\ No newline at end of file
diff --git a/Package.swift b/Package.swift
index b24c9204a..d116f9934 100644
--- a/Package.swift
+++ b/Package.swift
@@ -1,60 +1,35 @@
-// swift-tools-version:5.5
+// swift-tools-version:5.9
+
+//
+// This source file is part of the TemplatePackage open source project
+//
+// SPDX-FileCopyrightText: 2022 Stanford University and the project authors (see CONTRIBUTORS.md)
+//
+// SPDX-License-Identifier: MIT
+//
import PackageDescription
+
let package = Package(
name: "llama",
platforms: [
- .macOS(.v12),
- .iOS(.v14),
- .watchOS(.v4),
- .tvOS(.v14)
+ .iOS(.v17),
+ .visionOS(.v1),
+ .macOS(.v14)
],
products: [
- .library(name: "llama", targets: ["llama"]),
- ],
- targets: [
- .target(
+ .library(
name: "llama",
- path: ".",
- exclude: [
- "cmake",
- "examples",
- "scripts",
- "models",
- "tests",
- "CMakeLists.txt",
- "ggml-cuda.cu",
- "ggml-cuda.h",
- "Makefile"
- ],
- sources: [
- "ggml.c",
- "llama.cpp",
- "ggml-alloc.c",
- "ggml-backend.c",
- "ggml-quants.c",
- "ggml-metal.m",
- ],
- resources: [
- .process("ggml-metal.metal")
- ],
- publicHeadersPath: "spm-headers",
- cSettings: [
- .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
- .define("GGML_USE_ACCELERATE"),
- .unsafeFlags(["-fno-objc-arc"]),
- .define("GGML_USE_METAL"),
- // NOTE: NEW_LAPACK will required iOS version 16.4+
- // We should consider add this in the future when we drop support for iOS 14
- // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
- // .define("ACCELERATE_NEW_LAPACK"),
- // .define("ACCELERATE_LAPACK_ILP64")
- ],
- linkerSettings: [
- .linkedFramework("Accelerate")
+ targets: [
+ "llama"
]
)
],
- cxxLanguageStandard: .cxx11
+ targets: [
+ .binaryTarget(
+ name: "llama",
+ path: "./llama.xcframework"
+ )
+ ]
)
diff --git a/README.md b/README.md
index 5401e197f..04433e4f9 100644
--- a/README.md
+++ b/README.md
@@ -1,1069 +1,77 @@
-# llama.cpp
+
+
+# Stanford BDHG llama.cpp

 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
-[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
+## Overview
-Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
+This project is a Stanford BDHG-maintained fork of the well-regarded [llama.cpp](https://github.com/ggerganov/llama.cpp), tailored for deploying [LLaMA](https://arxiv.org/abs/2302.13971) models on Apple platforms using C/C++. Our modifications package the library as an XCFramework so it can be distributed as a prebuilt binary for multiple platforms, and the included `Package.swift` file enables integration via the Swift Package Manager (SPM).
-### Hot topics
+> [!NOTE]
+> For questions about the llama.cpp codebase that this fork builds upon, please refer to the [upstream llama.cpp README](https://github.com/ggerganov/llama.cpp/blob/master/README.md) for comprehensive details and guidance.
-- Support for chat templates: [Wiki (contributions welcome)](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
-- Support for Gemma models: https://github.com/ggerganov/llama.cpp/pull/5631
-- Non-linear quantization IQ4_NL: https://github.com/ggerganov/llama.cpp/pull/5590
-- Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216
-----
+## Setup
-Table of Contents
-
- - Description
- - Usage
- - Contributing
- - Coding guidelines
- - Docs
+### Add Stanford BDHG llama.cpp as a Dependency
-## Description
+You need to add the Stanford BDHG llama.cpp Swift package to
+[your app in Xcode](https://developer.apple.com/documentation/xcode/adding-package-dependencies-to-your-app#) or
+[your Swift package](https://developer.apple.com/documentation/xcode/creating-a-standalone-swift-package-with-xcode#Add-a-dependency-on-another-Swift-package).
-The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide
-variety of hardware - locally and in the cloud.
-
-- Plain C/C++ implementation without any dependencies
-- Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
-- AVX, AVX2 and AVX512 support for x86 architectures
-- 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
-- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP)
-- Vulkan, SYCL, and (partial) OpenCL backend support
-- CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
-
-Since its [inception](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022), the project has
-improved significantly thanks to many contributions. It is the main playground for developing new features for the
-[ggml](https://github.com/ggerganov/ggml) library.
-
-**Supported platforms:**
-
-- [X] Mac OS
-- [X] Linux
-- [X] Windows (via CMake)
-- [X] Docker
-- [X] FreeBSD
-
-**Supported models:**
-
-Typically finetunes of the base models below are supported as well.
-
-- [X] LLaMA 🦙
-- [x] LLaMA 2 🦙🦙
-- [X] [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1)
-- [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
-- [X] Falcon
-- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
-- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
-- [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
-- [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft)
-- [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)
-- [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187)
-- [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim)
-- [X] [Persimmon 8B](https://github.com/ggerganov/llama.cpp/pull/3410)
-- [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417)
-- [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553)
-- [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi)
-- [X] [StableLM models](https://huggingface.co/stabilityai)
-- [x] [Deepseek models](https://huggingface.co/models?search=deepseek-ai/deepseek)
-- [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen)
-- [x] [PLaMo-13B](https://github.com/ggerganov/llama.cpp/pull/3557)
-- [x] [Phi models](https://huggingface.co/models?search=microsoft/phi)
-- [x] [GPT-2](https://huggingface.co/gpt2)
-- [x] [Orion 14B](https://github.com/ggerganov/llama.cpp/pull/5118)
-- [x] [InternLM2](https://huggingface.co/models?search=internlm2)
-- [x] [CodeShell](https://github.com/WisdomShell/codeshell)
-- [x] [Gemma](https://ai.google.dev/gemma)
-
-**Multimodal models:**
-
-- [x] [LLaVA 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e), [LLaVA 1.6 models](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2)
-- [x] [BakLLaVA](https://huggingface.co/models?search=SkunkworksAI/Bakllava)
-- [x] [Obsidian](https://huggingface.co/NousResearch/Obsidian-3B-V0.5)
-- [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V)
-- [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
-- [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
-
-**HTTP server**
-
-[llama.cpp web server](./examples/server) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.
-
-**Bindings:**
-
-- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
-- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
-- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
-- JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp)
-- JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm)
-- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
-- Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
-- Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
-- C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
-- Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
-- Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
-- React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
-- Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
-- Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
-- Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
-
-**UI:**
-
-Unless otherwise noted these projects are open-source with permissive licensing:
-
-- [iohub/collama](https://github.com/iohub/coLLaMA)
-- [janhq/jan](https://github.com/janhq/jan) (AGPL)
-- [nat/openplayground](https://github.com/nat/openplayground)
-- [Faraday](https://faraday.dev/) (proprietary)
-- [LMStudio](https://lmstudio.ai/) (proprietary)
-- [LocalAI](https://github.com/mudler/LocalAI) (MIT)
-- [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
-- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)
-- [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all)
-- [ollama/ollama](https://github.com/ollama/ollama)
-- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL)
-- [psugihara/FreeChat](https://github.com/psugihara/FreeChat)
-- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
-- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
-- [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
-- [semperai/amica](https://github.com/semperai/amica)
-- [withcatai/catai](https://github.com/withcatai/catai)
-- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
-- [Msty](https://msty.app) (proprietary)
-- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
-
----
-
-Here is a typical run using LLaMA v2 13B on M2 Ultra:
-
-```
-$ make -j && ./main -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
-I llama.cpp build info:
-I UNAME_S: Darwin
-I UNAME_P: arm
-I UNAME_M: arm64
-I CFLAGS: -I. -O3 -std=c11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -pthread -DGGML_USE_K_QUANTS -DGGML_USE_ACCELERATE
-I CXXFLAGS: -I. -I./common -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_K_QUANTS
-I LDFLAGS: -framework Accelerate
-I CC: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
-I CXX: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
-
-make: Nothing to be done for `default'.
-main: build = 1041 (cf658ad)
-main: seed = 1692823051
-llama_model_loader: loaded meta data with 16 key-value pairs and 363 tensors from models/llama-13b-v2/ggml-model-q4_0.gguf (version GGUF V1 (latest))
-llama_model_loader: - type f32: 81 tensors
-llama_model_loader: - type q4_0: 281 tensors
-llama_model_loader: - type q6_K: 1 tensors
-llm_load_print_meta: format = GGUF V1 (latest)
-llm_load_print_meta: arch = llama
-llm_load_print_meta: vocab type = SPM
-llm_load_print_meta: n_vocab = 32000
-llm_load_print_meta: n_merges = 0
-llm_load_print_meta: n_ctx_train = 4096
-llm_load_print_meta: n_ctx = 512
-llm_load_print_meta: n_embd = 5120
-llm_load_print_meta: n_head = 40
-llm_load_print_meta: n_head_kv = 40
-llm_load_print_meta: n_layer = 40
-llm_load_print_meta: n_rot = 128
-llm_load_print_meta: n_gqa = 1
-llm_load_print_meta: f_norm_eps = 1.0e-05
-llm_load_print_meta: f_norm_rms_eps = 1.0e-05
-llm_load_print_meta: n_ff = 13824
-llm_load_print_meta: freq_base = 10000.0
-llm_load_print_meta: freq_scale = 1
-llm_load_print_meta: model type = 13B
-llm_load_print_meta: model ftype = mostly Q4_0
-llm_load_print_meta: model size = 13.02 B
-llm_load_print_meta: general.name = LLaMA v2
-llm_load_print_meta: BOS token = 1 '<s>'
-llm_load_print_meta: EOS token = 2 '</s>'
-llm_load_print_meta: UNK token = 0 '<unk>'
-llm_load_print_meta: LF token = 13 '<0x0A>'
-llm_load_tensors: ggml ctx size = 0.11 MB
-llm_load_tensors: mem required = 7024.01 MB (+ 400.00 MB per state)
-...................................................................................................
-llama_new_context_with_model: kv self size = 400.00 MB
-llama_new_context_with_model: compute buffer total size = 75.41 MB
-
-system_info: n_threads = 16 / 24 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 |
-sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.800000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000
-generate: n_ctx = 512, n_batch = 512, n_predict = 400, n_keep = 0
-
-
- Building a website can be done in 10 simple steps:
-Step 1: Find the right website platform.
-Step 2: Choose your domain name and hosting plan.
-Step 3: Design your website layout.
-Step 4: Write your website content and add images.
-Step 5: Install security features to protect your site from hackers or spammers
-Step 6: Test your website on multiple browsers, mobile devices, operating systems etc…
-Step 7: Test it again with people who are not related to you personally – friends or family members will work just fine!
-Step 8: Start marketing and promoting the website via social media channels or paid ads
-Step 9: Analyze how many visitors have come to your site so far, what type of people visit more often than others (e.g., men vs women) etc…
-Step 10: Continue to improve upon all aspects mentioned above by following trends in web design and staying up-to-date on new technologies that can enhance user experience even further!
-How does a Website Work?
-A website works by having pages, which are made of HTML code. This code tells your computer how to display the content on each page you visit – whether it’s an image or text file (like PDFs). In order for someone else’s browser not only be able but also want those same results when accessing any given URL; some additional steps need taken by way of programming scripts that will add functionality such as making links clickable!
-The most common type is called static HTML pages because they remain unchanged over time unless modified manually (either through editing files directly or using an interface such as WordPress). They are usually served up via HTTP protocols – this means anyone can access them without having any special privileges like being part of a group who is allowed into restricted areas online; however, there may still exist some limitations depending upon where one lives geographically speaking.
-How to
-llama_print_timings: load time = 576.45 ms
-llama_print_timings: sample time = 283.10 ms / 400 runs ( 0.71 ms per token, 1412.91 tokens per second)
-llama_print_timings: prompt eval time = 599.83 ms / 19 tokens ( 31.57 ms per token, 31.68 tokens per second)
-llama_print_timings: eval time = 24513.59 ms / 399 runs ( 61.44 ms per token, 16.28 tokens per second)
-llama_print_timings: total time = 25431.49 ms
-```
-
-And here is another demo of running both LLaMA-7B and [whisper.cpp](https://github.com/ggerganov/whisper.cpp) on a single M1 Pro MacBook:
-
-https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8b4f-add84093ffff.mp4
-
-## Usage
-
-Here are the end-to-end binary build and model conversion steps for most supported models.
-
-### Get the Code
-
-```bash
-git clone https://github.com/ggerganov/llama.cpp
-cd llama.cpp
-```
-
-### Build
-
-In order to build llama.cpp you have four different options.
-
-- Using `make`:
- - On Linux or MacOS:
-
- ```bash
- make
- ```
-
- - On Windows:
-
- 1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
- 2. Extract `w64devkit` on your pc.
- 3. Run `w64devkit.exe`.
- 4. Use the `cd` command to reach the `llama.cpp` folder.
- 5. From here you can run:
- ```bash
- make
- ```
-
-- Using `CMake`:
-
- ```bash
- mkdir build
- cd build
- cmake ..
- cmake --build . --config Release
- ```
-
-- Using `Zig` (version 0.11 or later):
-
-  Building for optimization levels and CPU features can be accomplished using standard build arguments, for example AVX2, FMA, F16C.
-  It's also possible to cross-compile for other operating systems and architectures:
-
- ```bash
- zig build -Doptimize=ReleaseFast -Dtarget=x86_64-windows-gnu -Dcpu=x86_64+avx2+fma+f16c
- ```
-
- The `zig targets` command will give you valid options to use.
-
-- Using `gmake` (FreeBSD):
-
- 1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics)
- 2. Add your user to **video** group
- 3. Install compilation dependencies.
-
- ```bash
- sudo pkg install gmake automake autoconf pkgconf llvm15 clinfo clover \
- opencl clblast openblas
-
- gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4
- ```
-
-  **Notes:** With these packages you can build llama.cpp with OpenBLAS and
-  CLBlast support to use OpenCL GPU acceleration in FreeBSD. Please read
-  the instructions below in this document to use and activate these options.
-
-### Metal Build
-
-On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
-To disable the Metal build at compile time use the `LLAMA_NO_METAL=1` flag or the `LLAMA_METAL=OFF` cmake option.
-
-When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line
-argument.
-
-### MPI Build
-
-MPI lets you distribute the computation over a cluster of machines. Because of the serial nature of LLM prediction, this won't yield any end-to-end speed-ups, but it will let you run larger models than would otherwise fit into RAM on a single machine.
-
-First you will need MPI libraries installed on your system. The two most popular (only?) options are [MPICH](https://www.mpich.org) and [OpenMPI](https://www.open-mpi.org). Either can be installed with a package manager (`apt`, Homebrew, MacPorts, etc).
-
-Next you will need to build the project with `LLAMA_MPI` set to true on all machines; if you're building with `make`, you will also need to specify an MPI-capable compiler (when building with CMake, this is configured automatically):
-
-- Using `make`:
-
- ```bash
- make CC=mpicc CXX=mpicxx LLAMA_MPI=1
- ```
-
-- Using `CMake`:
-
- ```bash
- cmake -S . -B build -DLLAMA_MPI=ON
- ```
-
-Once the programs are built, download/convert the weights on all of the machines in your cluster. The paths to the weights and programs should be identical on all machines.
-
-Next, ensure password-less SSH access to each machine from the primary host, and create a `hostfile` with a list of the hostnames and their relative "weights" (slots). If you want to use localhost for computation, use its local subnet IP address rather than the loopback address or "localhost".
-
-Here is an example hostfile:
-
-```
-192.168.0.1:2
-malvolio.local:1
-```
-
-The above will distribute the computation across 2 processes on the first host and 1 process on the second host. Each process will use roughly an equal amount of RAM. Try to keep these numbers small, as inter-process (intra-host) communication is expensive.
-
-Finally, you're ready to run a computation using `mpirun`:
-
-```bash
-mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
-```
-
-### BLAS Build
-
-Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS and CLBlast. There are currently several different BLAS implementations available for build and use:
-
-- #### Accelerate Framework:
-
- This is only available on Mac PCs and it's enabled by default. You can just build using the normal instructions.
-
-- #### OpenBLAS:
-
- This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS installed on your machine.
-
- - Using `make`:
- - On Linux:
- ```bash
- make LLAMA_OPENBLAS=1
- ```
-
- - On Windows:
-
- 1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
- 2. Download the latest version of [OpenBLAS for Windows](https://github.com/xianyi/OpenBLAS/releases).
- 3. Extract `w64devkit` on your pc.
- 4. From the OpenBLAS zip that you just downloaded copy `libopenblas.a`, located inside the `lib` folder, inside `w64devkit\x86_64-w64-mingw32\lib`.
- 5. From the same OpenBLAS zip copy the content of the `include` folder inside `w64devkit\x86_64-w64-mingw32\include`.
- 6. Run `w64devkit.exe`.
- 7. Use the `cd` command to reach the `llama.cpp` folder.
- 8. From here you can run:
-
- ```bash
- make LLAMA_OPENBLAS=1
- ```
-
- - Using `CMake` on Linux:
-
- ```bash
- mkdir build
- cd build
- cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
- cmake --build . --config Release
- ```
-
-- #### BLIS
-
- Check [BLIS.md](docs/BLIS.md) for more information.
-
-- #### SYCL
- SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
-
- llama.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
-
- For detailed info, please refer to [llama.cpp for SYCL](README-sycl.md).
-
-- #### Intel oneMKL
- Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./README-sycl.md).
-
- - Using manual oneAPI installation:
-    By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you have already sourced the Intel environment script and pass `-DLLAMA_BLAS=ON` to cmake, the MKL version of BLAS will automatically be selected. Otherwise, please install oneAPI and follow the steps below:
- ```bash
- mkdir build
- cd build
- source /opt/intel/oneapi/setvars.sh # You can skip this step if in oneapi-basekit docker image, only required for manual installation
- cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
- cmake --build . --config Release
- ```
-
- - Using oneAPI docker image:
- If you do not want to source the environment vars and install oneAPI manually, you can also build the code using intel docker container: [oneAPI-basekit](https://hub.docker.com/r/intel/oneapi-basekit). Then, you can use the commands given above.
-
- Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.
-
-- #### cuBLAS
-
- This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
-
-  For Jetson users: if you have a Jetson Orin, you can try this: [Official Support](https://www.jetson-ai-lab.com/tutorial_text-generation.html). If you are using an older model (Nano/TX2), some additional operations are needed before compiling.
-
- - Using `make`:
- ```bash
- make LLAMA_CUBLAS=1
- ```
- - Using `CMake`:
-
- ```bash
- mkdir build
- cd build
- cmake .. -DLLAMA_CUBLAS=ON
- cmake --build . --config Release
- ```
-
- The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
-
-
- | Option | Legal values | Default | Description |
- |--------------------------------|------------------------|---------|-------------|
- | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
- | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
- | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
- | LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
- | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
- | LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
-
-- #### hipBLAS
-
- This provides BLAS acceleration on HIP-supported AMD GPUs.
- Make sure to have ROCm installed.
- You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html).
-
- - Using `make`:
- ```bash
- make LLAMA_HIPBLAS=1
- ```
- - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
- ```bash
- CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ \
- cmake -H. -Bbuild -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
- && cmake --build build -- -j 16
- ```
-  On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON`.
- However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
-
- - Using `make` (example for target gfx1030, build with 16 CPU threads):
- ```bash
-    make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
- ```
-
- - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
- ```bash
- set PATH=%HIP_PATH%\bin;%PATH%
- mkdir build
- cd build
- cmake -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ ..
- cmake --build .
- ```
-  Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100`, which corresponds to the Radeon RX 7900 XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors).
-  Find your GPU version string by matching the most significant version information from `rocminfo | grep gfx | head -1 | awk '{print $2}'` with the list of processors, e.g. `gfx1035` maps to `gfx1030`.
-
-
- The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
-  If your GPU is not officially supported, you can set the environment variable `HSA_OVERRIDE_GFX_VERSION` to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
- The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
-
- | Option | Legal values | Default | Description |
- |-------------------------|------------------------|---------|-------------|
- | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
- | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
- | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
-
-- #### CLBlast
-
- OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.
-
- You will need the [OpenCL SDK](https://github.com/KhronosGroup/OpenCL-SDK).
- - For Ubuntu or Debian, the packages `opencl-headers`, `ocl-icd` may be needed.
-
- - For Windows, a pre-built SDK is available on the [OpenCL Releases](https://github.com/KhronosGroup/OpenCL-SDK/releases) page.
-
-  - Installing the OpenCL SDK from source:
-
- ```sh
- git clone --recurse-submodules https://github.com/KhronosGroup/OpenCL-SDK.git
- mkdir OpenCL-SDK/build
- cd OpenCL-SDK/build
- cmake .. -DBUILD_DOCS=OFF \
- -DBUILD_EXAMPLES=OFF \
- -DBUILD_TESTING=OFF \
- -DOPENCL_SDK_BUILD_SAMPLES=OFF \
- -DOPENCL_SDK_TEST_SAMPLES=OFF
- cmake --build . --config Release
- cmake --install . --prefix /some/path
- ```
-
-
- ##### Installing CLBlast
-
- Pre-built CLBlast binaries may be found on the [CLBlast Releases](https://github.com/CNugteren/CLBlast/releases) page. For Unix variants, it may also be found in your operating system's packages.
-
- Alternatively, they may be built from source.
-
-  - Windows:
-
- ```cmd
- set OPENCL_SDK_ROOT="C:/OpenCL-SDK-v2023.04.17-Win-x64"
- git clone https://github.com/CNugteren/CLBlast.git
- mkdir CLBlast\build
- cd CLBlast\build
- cmake .. -DBUILD_SHARED_LIBS=OFF -DOVERRIDE_MSVC_FLAGS_TO_MT=OFF -DTUNERS=OFF -DOPENCL_ROOT=%OPENCL_SDK_ROOT% -G "Visual Studio 17 2022" -A x64
- cmake --build . --config Release
- cmake --install . --prefix C:/CLBlast
- ```
-
-  - Unix:
-
- ```sh
- git clone https://github.com/CNugteren/CLBlast.git
- mkdir CLBlast/build
- cd CLBlast/build
- cmake .. -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF
- cmake --build . --config Release
- cmake --install . --prefix /some/path
- ```
-
- Where `/some/path` is where the built library will be installed (default is `/usr/local`).
-
-
- ##### Building Llama with CLBlast
-
- - Build with make:
- ```sh
- make LLAMA_CLBLAST=1
- ```
- - CMake (Unix):
- ```sh
- mkdir build
- cd build
- cmake .. -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
- cmake --build . --config Release
- ```
- - CMake (Windows):
- ```cmd
- set CL_BLAST_CMAKE_PKG="C:/CLBlast/lib/cmake/CLBlast"
- git clone https://github.com/ggerganov/llama.cpp
- cd llama.cpp
- mkdir build
- cd build
- cmake .. -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=%CL_BLAST_CMAKE_PKG% -G "Visual Studio 17 2022" -A x64
- cmake --build . --config Release
- cmake --install . --prefix C:/LlamaCPP
- ```
-
- ##### Running Llama with CLBlast
-
- The CLBlast build supports `--gpu-layers|-ngl` like the CUDA version does.
-
- To select the correct platform (driver) and device (GPU), you can use the environment variables `GGML_OPENCL_PLATFORM` and `GGML_OPENCL_DEVICE`.
- The selection can be a number (starting from 0) or a text string to search:
-
- ```sh
- GGML_OPENCL_PLATFORM=1 ./main ...
- GGML_OPENCL_DEVICE=2 ./main ...
- GGML_OPENCL_PLATFORM=Intel ./main ...
- GGML_OPENCL_PLATFORM=AMD GGML_OPENCL_DEVICE=1 ./main ...
- ```
-
- The default behavior is to find the first GPU device, but when it is an integrated GPU on a laptop, for instance, the selectors are useful.
- Using the variables it is possible to select a CPU-based driver as well, if so desired.
-
- You can get a list of platforms and devices from the `clinfo -l` command, etc.
-
-- #### Vulkan
-
- **With docker**:
-
- You don't need to install Vulkan SDK. It will be installed inside the container.
-
- ```sh
- # Build the image
- docker build -t llama-cpp-vulkan -f .devops/main-vulkan.Dockerfile .
-
- # Then, use it:
- docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
- ```
-
- **Without docker**:
-
- Firstly, you need to make sure you have installed [Vulkan SDK](https://vulkan.lunarg.com/doc/view/latest/linux/getting_started_ubuntu.html)
-
- For example, on Ubuntu 22.04 (jammy), use the command below:
-
- ```bash
- wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add -
- wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
- apt update -y
- apt-get install -y vulkan-sdk
- # To verify the installation, use the command below:
- vulkaninfo
- ```
-
-  Alternatively, your package manager might be able to provide the appropriate libraries. For example, for Ubuntu 22.04 you can install `libvulkan-dev` instead.
-
- Then, build llama.cpp using the cmake command below:
-
- ```bash
- mkdir -p build
- cd build
- cmake .. -DLLAMA_VULKAN=1
- cmake --build . --config Release
- # Test the output binary (with "-ngl 33" to offload all layers to GPU)
- ./bin/main -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
-
- # You should see in the output, ggml_vulkan detected your GPU. For example:
- # ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
- ```
-
-### Prepare and Quantize
-
-To obtain the official LLaMA 2 weights please see the Obtaining and using the Facebook LLaMA 2 model section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
-
-```bash
-# obtain the official LLaMA model weights and place them in ./models
-ls ./models
-llama-2-7b tokenizer_checklist.chk tokenizer.model
-# [Optional] for models using BPE tokenizers
-ls ./models
- vocab.json
-# [Optional] for PyTorch .bin models like Mistral-7B
-ls ./models
-
-
-# install Python dependencies
-python3 -m pip install -r requirements.txt
-
-# convert the model to ggml FP16 format
-python3 convert.py models/mymodel/
-
-# [Optional] for models using BPE tokenizers
-python convert.py models/mymodel/ --vocab-type bpe
-
-# quantize the model to 4-bits (using Q4_K_M method)
-./quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M
-
-# update the gguf filetype to current version if older version is now unsupported
-./quantize ./models/mymodel/ggml-model-Q4_K_M.gguf ./models/mymodel/ggml-model-Q4_K_M-v2.gguf COPY
-```
-
-### Run the quantized model
-
-```bash
-# start inference on a gguf model
-./main -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128
-```
-
-When running the larger models, make sure you have enough disk space to store all the intermediate files.
-
-### Running on Windows with prebuilt binaries
-
-You will find prebuilt Windows binaries on the release page.
-
-Simply download and extract the latest zip package of choice: (e.g. `llama-b1380-bin-win-avx2-x64.zip`)
-
-Open a terminal/cmd window in the unzipped folder and place a pre-converted `.gguf` model file there. Test out the main example like so:
-
-```
-.\main -m llama-2-7b.Q4_0.gguf -n 128
-```
-
-### Memory/Disk Requirements
-
-As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same.
-
-| Model | Original size | Quantized size (Q4_0) |
-|------:|--------------:|-----------------------:|
-| 7B | 13 GB | 3.9 GB |
-| 13B | 24 GB | 7.8 GB |
-| 30B | 60 GB | 19.5 GB |
-| 65B | 120 GB | 38.5 GB |
-
-### Quantization
-
-Several quantization methods are supported. They differ in the resulting model disk size and inference speed.
-
-*(outdated)*
-
-| Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |
-|------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|
-| 7B | perplexity | 5.9066 | 6.1565 | 6.0912 | 5.9862 | 5.9481 | 5.9070 |
-| 7B | file size | 13.0G | 3.5G | 3.9G | 4.3G | 4.7G | 6.7G |
-| 7B | ms/tok @ 4th | 127 | 55 | 54 | 76 | 83 | 72 |
-| 7B | ms/tok @ 8th | 122 | 43 | 45 | 52 | 56 | 67 |
-| 7B | bits/weight | 16.0 | 4.5 | 5.0 | 5.5 | 6.0 | 8.5 |
-| 13B | perplexity | 5.2543 | 5.3860 | 5.3608 | 5.2856 | 5.2706 | 5.2548 |
-| 13B | file size | 25.0G | 6.8G | 7.6G | 8.3G | 9.1G | 13G |
-| 13B | ms/tok @ 4th | - | 103 | 105 | 148 | 160 | 131 |
-| 13B | ms/tok @ 8th | - | 73 | 82 | 98 | 105 | 128 |
-| 13B | bits/weight | 16.0 | 4.5 | 5.0 | 5.5 | 6.0 | 8.5 |
-
-- [k-quants](https://github.com/ggerganov/llama.cpp/pull/1684)
-- recent k-quants improvements and new i-quants
- - [#2707](https://github.com/ggerganov/llama.cpp/pull/2707)
- - [#2807](https://github.com/ggerganov/llama.cpp/pull/2807)
- - [#4773 - 2-bit i-quants (inference)](https://github.com/ggerganov/llama.cpp/pull/4773)
- - [#4856 - 2-bit i-quants (inference)](https://github.com/ggerganov/llama.cpp/pull/4856)
- - [#4861 - importance matrix](https://github.com/ggerganov/llama.cpp/pull/4861)
- - [#4872 - MoE models](https://github.com/ggerganov/llama.cpp/pull/4872)
- - [#4897 - 2-bit quantization](https://github.com/ggerganov/llama.cpp/pull/4897)
- - [#4930 - imatrix for all k-quants](https://github.com/ggerganov/llama.cpp/pull/4930)
- - [#4951 - imatrix on the GPU](https://github.com/ggerganov/llama.cpp/pull/4957)
- - [#4969 - imatrix for legacy quants](https://github.com/ggerganov/llama.cpp/pull/4969)
-  - [#4996 - k-quants tuning](https://github.com/ggerganov/llama.cpp/pull/4996)
- - [#5060 - Q3_K_XS](https://github.com/ggerganov/llama.cpp/pull/5060)
- - [#5196 - 3-bit i-quants](https://github.com/ggerganov/llama.cpp/pull/5196)
- - [quantization tuning](https://github.com/ggerganov/llama.cpp/pull/5320), [another one](https://github.com/ggerganov/llama.cpp/pull/5334), and [another one](https://github.com/ggerganov/llama.cpp/pull/5361)
-
-### Perplexity (measuring model quality)
-
-You can use the `perplexity` example to measure perplexity over a given prompt (lower perplexity is better).
-For more information, see [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity).
-
-The perplexity measurements in the table above are done against the `wikitext2` test dataset (https://paperswithcode.com/dataset/wikitext-2), with a context length of 512.
-The time per token is measured on a MacBook M1 Pro 32GB RAM using 4 and 8 threads.
-
-#### How to run
-
-1. Download/extract: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
-2. Run `./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
-3. Output:
-```
-perplexity : calculating perplexity over 655 chunks
-24.43 seconds per pass - ETA 4.45 hours
-[1]4.5970,[2]5.1807,[3]6.0382,...
-```
-And after 4.45 hours, you will have the final perplexity.
-
-### Interactive mode
-
-If you want a more ChatGPT-like experience, you can run in interactive mode by passing `-i` as a parameter.
-In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMa emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`.
-
-Here is an example of a few-shot interaction, invoked with the command
-
-```bash
-# default arguments using a 7B model
-./examples/chat.sh
-
-# advanced chat with a 13B model
-./examples/chat-13B.sh
-
-# custom arguments using a 13B model
-./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
-```
-
-Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `main` example program.
-
-
-
-### Persistent Interaction
-
-The prompt, user inputs, and model generations can be saved and resumed across calls to `./main` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.
-
-```bash
-# Start a new chat
-PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh
-
-# Resume that chat
-PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh
-
-# Start a different chat with the same prompt/model
-PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/another ./examples/chat-persistent.sh
-
-# Different prompt cache for different prompt/model
-PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
- CHAT_SAVE_DIR=./chat/bob ./examples/chat-persistent.sh
-```
-
-### Constrained output with grammars
-
-`llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only:
-
-```bash
-./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
-```
-
-The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
-
-For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community; please file any issues or feature requests on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.
-
-### Instruct mode
-
-1. First, download and place the `ggml` model into the `./models` folder
-2. Run the `main` tool like this:
-
-```
-./examples/alpaca.sh
-```
-
-Sample run:
-
-```
-== Running in interactive mode. ==
- - Press Ctrl+C to interject at any time.
- - Press Return to return control to LLaMa.
- - If you want to submit another line, end your input in '\'.
-
- Below is an instruction that describes a task. Write a response that appropriately completes the request.
-
-> How many letters are there in the English alphabet?
-There 26 letters in the English Alphabet
-> What is the most common way of transportation in Amsterdam?
-The majority (54%) are using public transit. This includes buses, trams and metros with over 100 lines throughout the city which make it very accessible for tourists to navigate around town as well as locals who commute by tram or metro on a daily basis
-> List 5 words that start with "ca".
-cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
+> [!IMPORTANT]
+> In order to use the library, you need to set build parameters in the consuming Xcode project or the consuming SPM package to enable the [Swift / C++ Interop](https://www.swift.org/documentation/cxx-interop/), introduced in Xcode 15 and Swift 5.9. Keep in mind that this also applies to nested dependencies: the configuration has to be set recursively for the entire dependency tree up to the llama.cpp SPM package.
+>
+> **For Xcode projects:**
+> - Open your project settings in Xcode by selecting *PROJECT_NAME > TARGET_NAME > Build Settings*.
+> - Within the *Build Settings*, search for the `C++ and Objective-C Interoperability` setting and set it to `C++ / Objective-C++`. This enables the project to use the C++ headers from llama.cpp.
>
+> **For SPM packages:**
+> - Open the `Package.swift` file of your SPM package
+> - Within the package `target` that consumes the llama.cpp package, add the `interoperabilityMode(_:)` Swift build setting as shown below:
+```swift
+/// Add the dependency on the Stanford BDHG llama.cpp SPM package
+dependencies: [
+ .package(url: "https://github.com/StanfordBDHG/llama.cpp", .upToNextMinor(from: "0.1.0"))
+],
+targets: [
+ .target(
+ name: "ExampleConsumingTarget",
+        /// Declare the target's dependency on llama.cpp
+ dependencies: [
+ .product(name: "llama", package: "llama.cpp")
+ ],
+ /// Important: Configure the `.interoperabilityMode(_:)` within the `swiftSettings`
+ swiftSettings: [
+ .interoperabilityMode(.Cxx)
+ ]
+ )
+]
```
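+
+Once the interoperability mode is configured, the `llama` product can be imported directly from Swift. The snippet below is only a minimal, illustrative sketch (not part of this package's documented API): it assumes a local GGUF model file and calls a few functions from the upstream llama.cpp C API (`llama_model_default_params`, `llama_load_model_from_file`, `llama_n_vocab`, `llama_free_model`), whose availability and exact signatures depend on the llama.cpp revision bundled by the package.
+
+```swift
+import llama
+
+/// Minimal sketch: load a GGUF model from `path` and return its vocabulary size.
+/// Error handling and actual inference are intentionally omitted.
+func vocabularySize(ofModelAt path: String) -> Int32? {
+    var params = llama_model_default_params()
+    params.n_gpu_layers = 0 // keep the model on the CPU for this example
+
+    guard let model = llama_load_model_from_file(path, params) else {
+        return nil
+    }
+    defer { llama_free_model(model) }
+
+    return llama_n_vocab(model)
+}
+```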
-### Obtaining and using the Facebook LLaMA 2 model
+## Contributing
-- Refer to [Facebook's LLaMA download page](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) if you want to access the model data.
-- Alternatively, if you want to save time and space, you can download already converted and quantized models from [TheBloke](https://huggingface.co/TheBloke), including:
- - [LLaMA 2 7B base](https://huggingface.co/TheBloke/Llama-2-7B-GGUF)
- - [LLaMA 2 13B base](https://huggingface.co/TheBloke/Llama-2-13B-GGUF)
- - [LLaMA 2 70B base](https://huggingface.co/TheBloke/Llama-2-70B-GGUF)
- - [LLaMA 2 7B chat](https://huggingface.co/TheBloke/Llama-2-7B-chat-GGUF)
- - [LLaMA 2 13B chat](https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF)
- - [LLaMA 2 70B chat](https://huggingface.co/TheBloke/Llama-2-70B-chat-GGUF)
+Contributions to this project are welcome. Please make sure to read the [contribution guidelines](https://github.com/StanfordBDHG/.github/blob/main/CONTRIBUTING.md) and the [contributor covenant code of conduct](https://github.com/StanfordBDHG/.github/blob/main/CODE_OF_CONDUCT.md) first.
+You can find a list of contributors in the [`CONTRIBUTORS.md`](https://github.com/StanfordBDHG/llama.cpp/blob/main/CONTRIBUTORS.md) file.
-### Seminal papers and background on the models
+## License
-If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT:
-- LLaMA:
- - [Introducing LLaMA: A foundational, 65-billion-parameter large language model](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/)
- - [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971)
-- GPT-3
- - [Language Models are Few-Shot Learners](https://arxiv.org/abs/2005.14165)
-- GPT-3.5 / InstructGPT / ChatGPT:
- - [Aligning language models to follow instructions](https://openai.com/research/instruction-following)
- - [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)
+This project is a fork of [llama.cpp](https://github.com/ggerganov/llama.cpp), which is licensed under the MIT License; all changes made in this fork remain under the MIT License. For more information about the license terms, see the [Licenses folder](https://github.com/StanfordBDHG/llama.cpp/blob/main/LICENSES).
-### Android
+## Our Research
-#### Building the Project using Android NDK
-You can easily run `llama.cpp` on an Android device with [termux](https://termux.dev/).
+For more information, check out our website at [biodesigndigitalhealth.stanford.edu](https://biodesigndigitalhealth.stanford.edu).
-First, install the essential packages for termux:
-```
-pkg install clang wget git cmake
-```
-Second, obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake:
-```
-$ mkdir build-android
-$ cd build-android
-$ export NDK=
-$ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
-$ make
-```
-Install [termux](https://termux.dev/) on your device and run `termux-setup-storage` to get access to your SD card.
-Finally, copy the `llama` binary and the model files to your device storage. Here is a demo of an interactive session running on Pixel 5 phone:
-
-https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
-
-#### Building the Project using Termux (F-Droid)
-Termux from F-Droid offers an alternative route to execute the project on an Android device. This method empowers you to construct the project right from within the terminal, negating the requirement for a rooted device or SD Card.
-
-Outlined below are the directives for installing the project using OpenBLAS and CLBlast. This combination is specifically designed to deliver peak performance on recent devices that feature a GPU.
-
-If you opt to utilize OpenBLAS, you'll need to install the corresponding package.
-```
-apt install libopenblas
-```
-
-Subsequently, if you decide to incorporate CLBlast, you'll first need to install the requisite OpenCL packages:
-```
-apt install ocl-icd opencl-headers opencl-clhpp clinfo
-```
-
-In order to compile CLBlast, you'll need to first clone the respective Git repository, which can be found at this URL: https://github.com/CNugteren/CLBlast. Alongside this, clone this repository into your home directory. Once this is done, navigate to the CLBlast folder and execute the commands detailed below:
-```
-cmake .
-make
-cp libclblast.so* $PREFIX/lib
-cp ./include/clblast.h ../llama.cpp
-```
-
-Following the previous steps, navigate to the LlamaCpp directory. To compile it with OpenBLAS and CLBlast, execute the command provided below:
-```
-cp /data/data/com.termux/files/usr/include/openblas/cblas.h .
-cp /data/data/com.termux/files/usr/include/openblas/openblas_config.h .
-make LLAMA_CLBLAST=1  # (sometimes you need to run this command twice)
-```
-
-Upon completion of the aforementioned steps, you will have successfully compiled the project. To run it using CLBlast, a slight adjustment is required: a command must be issued to direct the operations towards your device's physical GPU, rather than the virtual one. The necessary command is detailed below:
-```
-GGML_OPENCL_PLATFORM=0
-GGML_OPENCL_DEVICE=0
-export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH
-```
-
-(Note: some Android devices, like the Zenfone 8, need the following command instead - "export LD_LIBRARY_PATH=/system/vendor/lib64:$LD_LIBRARY_PATH". Source: https://www.reddit.com/r/termux/comments/kc3ynp/opencl_working_in_termux_more_in_comments/ )
-
-For easy and swift re-execution, consider documenting this final part in a .sh script file. This will enable you to rerun the process with minimal hassle.
-
-Place your desired model into the `~/llama.cpp/models/` directory and execute the `./main (...)` script.
-
-### Docker
-
-#### Prerequisites
-* Docker must be installed and running on your system.
-* Create a folder to store big models & intermediate files (ex. /llama/models)
-
-#### Images
-We have three Docker images available for this project:
-
-1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
-2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
-3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`)
-
-Additionally, there are the following images, similar to the above:
-
-- `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
-- `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
-- `ghcr.io/ggerganov/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`)
-- `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
-- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
-- `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
-
-The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library), you'll need to build the images locally for now.
-
-#### Usage
-
-The easiest way to download the models, convert them to ggml, and optimize them is with the `--all-in-one` command, which is available in the full docker image.
-
-Replace `/path/to/models` below with the actual path where you downloaded the models.
-
-```bash
-docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
-```
-
-On completion, you are ready to play!
-
-```bash
-docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
-```
-
-or with a light image:
-
-```bash
-docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
-```
-
-or with a server image:
-
-```bash
-docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggerganov/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512
-```
-
-### Docker With CUDA
-
-Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU enabled cloud, `cuBLAS` should be accessible inside the container.
-
-#### Building Locally
-
-```bash
-docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
-docker build -t local/llama.cpp:light-cuda -f .devops/main-cuda.Dockerfile .
-docker build -t local/llama.cpp:server-cuda -f .devops/server-cuda.Dockerfile .
-```
-
-You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.
-
-The defaults are:
-
-- `CUDA_VERSION` set to `11.7.1`
-- `CUDA_DOCKER_ARCH` set to `all`
-
-The resulting images are essentially the same as the non-CUDA images:
-
-1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
-2. `local/llama.cpp:light-cuda`: This image only includes the main executable file.
-3. `local/llama.cpp:server-cuda`: This image only includes the server executable file.
-
-#### Usage
-
-After building locally, usage is similar to the non-CUDA examples, but you'll need to add the `--gpus` flag. You will also want to use the `--n-gpu-layers` flag.
-
-```bash
-docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
-docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
-docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
-```
-
-### Contributing
-
-- Contributors can open PRs
-- Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
-- Collaborators will be invited based on contributions
-- Any help with managing issues and PRs is very appreciated!
-- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)
-- A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532)
-
-### Coding guidelines
-
-- Avoid adding third-party dependencies, extra files, extra headers, etc.
-- Always consider cross-compatibility with other operating systems and architectures
-- Avoid fancy looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple
-- There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit
-- Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
-- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
-- Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
-- Matrix multiplication is unconventional: [`z = ggml_mul_mat(ctx, x, y)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means `zT = x @ yT`
-
-### Docs
-
-- [main](./examples/main/README.md)
-- [server](./examples/server/README.md)
-- [jeopardy](./examples/jeopardy/README.md)
-- [BLIS](./docs/BLIS.md)
-- [Performance troubleshooting](./docs/token_generation_performance_tips.md)
-- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
-- [GBNF grammars](./grammars/README.md)
+
+
\ No newline at end of file
diff --git a/common/common.h b/common/common.h
index 25003df26..2a3597fcc 100644
--- a/common/common.h
+++ b/common/common.h
@@ -32,10 +32,22 @@
} while(0)
// build info
+// If a macro is not defined (the macros can be passed in via Xcode build settings), fall back to declaring the corresponding value as an external variable.
+#ifndef LLAMA_BUILD_NUMBER
extern int LLAMA_BUILD_NUMBER;
+#endif
+
+#ifndef LLAMA_COMMIT
extern char const *LLAMA_COMMIT;
+#endif
+
+#ifndef LLAMA_COMPILER
extern char const *LLAMA_COMPILER;
+#endif
+
+#ifndef LLAMA_BUILD_TARGET
extern char const *LLAMA_BUILD_TARGET;
+#endif
//
// CLI argument parsing
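
The guarded declarations above allow the build metadata to be injected as preprocessor macros when llama.cpp is compiled through Xcode or SwiftPM, falling back to the extern variables that the upstream Makefile/CMake build provides. As a purely illustrative sketch (the target layout and macro values below are placeholders, not the configuration used by this package), such definitions could be supplied from a `Package.swift` target as follows:

```swift
// Hypothetical illustration only: inject build metadata as C/C++ macros via SwiftPM.
// The values are placeholders; string-valued macros need embedded quotes so that
// they expand to C string literals.
.target(
    name: "llama",
    cxxSettings: [
        .define("LLAMA_BUILD_NUMBER", to: "0"),
        .define("LLAMA_COMMIT", to: "\"unknown\""),
        .define("LLAMA_COMPILER", to: "\"Xcode\""),
        .define("LLAMA_BUILD_TARGET", to: "\"apple-generic\"")
    ]
)
```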
diff --git a/kompute b/kompute
index 4565194ed..d1e3b0953 160000
--- a/kompute
+++ b/kompute
@@ -1 +1 @@
-Subproject commit 4565194ed7c32d1d2efa32ceab4d3c6cae006306
+Subproject commit d1e3b0953cf66acc94b2e29693e221427b2c1f3f
diff --git a/llama-module.modulemap b/llama-module.modulemap
new file mode 100644
index 000000000..384d4f163
--- /dev/null
+++ b/llama-module.modulemap
@@ -0,0 +1,21 @@
+//
+// This source file is part of the Stanford Biodesign Digital Health Group open-source organization
+//
+// SPDX-FileCopyrightText: 2022 Stanford University and the project authors (see CONTRIBUTORS.md)
+//
+// SPDX-License-Identifier: MIT
+//
+
+framework module llama {
+ umbrella header "llama.h"
+
+ header "common.h"
+ header "sampling.h"
+ header "log.h"
+ header "grammar-parser.h"
+ header "tokenize.h"
+ header "vector.h"
+
+ export *
+ module * { export * }
+}
diff --git a/llama.xcframework/Info.plist b/llama.xcframework/Info.plist
new file mode 100644
index 000000000..d9c78a472
--- /dev/null
+++ b/llama.xcframework/Info.plist
@@ -0,0 +1,44 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+  <key>AvailableLibraries</key>
+  <array>
+    <dict>
+      <key>BinaryPath</key>
+      <string>llama.framework/llama</string>
+      <key>LibraryIdentifier</key>
+      <string>ios-arm64</string>
+      <key>LibraryPath</key>
+      <string>llama.framework</string>
+      <key>SupportedArchitectures</key>
+      <array>
+        <string>arm64</string>
+      </array>
+      <key>SupportedPlatform</key>
+      <string>ios</string>
+    </dict>
+    <dict>
+      <key>BinaryPath</key>
+      <string>llama.framework/llama</string>
+      <key>LibraryIdentifier</key>
+      <string>ios-arm64_x86_64-simulator</string>
+      <key>LibraryPath</key>
+      <string>llama.framework</string>
+      <key>SupportedArchitectures</key>
+      <array>
+        <string>arm64</string>
+        <string>x86_64</string>
+      </array>
+      <key>SupportedPlatform</key>
+      <string>ios</string>
+      <key>SupportedPlatformVariant</key>
+      <string>simulator</string>
+    </dict>
+  </array>
+  <key>CFBundlePackageType</key>
+  <string>XFWK</string>
+  <key>XCFrameworkFormatVersion</key>
+  <string>1.0</string>
+</dict>
+</plist>
diff --git a/llama.xcframework/ios-arm64/llama.framework/Headers/common.h b/llama.xcframework/ios-arm64/llama.framework/Headers/common.h
new file mode 100644
index 000000000..c782995e8
--- /dev/null
+++ b/llama.xcframework/ios-arm64/llama.framework/Headers/common.h
@@ -0,0 +1,270 @@
+// Various helper functions and utilities
+
+#pragma once
+
+#include "llama.h"
+
+#include "sampling.h"
+
+#define LOG_NO_FILE_LINE_FUNCTION
+#include "log.h"
+
+#include <cmath>
+#include <string>
+#include <vector>
+#include <random>
+#include <thread>
+#include <unordered_map>
+#include <tuple>
+
+#ifdef _WIN32
+#define DIRECTORY_SEPARATOR '\\'
+#else
+#define DIRECTORY_SEPARATOR '/'
+#endif // _WIN32
+
+#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
+#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
+
+#define print_build_info() do { \
+ fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
+ fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
+} while(0)
+
+// build info
+// If a macro is not defined (the macros can be passed in via Xcode build settings), fall back to declaring the corresponding value as an external variable.
+#ifndef LLAMA_BUILD_NUMBER
+extern int LLAMA_BUILD_NUMBER;
+#endif
+
+#ifndef LLAMA_COMMIT
+extern char const *LLAMA_COMMIT;
+#endif
+
+#ifndef LLAMA_COMPILER
+extern char const *LLAMA_COMPILER;
+#endif
+
+#ifndef LLAMA_BUILD_TARGET
+extern char const *LLAMA_BUILD_TARGET;
+#endif
+
+//
+// CLI argument parsing
+//
+int32_t get_num_physical_cores();
+
+struct gpt_params {
+ uint32_t seed = -1; // RNG seed
+
+ int32_t n_threads = get_num_physical_cores();
+ int32_t n_threads_draft = -1;
+ int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
+ int32_t n_threads_batch_draft = -1;
+ int32_t n_predict = -1; // new tokens to predict
+ int32_t n_ctx = 512; // context size
+ int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
+ int32_t n_keep = 0; // number of tokens to keep from initial prompt
+ int32_t n_draft = 8; // number of tokens to draft during speculative decoding
+ int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
+ int32_t n_parallel = 1; // number of parallel sequences to decode
+ int32_t n_sequences = 1; // number of sequences to decode
+ float p_accept = 0.5f; // speculative decoding accept probability
+ float p_split = 0.1f; // speculative decoding split probability
+ int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
+ int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
+ llama_split_mode split_mode = LLAMA_SPLIT_LAYER; // how to split the model across GPUs
+ int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
+ float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
+ int32_t n_beams = 0; // if non-zero then use beam search of given width.
+ int32_t grp_attn_n = 1; // group-attention factor
+ int32_t grp_attn_w = 512; // group-attention width
+ int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
+ float rope_freq_base = 0.0f; // RoPE base frequency
+ float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
+ float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
+ float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
+ float yarn_beta_fast = 32.0f; // YaRN low correction dim
+ float yarn_beta_slow = 1.0f; // YaRN high correction dim
+ int32_t yarn_orig_ctx = 0; // YaRN original context length
+ int8_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; // TODO: better to be int32_t for alignment
+ // pinging @cebtenzzre
+
+    // sampling parameters
+ struct llama_sampling_params sparams;
+
+ std::string model = "models/7B/ggml-model-f16.gguf"; // model path
+ std::string model_draft = ""; // draft model for speculative decoding
+ std::string model_alias = "unknown"; // model alias
+ std::string prompt = "";
+ std::string prompt_file = ""; // store the external prompt file name
+ std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
+ std::string input_prefix = ""; // string to prefix user inputs with
+ std::string input_suffix = ""; // string to suffix user inputs with
+    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
+ std::string logdir = ""; // directory in which to save YAML log files
+ std::string logits_file = ""; // file for saving *all* logits
+
+    std::vector<llama_model_kv_override> kv_overrides;
+
+ // TODO: avoid tuple, use struct
+    std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
+ std::string lora_base = ""; // base model path for the lora adapter
+
+ int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
+ int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
+ // (which is more convenient to use for plotting)
+ //
+ bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
+ size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
+
+ bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
+    size_t winogrande_tasks = 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
+
+ bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
+ size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
+
+ bool kl_divergence = false; // compute KL-divergence
+
+ bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
+ bool random_prompt = false; // do not randomize prompt if none provided
+ bool use_color = false; // use color to distinguish generations and inputs
+ bool interactive = false; // interactive mode
+ bool chatml = false; // chatml mode (used for models trained on chatml syntax)
+ bool prompt_cache_all = false; // save user input and generations to prompt cache
+ bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
+
+ bool embedding = false; // get only sentence embedding
+ bool escape = false; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
+ bool interactive_first = false; // wait for user input immediately
+ bool multiline_input = false; // reverse the usage of `\`
+ bool simple_io = false; // improves compatibility with subprocesses and limited consoles
+ bool cont_batching = false; // insert new sequences for decoding on-the-fly
+
+ bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
+ bool ignore_eos = false; // ignore generated EOS tokens
+ bool instruct = false; // instruction mode (used for Alpaca models)
+ bool logits_all = false; // return logits for all tokens in the batch
+ bool use_mmap = true; // use mmap for faster loads
+ bool use_mlock = false; // use mlock to keep model in memory
+ bool numa = false; // attempt optimizations that help on some NUMA systems
+ bool verbose_prompt = false; // print prompt tokens before generation
+ bool display_prompt = true; // print prompt before generation
+ bool infill = false; // use infill mode
+ bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
+ bool no_kv_offload = false; // disable KV offloading
+
+ std::string cache_type_k = "f16"; // KV cache data type for the K
+ std::string cache_type_v = "f16"; // KV cache data type for the V
+
+ // multimodal models (see examples/llava)
+ std::string mmproj = ""; // path to multimodal projector
+ std::string image = ""; // path to an image file
+};
+
+bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
+
+bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
+
+void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
+
+std::string get_system_info(const gpt_params & params);
+
+std::string gpt_random_prompt(std::mt19937 & rng);
+
+void process_escapes(std::string& input);
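+
+// A minimal, editorial usage sketch (not part of the upstream header): parse the
+// command-line arguments into a gpt_params instance and print the detected system info.
+//
+//   int main(int argc, char ** argv) {
+//       gpt_params params;
+//       if (!gpt_params_parse(argc, argv, params)) {
+//           return 1;
+//       }
+//       fprintf(stderr, "%s\n", get_system_info(params).c_str());
+//       return 0;
+//   }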
+
+//
+// String parsing
+//
+
+std::string parse_samplers_input(std::string input);
+
+//
+// Model utils
+//
+
+// TODO: avoid tuple, use struct
+std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
+
+struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
+struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
+
+// Batch utils
+
+void llama_batch_clear(struct llama_batch & batch);
+
+void llama_batch_add(
+ struct llama_batch & batch,
+ llama_token id,
+ llama_pos pos,
+    const std::vector<llama_seq_id> & seq_ids,
+ bool logits);
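+
+// Editorial sketch (hedged; assumes llama_batch_init(), llama_batch_free(), and
+// llama_decode() from llama.h): fill a batch with one sequence of tokens, requesting
+// logits only for the last position.
+//
+//   llama_batch batch = llama_batch_init(512, 0, 1);
+//   llama_batch_clear(batch);
+//   for (size_t i = 0; i < tokens.size(); ++i) {
+//       llama_batch_add(batch, tokens[i], (llama_pos) i, { 0 }, i == tokens.size() - 1);
+//   }
+//   // llama_decode(ctx, batch);
+//   llama_batch_free(batch);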
+
+//
+// Vocab utils
+//
+
+// tokenizes a string into a vector of tokens
+// should work similar to Python's `tokenizer.encode`
+std::vector<llama_token> llama_tokenize(
+ const struct llama_context * ctx,
+ const std::string & text,
+ bool add_bos,
+ bool special = false);
+
+std::vector<llama_token> llama_tokenize(
+ const struct llama_model * model,
+ const std::string & text,
+ bool add_bos,
+ bool special = false);
+
+// converts a single token into its text piece
+// should work similar to Python's `tokenizer.id_to_piece`
+std::string llama_token_to_piece(
+ const struct llama_context * ctx,
+ llama_token token);
+
+// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
+// that takes into account the tokenizer type and decides how to handle the leading space
+//
+// detokenizes a vector of tokens into a string
+// should work similar to Python's `tokenizer.decode`
+// removes the leading space from the first non-BOS token
+std::string llama_detokenize_spm(
+ llama_context * ctx,
+        const std::vector<llama_token> & tokens);
+
+// detokenizes a vector of tokens into a string
+// should work similar to Python's `tokenizer.decode`
+std::string llama_detokenize_bpe(
+ llama_context * ctx,
+        const std::vector<llama_token> & tokens);
+
+// Uses the value from the model metadata if possible, otherwise
+// defaults to true when model type is SPM, otherwise false.
+bool llama_should_add_bos_token(const llama_model * model);
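+
+// Editorial sketch (hedged): tokenize a prompt and turn the tokens back into text using
+// the helpers above; which detokenizer applies depends on the model's tokenizer (SPM vs BPE).
+//
+//   const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+//   std::vector<llama_token> toks = llama_tokenize(ctx, "Hello world", add_bos);
+//   std::string text = llama_detokenize_spm(ctx, toks); // or llama_detokenize_bpe(ctx, toks)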
+
+//
+// YAML utils
+//
+
+bool create_directory_with_parents(const std::string & path);
+void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data);
+void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data);
+void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data);
+std::string get_sortable_timestamp();
+
+void dump_non_result_info_yaml(
+ FILE * stream, const gpt_params & params, const llama_context * lctx,
+    const std::string & timestamp, const std::vector<llama_token> & prompt_tokens, const char * model_desc);
+
+//
+// KV cache utils
+//
+
+// Dump the KV cache view with the number of sequences per cell.
+void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);
+
+// Dump the KV cache view showing individual sequences in each cell (long output).
+void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
diff --git a/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-alloc.h b/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-alloc.h
new file mode 100644
index 000000000..4e5997521
--- /dev/null
+++ b/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-alloc.h
@@ -0,0 +1,94 @@
+#pragma once
+
+#include "ggml.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct ggml_backend;
+struct ggml_backend_buffer;
+struct ggml_backend_buffer_type;
+
+//
+// Legacy API
+//
+
+typedef struct ggml_allocr * ggml_allocr_t;
+
+// initialize allocator for use with CPU backend only
+GGML_API ggml_allocr_t ggml_allocr_new(void * data, size_t size, size_t alignment);
+GGML_API ggml_allocr_t ggml_allocr_new_measure(size_t alignment);
+
+// initialize allocator for use with ggml-backend
+GGML_API ggml_allocr_t ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer);
+GGML_API ggml_allocr_t ggml_allocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
+GGML_API ggml_allocr_t ggml_allocr_new_measure_from_backend(struct ggml_backend * backend);
+
+GGML_API struct ggml_backend_buffer * ggml_allocr_get_buffer(ggml_allocr_t alloc);
+
+// tell the allocator to parse nodes following the order described in the list
+// call this if your graph is optimized to execute out-of-order
+GGML_API void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n);
+
+GGML_API void ggml_allocr_free (ggml_allocr_t alloc);
+GGML_API bool ggml_allocr_is_measure (ggml_allocr_t alloc);
+GGML_API void ggml_allocr_reset (ggml_allocr_t alloc);
+GGML_API void ggml_allocr_alloc (ggml_allocr_t alloc, struct ggml_tensor * tensor);
+GGML_API size_t ggml_allocr_max_size (ggml_allocr_t alloc);
+
+GGML_API size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph);
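+
+// Editorial sketch of the typical two-pass flow with the legacy API (hedged; in practice
+// the graph is rebuilt between the measure pass and the real allocation pass):
+//
+//   ggml_allocr_t measure = ggml_allocr_new_measure(/*alignment*/ 32);
+//   size_t mem_size = ggml_allocr_alloc_graph(measure, graph);
+//   ggml_allocr_free(measure);
+//
+//   void * buf = malloc(mem_size);
+//   ggml_allocr_t alloc = ggml_allocr_new(buf, mem_size, /*alignment*/ 32);
+//   ggml_allocr_alloc_graph(alloc, graph); // tensor data now points into buf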
+
+//
+// ggml-backend v2 API
+//
+
+// Separate tensor and graph allocator objects
+// This is necessary for multi-backend allocation because the graph allocator needs to use multiple tensor allocators
+// The original API is kept as a wrapper around the new API
+
+// Tensor allocator
+typedef struct ggml_tallocr * ggml_tallocr_t;
+
+GGML_API ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment);
+GGML_API ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment);
+GGML_API ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size);
+GGML_API ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
+GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer);
+GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft);
+GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend);
+
+GGML_API struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t talloc);
+
+GGML_API void ggml_tallocr_free (ggml_tallocr_t talloc);
+GGML_API bool ggml_tallocr_is_measure (ggml_tallocr_t talloc);
+GGML_API void ggml_tallocr_reset (ggml_tallocr_t talloc);
+GGML_API void ggml_tallocr_alloc (ggml_tallocr_t talloc, struct ggml_tensor * tensor);
+GGML_API size_t ggml_tallocr_max_size (ggml_tallocr_t talloc);
+
+
+// Graph allocator
+typedef struct ggml_gallocr * ggml_gallocr_t;
+
+GGML_API ggml_gallocr_t ggml_gallocr_new(void);
+GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
+
+GGML_API void ggml_gallocr_set_parse_seq(ggml_gallocr_t galloc, const int * list, int n);
+GGML_API size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, struct ggml_cgraph * graph);
+
+// Allocate tensors from the allocators given by the hash table
+GGML_API void ggml_gallocr_alloc_graph_n(
+ ggml_gallocr_t galloc,
+ struct ggml_cgraph * graph,
+ struct ggml_hash_set hash_set,
+ ggml_tallocr_t * hash_node_talloc);
+
+
+// Utils
+// Create a buffer and allocate all the tensors in a ggml_context
+GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, struct ggml_backend_buffer_type * buft);
+GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, struct ggml_backend * backend);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h b/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h
new file mode 100644
index 000000000..ab4ad773f
--- /dev/null
+++ b/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h
@@ -0,0 +1,209 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-alloc.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
+ typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
+ typedef struct ggml_backend * ggml_backend_t;
+ typedef void * ggml_backend_graph_plan_t;
+
+ //
+ // Backend buffer
+ //
+
+ // buffer type
+ GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
+ GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
+ GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
+ GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
+ GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
+ GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
+
+ // buffer
+ enum ggml_backend_buffer_usage {
+ GGML_BACKEND_BUFFER_USAGE_ANY = 0,
+ GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
+ };
+
+ GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
+ GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
+ GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
+ GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
+ GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+ GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
+ GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+ GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
+ GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
+ GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+ GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
+ GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
+
+ //
+ // Backend
+ //
+
+
+ GGML_API const char * ggml_backend_name(ggml_backend_t backend);
+ GGML_API void ggml_backend_free(ggml_backend_t backend);
+
+ GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
+ GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
+ GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
+
+ GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+ GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+
+ GGML_API GGML_CALL void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+ GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+
+ GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
+
+ GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+
+ GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+ GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+ GGML_API bool ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+ GGML_API bool ggml_backend_supports_op (ggml_backend_t backend, const struct ggml_tensor * op);
+
+ // tensor copy between different backends
+ GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
+ GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); // automatic fallback to sync copy
+
+ //
+ // CPU backend
+ //
+
+ GGML_API ggml_backend_t ggml_backend_cpu_init(void);
+
+ GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
+ GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
+
+ // Create a backend buffer from an existing pointer
+ GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
+
+ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
+
+#ifdef GGML_USE_CPU_HBM
+ GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
+#endif
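+
+    // Editorial sketch (hedged): run a graph on the CPU backend using only functions
+    // declared here and in ggml-alloc.h; `ctx` (a ggml context created with no_alloc = true)
+    // and `graph` are assumed to exist already.
+    //
+    //   ggml_backend_t cpu = ggml_backend_cpu_init();
+    //   ggml_backend_cpu_set_n_threads(cpu, 4);
+    //   ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, cpu); // from ggml-alloc.h
+    //   ggml_backend_graph_compute(cpu, graph);
+    //   ggml_backend_buffer_free(buf);
+    //   ggml_backend_free(cpu);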
+
+ //
+ // Backend registry
+ //
+
+ // The backend registry is a registry of all the available backends, and allows initializing backends in a generic way
+
+ GGML_API size_t ggml_backend_reg_get_count(void);
+ GGML_API size_t ggml_backend_reg_find_by_name(const char * name);
+ GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params]
+ GGML_API const char * ggml_backend_reg_get_name(size_t i);
+ GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
+ GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
+ GGML_API ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size);
+
+ //
+ // Backend scheduler
+ //
+
+ // The backend scheduler allows for multiple backends to be used together
+ // Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
+ // The backends are selected based on:
+ // - the backend that supports the operation
+ // - the location of the pre-allocated tensors (e.g. the weights)
+ /*
+ Example usage:
+
+ sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, num_backends);
+ // sched is initialized with measure allocators and cannot be used until allocated with a measure graph
+
+ // initialize buffers from a measure graph
+ measure_graph = build_graph(sched); // use the allocr to allocate inputs as needed
+
+ // in build_graph:
+ build_graph(...) {
+ // allocating tensors in a specific backend (optional, recommended: pre-allocate inputs in a different buffer)
+ alloc_cpu = ggml_backend_sched_get_allocr(sched, backend_cpu);
+ ggml_allocr_alloc(alloc_cpu, tensor);
+
+ // manually assigning nodes to a backend (optional, shouldn't be needed in most cases)
+ struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
+ ggml_backend_sched_set_node_backend(sched, node, backend_gpu);
+ }
+
+ // allocate backend buffers from measure graph
+ ggml_backend_sched_init_measure(sched, measure_graph);
+
+ // the scheduler is now ready to compute graphs
+
+ // compute
+ graph = build_graph(sched);
+ ggml_backend_sched_graph_compute(sched, graph);
+ */
+
+ struct ggml_backend_sched;
+ typedef struct ggml_backend_sched * ggml_backend_sched_t;
+
+ // when ask == true, the scheduler wants to know if the user wants to observe this node
+ // this allows the scheduler to batch nodes together in order to evaluate them in a single call
+ //
+ // when ask == false, the scheduler is passing the node tensor to the user for observation
+ // if the user returns false, the scheduler will cancel the graph compute
+ //
+ typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
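+
+    // Editorial sketch of a callback (hedged): observe only nodes whose name starts with
+    // "probe_"; during the ask pass it opts in, during the observe pass it inspects data.
+    //
+    //   static bool probe_cb(struct ggml_tensor * t, bool ask, void * user_data) {
+    //       if (ask) {
+    //           return strncmp(t->name, "probe_", 6) == 0; // request observation of these nodes
+    //       }
+    //       // t->data now holds the computed values; returning false aborts the compute
+    //       return true;
+    //   }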
+
+ // Initialize a backend scheduler
+ GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
+ GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
+ // Initialize backend buffers from a measure graph
+ GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
+ // Get the number of splits of the last graph
+ GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
+
+ GGML_API ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend);
+ GGML_API ggml_backend_buffer_t ggml_backend_sched_get_buffer (ggml_backend_sched_t sched, ggml_backend_t backend);
+
+ GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
+ GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
+
+ // Allocate and compute graph on the backend scheduler
+ GGML_API void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+
+ // Reset all assignments and allocators - must be called before using the sched allocators to allocate inputs
+ GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
+
+ // Set a callback to be called for each resulting node during graph compute
+ GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
+
+ //
+ // Utils
+ //
+
+ struct ggml_backend_graph_copy {
+ ggml_backend_buffer_t buffer;
+ struct ggml_context * ctx_allocated;
+ struct ggml_context * ctx_unallocated;
+ struct ggml_cgraph * graph;
+ };
+
+ // Copy a graph to a different backend
+ GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
+ GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
+
+ typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
+
+ // Compare the output of two backends
+ GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
+
+ // Tensor initialization
+ GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
+ GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h b/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h
new file mode 100644
index 000000000..1c4976271
--- /dev/null
+++ b/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h
@@ -0,0 +1,2301 @@
+#pragma once
+
+//
+// GGML Tensor Library
+//
+// This documentation is still a work in progress.
+// If you wish some specific topics to be covered, feel free to drop a comment:
+//
+// https://github.com/ggerganov/whisper.cpp/issues/40
+//
+// ## Overview
+//
+// This library implements:
+//
+// - a set of tensor operations
+// - automatic differentiation
+// - basic optimization algorithms
+//
+// The aim of this library is to provide a minimalistic approach for various machine learning tasks. This includes,
+// but is not limited to, the following:
+//
+// - linear regression
+// - support vector machines
+// - neural networks
+//
+// The library allows the user to define a certain function using the available tensor operations. This function
+// definition is represented internally via a computation graph. Each tensor operation in the function definition
+// corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the
+// function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized
+// using one of the available optimization algorithms.
+//
+// For example, here we define the function: f(x) = a*x^2 + b
+//
+// {
+// struct ggml_init_params params = {
+// .mem_size = 16*1024*1024,
+// .mem_buffer = NULL,
+// };
+//
+// // memory allocation happens here
+// struct ggml_context * ctx = ggml_init(params);
+//
+// struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
+//
+// ggml_set_param(ctx, x); // x is an input variable
+//
+// struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
+// struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
+// struct ggml_tensor * x2 = ggml_mul(ctx, x, x);
+// struct ggml_tensor * f = ggml_add(ctx, ggml_mul(ctx, a, x2), b);
+//
+// ...
+// }
+//
+// Notice that the function definition above does not involve any actual computation. The computation is performed only
+// when the user explicitly requests it. For example, to compute the function's value at x = 2.0:
+//
+// {
+// ...
+//
+// struct ggml_cgraph * gf = ggml_new_graph(ctx);
+// ggml_build_forward_expand(gf, f);
+//
+// // set the input variable and parameter values
+// ggml_set_f32(x, 2.0f);
+// ggml_set_f32(a, 3.0f);
+// ggml_set_f32(b, 4.0f);
+//
+// ggml_graph_compute_with_ctx(ctx, &gf, n_threads);
+//
+// printf("f = %f\n", ggml_get_f32_1d(f, 0));
+//
+// ...
+// }
+//
+// The actual computation is performed in the ggml_graph_compute() function.
+//
+// The ggml_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the
+// ggml_init() function. You have to be careful not to exceed the memory buffer size. Therefore, you have to know
+// in advance how much memory you need for your computation. Alternatively, you can allocate a large enough buffer
+// and, after defining the computation graph, call the ggml_used_mem() function to find out how much memory was
+// actually needed.
+//
+// The ggml_set_param() function marks a tensor as an input variable. This is used by the automatic
+// differentiation and optimization algorithms.
+//
+// The described approach allows the user to define the function graph once and then compute its forward or backward graphs
+// multiple times. All computations will use the same memory buffer allocated in the ggml_init() function. This way
+// the user can avoid the memory allocation overhead at runtime.
+//
+// The library supports multi-dimensional tensors - up to 4 dimensions. The FP16 and FP32 data types are first class
+// citizens, but in theory the library can be extended to support FP8 and integer data types.
+//
+// Each tensor operation produces a new tensor. Initially the library was envisioned to support only the use of unary
+// and binary operations. Most of the available operations fall into one of these two categories. With time, it became
+// clear that the library needs to support more complex operations. The way to support these operations is not clear
+// yet, but a few examples are demonstrated in the following operations:
+//
+// - ggml_permute()
+// - ggml_conv_1d_1s()
+// - ggml_conv_1d_2s()
+//
+// For each tensor operator, the library implements a forward and backward computation function. The forward function
+// computes the output tensor value given the input tensor values. The backward function computes the adjoint of the
+// input tensors given the adjoint of the output tensor. For a detailed explanation of what this means, take a
+// calculus class, or watch the following video:
+//
+// What is Automatic Differentiation?
+// https://www.youtube.com/watch?v=wG_nF1awSSY
+//
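+// A hedged editorial sketch (not upstream documentation): continuing the f(x) = a*x^2 + b
+// example above, the backward graph can be built from the forward graph and computed to
+// obtain df/dx in x->grad. Function names follow the declarations later in this header.
+//
+//   {
+//       struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, /*grads*/ true);
+//       ggml_build_forward_expand(gf, f);
+//
+//       struct ggml_cgraph * gb = ggml_graph_dup(ctx, gf);
+//       ggml_build_backward_expand(ctx, gf, gb, /*keep*/ false);
+//
+//       ggml_graph_reset(gf);                // zero the gradients
+//       ggml_set_f32(f->grad, 1.0f);         // seed the output gradient
+//       ggml_graph_compute_with_ctx(ctx, gb, n_threads);
+//
+//       printf("df/dx = %f\n", ggml_get_f32_1d(x->grad, 0));
+//   }
+//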
+//
+// ## Tensor data (struct ggml_tensor)
+//
+// The tensors are stored in memory via the ggml_tensor struct. The structure provides information about the size of
+// the tensor, the data type, and the memory buffer where the tensor data is stored. Additionally, it contains
+// pointers to the "source" tensors - i.e. the tensors that were used to compute the current tensor. For example:
+//
+// {
+// struct ggml_tensor * c = ggml_add(ctx, a, b);
+//
+// assert(c->src[0] == a);
+// assert(c->src[1] == b);
+// }
+//
+// The multi-dimensional tensors are stored in row-major order. The ggml_tensor struct contains fields for the
+// number of elements in each dimension ("ne") as well as the number of bytes ("nb", a.k.a. stride). This makes it
+// possible to store tensors that are not contiguous in memory, which is useful for operations such as transposition and
+// permutation. All tensor operations have to take the stride into account and not assume that the tensor is
+// contiguous in memory.
+//
+// The data of the tensor is accessed via the "data" pointer. For example:
+//
+// {
+// const int nx = 2;
+// const int ny = 3;
+//
+// struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, ny);
+//
+// for (int y = 0; y < ny; y++) {
+// for (int x = 0; x < nx; x++) {
+// *(float *) ((char *) a->data + y*a->nb[1] + x*a->nb[0]) = x + y;
+// }
+// }
+//
+// ...
+// }
+//
+// Alternatively, there are helper functions, such as ggml_get_f32_1d() and ggml_set_f32_1d() that can be used.
+//
+// ## The matrix multiplication operator (ggml_mul_mat)
+//
+// TODO
+//
+//
+// ## Multi-threading
+//
+// TODO
+//
+//
+// ## Overview of ggml.c
+//
+// TODO
+//
+//
+// ## SIMD optimizations
+//
+// TODO
+//
+//
+// ## Debugging ggml
+//
+// TODO
+//
+//
+
+#ifdef GGML_SHARED
+# if defined(_WIN32) && !defined(__MINGW32__)
+# ifdef GGML_BUILD
+# define GGML_API __declspec(dllexport)
+# else
+# define GGML_API __declspec(dllimport)
+# endif
+# else
+# define GGML_API __attribute__ ((visibility ("default")))
+# endif
+#else
+# define GGML_API
+#endif
+
+#ifdef GGML_MULTIPLATFORM
+# if defined(_WIN32)
+# define GGML_CALL
+# else
+# define GGML_CALL __attribute__((__ms_abi__))
+# endif
+#else
+# define GGML_CALL
+#endif
+
+// TODO: support for clang
+#ifdef __GNUC__
+# define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
+#elif defined(_MSC_VER)
+# define GGML_DEPRECATED(func, hint) __declspec(deprecated(hint)) func
+#else
+# define GGML_DEPRECATED(func, hint) func
+#endif
+
+#ifndef __GNUC__
+# define GGML_ATTRIBUTE_FORMAT(...)
+#elif defined(__MINGW32__)
+# define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#else
+# define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+#endif
+
+#include <stdint.h>
+#include <stddef.h>
+#include <stdbool.h>
+
+#define GGML_FILE_MAGIC 0x67676d6c // "ggml"
+#define GGML_FILE_VERSION 1
+
+#define GGML_QNT_VERSION 2 // bump this on quantization format changes
+#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
+
+#define GGML_MAX_DIMS 4
+#define GGML_MAX_PARAMS 2048
+#define GGML_MAX_CONTEXTS 64
+#define GGML_MAX_SRC 10
+#ifndef GGML_MAX_NAME
+#define GGML_MAX_NAME 64
+#endif
+#define GGML_MAX_OP_PARAMS 64
+#define GGML_DEFAULT_N_THREADS 4
+#define GGML_DEFAULT_GRAPH_SIZE 2048
+#if UINTPTR_MAX == 0xFFFFFFFF
+ #define GGML_MEM_ALIGN 4
+#else
+ #define GGML_MEM_ALIGN 16
+#endif
+
+#define GGML_EXIT_SUCCESS 0
+#define GGML_EXIT_ABORTED 1
+
+#define GGUF_MAGIC "GGUF"
+
+#define GGUF_VERSION 3
+
+#define GGUF_DEFAULT_ALIGNMENT 32
+
+#define GGML_UNUSED(x) (void)(x)
+
+#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
+
+#define GGML_ASSERT(x) \
+ do { \
+ if (!(x)) { \
+ fflush(stdout); \
+ fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
+ ggml_print_backtrace(); \
+ abort(); \
+ } \
+ } while (0)
+
+#ifndef NDEBUG
+#define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
+#elif defined(__GNUC__)
+#define GGML_UNREACHABLE() __builtin_unreachable()
+#elif defined(_MSC_VER)
+#define GGML_UNREACHABLE() __assume(0)
+#else
+#define GGML_UNREACHABLE() ((void) 0)
+#endif
+
+// used to copy the number of elements and stride in bytes of tensors into local variables.
+// main purpose is to reduce code duplication and improve readability.
+//
+// example:
+//
+// GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
+// GGML_TENSOR_LOCALS(size_t, nb1, src1, nb);
+//
+#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
+ const type prefix##0 = (pointer)->array[0]; \
+ GGML_UNUSED(prefix##0);
+#define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \
+ GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \
+ const type prefix##1 = (pointer)->array[1]; \
+ GGML_UNUSED(prefix##1);
+#define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
+ GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \
+ const type prefix##2 = (pointer)->array[2]; \
+ GGML_UNUSED(prefix##2);
+#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
+ GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \
+ const type prefix##3 = (pointer)->array[3]; \
+ GGML_UNUSED(prefix##3);
+
+#define GGML_TENSOR_UNARY_OP_LOCALS \
+ GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
+ GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
+ GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
+ GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
+
+#define GGML_TENSOR_BINARY_OP_LOCALS \
+ GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
+ GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
+ GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
+ GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \
+ GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
+ GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__ARM_NEON) && defined(__CUDACC__)
+ typedef half ggml_fp16_t;
+#elif defined(__ARM_NEON) && !defined(_MSC_VER)
+ typedef __fp16 ggml_fp16_t;
+#else
+ typedef uint16_t ggml_fp16_t;
+#endif
+
+ // convert FP16 <-> FP32
+ GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
+ GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
+
+ GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n);
+ GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n);
+
+ struct ggml_object;
+ struct ggml_context;
+
+ enum ggml_type {
+ GGML_TYPE_F32 = 0,
+ GGML_TYPE_F16 = 1,
+ GGML_TYPE_Q4_0 = 2,
+ GGML_TYPE_Q4_1 = 3,
+ // GGML_TYPE_Q4_2 = 4, support has been removed
+ // GGML_TYPE_Q4_3 (5) support has been removed
+ GGML_TYPE_Q5_0 = 6,
+ GGML_TYPE_Q5_1 = 7,
+ GGML_TYPE_Q8_0 = 8,
+ GGML_TYPE_Q8_1 = 9,
+ // k-quantizations
+ GGML_TYPE_Q2_K = 10,
+ GGML_TYPE_Q3_K = 11,
+ GGML_TYPE_Q4_K = 12,
+ GGML_TYPE_Q5_K = 13,
+ GGML_TYPE_Q6_K = 14,
+ GGML_TYPE_Q8_K = 15,
+ GGML_TYPE_IQ2_XXS = 16,
+ GGML_TYPE_IQ2_XS = 17,
+ GGML_TYPE_I8,
+ GGML_TYPE_I16,
+ GGML_TYPE_I32,
+ GGML_TYPE_COUNT,
+ };
+
+ // precision
+ enum ggml_prec {
+ GGML_PREC_DEFAULT,
+ GGML_PREC_F32,
+ };
+
+ enum ggml_backend_type {
+ GGML_BACKEND_CPU = 0,
+ GGML_BACKEND_GPU = 10,
+ GGML_BACKEND_GPU_SPLIT = 20,
+ };
+
+ // model file types
+ enum ggml_ftype {
+ GGML_FTYPE_UNKNOWN = -1,
+ GGML_FTYPE_ALL_F32 = 0,
+ GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
+ GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
+ GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
+ GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+ GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
+ GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
+ GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
+ GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
+ GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors
+ GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
+ GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
+ GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
+ GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
+ GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors
+ };
+
+ // available tensor operations:
+ enum ggml_op {
+ GGML_OP_NONE = 0,
+
+ GGML_OP_DUP,
+ GGML_OP_ADD,
+ GGML_OP_ADD1,
+ GGML_OP_ACC,
+ GGML_OP_SUB,
+ GGML_OP_MUL,
+ GGML_OP_DIV,
+ GGML_OP_SQR,
+ GGML_OP_SQRT,
+ GGML_OP_LOG,
+ GGML_OP_SUM,
+ GGML_OP_SUM_ROWS,
+ GGML_OP_MEAN,
+ GGML_OP_ARGMAX,
+ GGML_OP_REPEAT,
+ GGML_OP_REPEAT_BACK,
+ GGML_OP_CONCAT,
+ GGML_OP_SILU_BACK,
+ GGML_OP_NORM, // normalize
+ GGML_OP_RMS_NORM,
+ GGML_OP_RMS_NORM_BACK,
+ GGML_OP_GROUP_NORM,
+
+ GGML_OP_MUL_MAT,
+ GGML_OP_MUL_MAT_ID,
+ GGML_OP_OUT_PROD,
+
+ GGML_OP_SCALE,
+ GGML_OP_SET,
+ GGML_OP_CPY,
+ GGML_OP_CONT,
+ GGML_OP_RESHAPE,
+ GGML_OP_VIEW,
+ GGML_OP_PERMUTE,
+ GGML_OP_TRANSPOSE,
+ GGML_OP_GET_ROWS,
+ GGML_OP_GET_ROWS_BACK,
+ GGML_OP_DIAG,
+ GGML_OP_DIAG_MASK_INF,
+ GGML_OP_DIAG_MASK_ZERO,
+ GGML_OP_SOFT_MAX,
+ GGML_OP_SOFT_MAX_BACK,
+ GGML_OP_ROPE,
+ GGML_OP_ROPE_BACK,
+ GGML_OP_ALIBI,
+ GGML_OP_CLAMP,
+ GGML_OP_CONV_TRANSPOSE_1D,
+ GGML_OP_IM2COL,
+ GGML_OP_CONV_TRANSPOSE_2D,
+ GGML_OP_POOL_1D,
+ GGML_OP_POOL_2D,
+ GGML_OP_UPSCALE, // nearest interpolate
+ GGML_OP_PAD,
+ GGML_OP_ARGSORT,
+ GGML_OP_LEAKY_RELU,
+
+ GGML_OP_FLASH_ATTN,
+ GGML_OP_FLASH_FF,
+ GGML_OP_FLASH_ATTN_BACK,
+ GGML_OP_WIN_PART,
+ GGML_OP_WIN_UNPART,
+ GGML_OP_GET_REL_POS,
+ GGML_OP_ADD_REL_POS,
+
+ GGML_OP_UNARY,
+
+ GGML_OP_MAP_UNARY,
+ GGML_OP_MAP_BINARY,
+
+ GGML_OP_MAP_CUSTOM1_F32,
+ GGML_OP_MAP_CUSTOM2_F32,
+ GGML_OP_MAP_CUSTOM3_F32,
+
+ GGML_OP_MAP_CUSTOM1,
+ GGML_OP_MAP_CUSTOM2,
+ GGML_OP_MAP_CUSTOM3,
+
+ GGML_OP_CROSS_ENTROPY_LOSS,
+ GGML_OP_CROSS_ENTROPY_LOSS_BACK,
+
+ GGML_OP_COUNT,
+ };
+
+ enum ggml_unary_op {
+ GGML_UNARY_OP_ABS,
+ GGML_UNARY_OP_SGN,
+ GGML_UNARY_OP_NEG,
+ GGML_UNARY_OP_STEP,
+ GGML_UNARY_OP_TANH,
+ GGML_UNARY_OP_ELU,
+ GGML_UNARY_OP_RELU,
+ GGML_UNARY_OP_GELU,
+ GGML_UNARY_OP_GELU_QUICK,
+ GGML_UNARY_OP_SILU,
+ GGML_UNARY_OP_HARDSWISH,
+ GGML_UNARY_OP_HARDSIGMOID,
+
+ GGML_UNARY_OP_COUNT,
+ };
+
+ enum ggml_object_type {
+ GGML_OBJECT_TENSOR,
+ GGML_OBJECT_GRAPH,
+ GGML_OBJECT_WORK_BUFFER
+ };
+
+ enum ggml_log_level {
+ GGML_LOG_LEVEL_ERROR = 2,
+ GGML_LOG_LEVEL_WARN = 3,
+ GGML_LOG_LEVEL_INFO = 4,
+ GGML_LOG_LEVEL_DEBUG = 5
+ };
+
+ // ggml object
+ struct ggml_object {
+ size_t offs;
+ size_t size;
+
+ struct ggml_object * next;
+
+ enum ggml_object_type type;
+
+ char padding[4];
+ };
+
+ static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
+
+ // n-dimensional tensor
+ struct ggml_tensor {
+ enum ggml_type type;
+ enum ggml_backend_type backend;
+
+ struct ggml_backend_buffer * buffer;
+
+ int64_t ne[GGML_MAX_DIMS]; // number of elements
+ size_t nb[GGML_MAX_DIMS]; // stride in bytes:
+ // nb[0] = ggml_type_size(type)
+ // nb[1] = nb[0] * (ne[0] / ggml_blck_size(type)) + padding
+ // nb[i] = nb[i-1] * ne[i-1]
+
+ // compute data
+ enum ggml_op op;
+
+ // op params - allocated as int32_t for alignment
+ int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
+
+ bool is_param;
+
+ struct ggml_tensor * grad;
+ struct ggml_tensor * src[GGML_MAX_SRC];
+
+ // performance
+ int perf_runs;
+ int64_t perf_cycles;
+ int64_t perf_time_us;
+
+ struct ggml_tensor * view_src;
+ size_t view_offs;
+
+ void * data;
+
+ char name[GGML_MAX_NAME];
+
+ void * extra; // extra things e.g. for ggml-cuda.cu
+
+ char padding[8];
+ };
+
+ static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
+
+ // the compute plan that needs to be prepared for ggml_graph_compute()
+ // since https://github.com/ggerganov/ggml/issues/287
+ struct ggml_cplan {
+ size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
+ uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
+
+ int n_threads;
+
+ // abort ggml_graph_compute when true
+ bool (*abort_callback)(void * data);
+ void * abort_callback_data;
+ };
+
+ enum ggml_cgraph_eval_order {
+ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
+ GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
+ GGML_CGRAPH_EVAL_ORDER_COUNT
+ };
+
+ struct ggml_hash_set {
+ size_t size;
+ struct ggml_tensor ** keys;
+ };
+
+ // computation graph
+ struct ggml_cgraph {
+ int size;
+ int n_nodes;
+ int n_leafs;
+
+ struct ggml_tensor ** nodes;
+ struct ggml_tensor ** grads;
+ struct ggml_tensor ** leafs;
+
+ struct ggml_hash_set visited_hash_table;
+
+ enum ggml_cgraph_eval_order order;
+
+ // performance
+ int perf_runs;
+ int64_t perf_cycles;
+ int64_t perf_time_us;
+ };
+
+ // scratch buffer
+ struct ggml_scratch {
+ size_t offs;
+ size_t size;
+ void * data;
+ };
+
+ struct ggml_init_params {
+ // memory pool
+ size_t mem_size; // bytes
+ void * mem_buffer; // if NULL, memory will be allocated internally
+ bool no_alloc; // don't allocate memory for the tensor data
+ };
+
+
+ // compute types
+
+ // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
+ // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
+ enum ggml_task_type {
+ GGML_TASK_INIT = 0,
+ GGML_TASK_COMPUTE,
+ GGML_TASK_FINALIZE,
+ };
+
+ struct ggml_compute_params {
+ enum ggml_task_type type;
+
+ // ith = thread index, nth = number of threads
+ int ith, nth;
+
+ // work buffer for all threads
+ size_t wsize;
+ void * wdata;
+ };
+
+ // misc
+
+ GGML_API void ggml_time_init(void); // call this once at the beginning of the program
+ GGML_API int64_t ggml_time_ms(void);
+ GGML_API int64_t ggml_time_us(void);
+ GGML_API int64_t ggml_cycles(void);
+ GGML_API int64_t ggml_cycles_per_ms(void);
+
+ GGML_API void ggml_print_backtrace(void);
+
+ GGML_API void ggml_numa_init(void); // call once for better performance on NUMA systems
+ GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
+
+ GGML_API void ggml_print_object (const struct ggml_object * obj);
+ GGML_API void ggml_print_objects(const struct ggml_context * ctx);
+
+ GGML_API GGML_CALL int64_t ggml_nelements (const struct ggml_tensor * tensor);
+ GGML_API GGML_CALL int64_t ggml_nrows (const struct ggml_tensor * tensor);
+ GGML_API GGML_CALL size_t ggml_nbytes (const struct ggml_tensor * tensor);
+ GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
+
+ GGML_API GGML_CALL int ggml_blck_size(enum ggml_type type);
+ GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
+ GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
+
+ GGML_DEPRECATED(
+ GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
+ "use ggml_row_size() instead");
+
+ GGML_API GGML_CALL const char * ggml_type_name(enum ggml_type type);
+ GGML_API GGML_CALL const char * ggml_op_name (enum ggml_op op);
+ GGML_API const char * ggml_op_symbol(enum ggml_op op);
+
+ GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
+ GGML_API GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
+
+ GGML_API GGML_CALL size_t ggml_element_size(const struct ggml_tensor * tensor);
+
+ GGML_API GGML_CALL bool ggml_is_quantized(enum ggml_type type);
+
+ // TODO: temporary until model loading of ggml examples is refactored
+ GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
+
+ GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
+ GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor);
+ GGML_API GGML_CALL bool ggml_is_permuted (const struct ggml_tensor * tensor);
+ GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
+ GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
+ GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
+ GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
+ GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
+
+ GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
+
+ // use this to compute the memory overhead of a tensor
+ GGML_API size_t ggml_tensor_overhead(void);
+
+ // main
+
+ GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
+ GGML_API void ggml_free(struct ggml_context * ctx);
+
+ GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
+
+ GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
+ GGML_API bool ggml_get_no_alloc(struct ggml_context * ctx);
+ GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
+
+ GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx);
+ GGML_API size_t ggml_get_mem_size (const struct ggml_context * ctx);
+ GGML_API size_t ggml_get_max_tensor_size(const struct ggml_context * ctx);
+
+ GGML_API struct ggml_tensor * ggml_new_tensor(
+ struct ggml_context * ctx,
+ enum ggml_type type,
+ int n_dims,
+ const int64_t *ne);
+
+ GGML_API struct ggml_tensor * ggml_new_tensor_1d(
+ struct ggml_context * ctx,
+ enum ggml_type type,
+ int64_t ne0);
+
+ GGML_API struct ggml_tensor * ggml_new_tensor_2d(
+ struct ggml_context * ctx,
+ enum ggml_type type,
+ int64_t ne0,
+ int64_t ne1);
+
+ GGML_API struct ggml_tensor * ggml_new_tensor_3d(
+ struct ggml_context * ctx,
+ enum ggml_type type,
+ int64_t ne0,
+ int64_t ne1,
+ int64_t ne2);
+
+ GGML_API struct ggml_tensor * ggml_new_tensor_4d(
+ struct ggml_context * ctx,
+ enum ggml_type type,
+ int64_t ne0,
+ int64_t ne1,
+ int64_t ne2,
+ int64_t ne3);
+
+ GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
+ GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
+
+ GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
+ GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
+
+ // Context tensor enumeration and lookup
+ GGML_API struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx);
+ GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor);
+ GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
+
+ GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
+ GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
+ GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
+
+ // Converts a flat index into coordinates
+ GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
+
+ GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
+ GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
+
+ GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
+ GGML_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
+
+ GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
+ GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
+
+ GGML_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
+ GGML_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
+
+ GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
+ GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
+
+ GGML_API GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
+
+ GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
+ GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
+ GGML_ATTRIBUTE_FORMAT(2, 3)
+ GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...);
+
+ //
+ // operations on tensors with backpropagation
+ //
+
+ GGML_API struct ggml_tensor * ggml_dup(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ // in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_dup_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_add(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ GGML_API struct ggml_tensor * ggml_add_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ GGML_API struct ggml_tensor * ggml_add_cast(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ enum ggml_type type);
+
+ GGML_API struct ggml_tensor * ggml_add1(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ GGML_API struct ggml_tensor * ggml_add1_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ // dst = a
+ // view(dst, nb1, nb2, nb3, offset) += b
+ // return dst
+ GGML_API struct ggml_tensor * ggml_acc(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t nb1,
+ size_t nb2,
+ size_t nb3,
+ size_t offset);
+
+ GGML_API struct ggml_tensor * ggml_acc_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t nb1,
+ size_t nb2,
+ size_t nb3,
+ size_t offset);
+
+ GGML_API struct ggml_tensor * ggml_sub(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ GGML_API struct ggml_tensor * ggml_sub_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ GGML_API struct ggml_tensor * ggml_mul(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ GGML_API struct ggml_tensor * ggml_mul_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ GGML_API struct ggml_tensor * ggml_div(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ GGML_API struct ggml_tensor * ggml_div_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ GGML_API struct ggml_tensor * ggml_sqr(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_sqr_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_sqrt(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_sqrt_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_log(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_log_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ // return scalar
+ GGML_API struct ggml_tensor * ggml_sum(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ // sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d]
+ GGML_API struct ggml_tensor * ggml_sum_rows(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ // mean along rows
+ GGML_API struct ggml_tensor * ggml_mean(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ // argmax along rows
+ GGML_API struct ggml_tensor * ggml_argmax(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+    // if a is the same shape as b, and a is not a parameter, return a
+ // otherwise, return a new tensor: repeat(a) to fit in b
+ GGML_API struct ggml_tensor * ggml_repeat(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ // sums repetitions in a into shape of b
+ GGML_API struct ggml_tensor * ggml_repeat_back(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ // concat a and b on dim 2
+ // used in stable-diffusion
+ GGML_API struct ggml_tensor * ggml_concat(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ GGML_API struct ggml_tensor * ggml_abs(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_abs_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_sgn(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_sgn_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_neg(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_neg_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_step(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_step_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_tanh(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_tanh_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_elu(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_elu_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_relu(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_leaky_relu(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a, float negative_slope, bool inplace);
+
+ GGML_API struct ggml_tensor * ggml_relu_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_gelu(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_gelu_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_gelu_quick(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_gelu_quick_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_silu(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_silu_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ // a - x
+ // b - dy
+ GGML_API struct ggml_tensor * ggml_silu_back(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ // hardswish(x) = x * relu6(x + 3) / 6
+ GGML_API struct ggml_tensor * ggml_hardswish(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ // hardsigmoid(x) = relu6(x + 3) / 6
+ GGML_API struct ggml_tensor * ggml_hardsigmoid(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ // normalize along rows
+ GGML_API struct ggml_tensor * ggml_norm(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ float eps);
+
+ GGML_API struct ggml_tensor * ggml_norm_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ float eps);
+
+ GGML_API struct ggml_tensor * ggml_rms_norm(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ float eps);
+
+ GGML_API struct ggml_tensor * ggml_rms_norm_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ float eps);
+
+ // group normalize along ne0*ne1*n_groups
+ // used in stable-diffusion
+ // TODO: eps is hardcoded to 1e-6 for now
+ GGML_API struct ggml_tensor * ggml_group_norm(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_groups);
+
+ GGML_API struct ggml_tensor * ggml_group_norm_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_groups);
+
+ // a - x
+ // b - dy
+ GGML_API struct ggml_tensor * ggml_rms_norm_back(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ float eps);
+
+ // A: k columns, n rows => [ne03, ne02, n, k]
+ // B: k columns, m rows (i.e. we transpose it internally) => [ne03 * x, ne02 * y, m, k]
+ // result is n columns, m rows => [ne03 * x, ne02 * y, m, n]
+ GGML_API struct ggml_tensor * ggml_mul_mat(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
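+
+    // Editorial shape example (hedged), with ne[] listed fastest dimension first:
+    // a: ne = {k, n}, b: ne = {k, m}  ->  result: ne = {n, m}
+    //
+    //   struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 2); // k = 4, n = 2
+    //   struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 3); // k = 4, m = 3
+    //   struct ggml_tensor * r = ggml_mul_mat(ctx, a, b);                      // r->ne = {2, 3, 1, 1}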
+
+ // change the precision of a matrix multiplication
+ // set to GGML_PREC_F32 for higher precision (useful for phi-2)
+ GGML_API void ggml_mul_mat_set_prec(
+ struct ggml_tensor * a,
+ enum ggml_prec prec);
+
+ // indirect matrix multiplication
+ // ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
+ GGML_API struct ggml_tensor * ggml_mul_mat_id(
+ struct ggml_context * ctx,
+ struct ggml_tensor * const as[],
+ int n_as,
+ struct ggml_tensor * ids,
+ int id,
+ struct ggml_tensor * b);
+
+ // A: m columns, n rows,
+ // B: p columns, n rows,
+ // result is m columns, p rows
+ GGML_API struct ggml_tensor * ggml_out_prod(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ //
+ // operations on tensors without backpropagation
+ //
+
+ GGML_API struct ggml_tensor * ggml_scale(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ float s);
+
+ // in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_scale_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ float s);
+
+    // b -> view(a,offset,nb1,nb2,nb3), return modified a
+ GGML_API struct ggml_tensor * ggml_set(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t nb1,
+ size_t nb2,
+ size_t nb3,
+ size_t offset);
+
+    // b -> view(a,offset,nb1,nb2,nb3), return view(a)
+ GGML_API struct ggml_tensor * ggml_set_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t nb1,
+ size_t nb2,
+ size_t nb3,
+ size_t offset);
+
+ GGML_API struct ggml_tensor * ggml_set_1d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t offset);
+
+ GGML_API struct ggml_tensor * ggml_set_1d_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t offset);
+
+    // b -> view(a,offset,nb1,nb2,nb3), return modified a
+ GGML_API struct ggml_tensor * ggml_set_2d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t nb1,
+ size_t offset);
+
+    // b -> view(a,offset,nb1,nb2,nb3), return view(a)
+ GGML_API struct ggml_tensor * ggml_set_2d_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ size_t nb1,
+ size_t offset);
+
+ // a -> b, return view(b)
+ GGML_API struct ggml_tensor * ggml_cpy(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ GGML_API struct ggml_tensor * ggml_cast(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ enum ggml_type type);
+
+ // make contiguous
+ GGML_API struct ggml_tensor * ggml_cont(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ // make contiguous, with new shape
+ GGML_API struct ggml_tensor * ggml_cont_1d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int64_t ne0);
+
+ GGML_API struct ggml_tensor * ggml_cont_2d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int64_t ne0,
+ int64_t ne1);
+
+ GGML_API struct ggml_tensor * ggml_cont_3d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int64_t ne0,
+ int64_t ne1,
+ int64_t ne2);
+
+ GGML_API struct ggml_tensor * ggml_cont_4d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int64_t ne0,
+ int64_t ne1,
+ int64_t ne2,
+ int64_t ne3);
+
+ // return view(a), b specifies the new shape
+ // TODO: when we start computing gradient, make a copy instead of view
+ GGML_API struct ggml_tensor * ggml_reshape(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ // return view(a)
+ // TODO: when we start computing gradient, make a copy instead of view
+ GGML_API struct ggml_tensor * ggml_reshape_1d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int64_t ne0);
+
+ GGML_API struct ggml_tensor * ggml_reshape_2d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int64_t ne0,
+ int64_t ne1);
+
+ // return view(a)
+ // TODO: when we start computing gradient, make a copy instead of view
+ GGML_API struct ggml_tensor * ggml_reshape_3d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int64_t ne0,
+ int64_t ne1,
+ int64_t ne2);
+
+ GGML_API struct ggml_tensor * ggml_reshape_4d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int64_t ne0,
+ int64_t ne1,
+ int64_t ne2,
+ int64_t ne3);
+
+ // offset in bytes
+ GGML_API struct ggml_tensor * ggml_view_1d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int64_t ne0,
+ size_t offset);
+
+ GGML_API struct ggml_tensor * ggml_view_2d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int64_t ne0,
+ int64_t ne1,
+ size_t nb1, // row stride in bytes
+ size_t offset);
+
+ GGML_API struct ggml_tensor * ggml_view_3d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int64_t ne0,
+ int64_t ne1,
+ int64_t ne2,
+ size_t nb1, // row stride in bytes
+ size_t nb2, // slice stride in bytes
+ size_t offset);
+
+ GGML_API struct ggml_tensor * ggml_view_4d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int64_t ne0,
+ int64_t ne1,
+ int64_t ne2,
+ int64_t ne3,
+ size_t nb1, // row stride in bytes
+ size_t nb2, // slice stride in bytes
+ size_t nb3,
+ size_t offset);
+
+ GGML_API struct ggml_tensor * ggml_permute(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int axis0,
+ int axis1,
+ int axis2,
+ int axis3);
+
+ // alias for ggml_permute(ctx, a, 1, 0, 2, 3)
+ GGML_API struct ggml_tensor * ggml_transpose(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ // supports 3D: a->ne[2] == b->ne[1]
+ GGML_API struct ggml_tensor * ggml_get_rows(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ GGML_API struct ggml_tensor * ggml_get_rows_back(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ struct ggml_tensor * c);
+
+ GGML_API struct ggml_tensor * ggml_diag(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ // set elements above the diagonal to -INF
+ GGML_API struct ggml_tensor * ggml_diag_mask_inf(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past);
+
+ // in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past);
+
+ // set elements above the diagonal to 0
+ GGML_API struct ggml_tensor * ggml_diag_mask_zero(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past);
+
+ // in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past);
+
+ GGML_API struct ggml_tensor * ggml_soft_max(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ // in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_soft_max_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ // fused soft_max(a*scale + mask)
+ // mask is optional
+ GGML_API struct ggml_tensor * ggml_soft_max_ext(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * mask,
+ float scale);
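+    // example (illustrative attention-style usage; `ctx0`, `kq`, `kq_mask` and `n_embd_head` are assumed to exist):
+    //
+    //   kq = ggml_soft_max_ext(ctx0, kq, kq_mask, 1.0f/sqrtf((float) n_embd_head));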
+
+ GGML_API struct ggml_tensor * ggml_soft_max_back(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ // in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_soft_max_back_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ // rotary position embedding
+    // if mode & 1 != 0, skip n_past elements (DEPRECATED)
+    // if mode & 2 != 0, GPT-NeoX style
+    // if mode & 4 != 0, ChatGLM style
+ //
+ // b is an int32 vector with size a->ne[2], it contains the positions
+ GGML_API struct ggml_tensor * ggml_rope(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ int n_dims,
+ int mode,
+ int n_ctx);
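+    // example (illustrative sketch; `ctx0`, a tensor `cur` with ne = [n_embd_head, n_head, n_tokens],
+    // the int32 position vector `pos` of length n_tokens, and `n_rot`, `n_ctx` are assumed to exist):
+    //
+    //   cur = ggml_rope(ctx0, cur, pos, n_rot, /*mode =*/ 0, n_ctx);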
+
+ // in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_rope_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ int n_dims,
+ int mode,
+ int n_ctx);
+
+ // custom RoPE
+ GGML_API struct ggml_tensor * ggml_rope_custom(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ int n_dims,
+ int mode,
+ int n_ctx,
+ int n_orig_ctx,
+ float freq_base,
+ float freq_scale,
+ float ext_factor,
+ float attn_factor,
+ float beta_fast,
+ float beta_slow);
+
+ // in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ int n_dims,
+ int mode,
+ int n_ctx,
+ int n_orig_ctx,
+ float freq_base,
+ float freq_scale,
+ float ext_factor,
+ float attn_factor,
+ float beta_fast,
+ float beta_slow);
+
+ // compute correction dims for YaRN RoPE scaling
+ GGML_CALL void ggml_rope_yarn_corr_dims(
+ int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
+
+ // xPos RoPE, in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ int n_dims,
+ float base,
+ bool down);
+
+    // rotary position embedding backward, i.e. compute dx from dy
+ // a - dy
+ GGML_API struct ggml_tensor * ggml_rope_back(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ int n_dims,
+ int mode,
+ int n_ctx,
+ int n_orig_ctx,
+ float freq_base,
+ float freq_scale,
+ float ext_factor,
+ float attn_factor,
+ float beta_fast,
+ float beta_slow,
+ float xpos_base,
+ bool xpos_down);
+
+ // alibi position embedding
+ // in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_alibi(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past,
+ int n_head,
+ float bias_max);
+
+ // clamp
+ // in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_clamp(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ float min,
+ float max);
+
+ GGML_API struct ggml_tensor * ggml_im2col(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ int s0,
+ int s1,
+ int p0,
+ int p1,
+ int d0,
+ int d1,
+ bool is_2D);
+
+ GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ int s0,
+ int s1,
+ int p0,
+ int p1,
+ int d0,
+ int d1);
+
+ GGML_API struct ggml_tensor * ggml_conv_1d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ int s0, // stride
+ int p0, // padding
+ int d0); // dilation
+
+ // conv_1d with padding = half
+ // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
+ GGML_API struct ggml_tensor* ggml_conv_1d_ph(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ int s,
+ int d);
+
+ GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ int s0,
+ int p0,
+ int d0);
+
+ GGML_API struct ggml_tensor * ggml_conv_2d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ int s0,
+ int s1,
+ int p0,
+ int p1,
+ int d0,
+ int d1);
+
+
+ // kernel size is a->ne[0] x a->ne[1]
+ // stride is equal to kernel size
+ // padding is zero
+ // example:
+ // a: 16 16 3 768
+ // b: 1024 1024 3 1
+ // res: 64 64 768 1
+ // used in sam
+ GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ // kernel size is a->ne[0] x a->ne[1]
+ // stride is 1
+ // padding is half
+ // example:
+ // a: 3 3 256 256
+ // b: 64 64 256 1
+ // res: 64 64 256 1
+ // used in sam
+ GGML_API struct ggml_tensor * ggml_conv_2d_s1_ph(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ int stride);
+
+ enum ggml_op_pool {
+ GGML_OP_POOL_MAX,
+ GGML_OP_POOL_AVG,
+ GGML_OP_POOL_COUNT,
+ };
+
+ GGML_API struct ggml_tensor * ggml_pool_1d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ enum ggml_op_pool op,
+ int k0, // kernel size
+ int s0, // stride
+ int p0); // padding
+
+ // the result will have 2*p0 padding for the first dimension
+ // and 2*p1 padding for the second dimension
+ GGML_API struct ggml_tensor * ggml_pool_2d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ enum ggml_op_pool op,
+ int k0,
+ int k1,
+ int s0,
+ int s1,
+ float p0,
+ float p1);
+
+    // nearest-neighbor interpolation
+ // used in stable-diffusion
+ GGML_API struct ggml_tensor * ggml_upscale(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int scale_factor);
+
+ // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
+ GGML_API struct ggml_tensor * ggml_pad(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int p0,
+ int p1,
+ int p2,
+ int p3);
+
+ // sort rows
+ enum ggml_sort_order {
+ GGML_SORT_ASC,
+ GGML_SORT_DESC,
+ };
+
+ GGML_API struct ggml_tensor * ggml_argsort(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ enum ggml_sort_order order);
+
+ // top k elements per row
+ GGML_API struct ggml_tensor * ggml_top_k(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int k);
+
+ GGML_API struct ggml_tensor * ggml_flash_attn(
+ struct ggml_context * ctx,
+ struct ggml_tensor * q,
+ struct ggml_tensor * k,
+ struct ggml_tensor * v,
+ bool masked);
+
+ GGML_API struct ggml_tensor * ggml_flash_attn_back(
+ struct ggml_context * ctx,
+ struct ggml_tensor * q,
+ struct ggml_tensor * k,
+ struct ggml_tensor * v,
+ struct ggml_tensor * d,
+ bool masked);
+
+ GGML_API struct ggml_tensor * ggml_flash_ff(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b0,
+ struct ggml_tensor * b1,
+ struct ggml_tensor * c0,
+ struct ggml_tensor * c1);
+
+ // partition into non-overlapping windows with padding if needed
+ // example:
+ // a: 768 64 64 1
+ // w: 14
+ // res: 768 14 14 25
+ // used in sam
+ GGML_API struct ggml_tensor * ggml_win_part(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int w);
+
+ // reverse of ggml_win_part
+ // used in sam
+ GGML_API struct ggml_tensor * ggml_win_unpart(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int w0,
+ int h0,
+ int w);
+
+ GGML_API struct ggml_tensor * ggml_unary(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ enum ggml_unary_op op);
+
+ GGML_API struct ggml_tensor * ggml_unary_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ enum ggml_unary_op op);
+
+ // used in sam
+ GGML_API struct ggml_tensor * ggml_get_rel_pos(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int qh,
+ int kh);
+
+ // used in sam
+ GGML_API struct ggml_tensor * ggml_add_rel_pos(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * pw,
+ struct ggml_tensor * ph);
+
+ GGML_API struct ggml_tensor * ggml_add_rel_pos_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * pw,
+ struct ggml_tensor * ph);
+
+ // custom operators
+
+ typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
+ typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
+
+ typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *);
+ typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
+ typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
+
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_f32(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ ggml_unary_op_f32_t fun),
+ "use ggml_map_custom1 instead");
+
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ ggml_unary_op_f32_t fun),
+ "use ggml_map_custom1_inplace instead");
+
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_f32(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ ggml_binary_op_f32_t fun),
+ "use ggml_map_custom2 instead");
+
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ ggml_binary_op_f32_t fun),
+ "use ggml_map_custom2_inplace instead");
+
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_f32(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ ggml_custom1_op_f32_t fun),
+ "use ggml_map_custom1 instead");
+
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ ggml_custom1_op_f32_t fun),
+ "use ggml_map_custom1_inplace instead");
+
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_f32(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ ggml_custom2_op_f32_t fun),
+ "use ggml_map_custom2 instead");
+
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ ggml_custom2_op_f32_t fun),
+ "use ggml_map_custom2_inplace instead");
+
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_f32(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ struct ggml_tensor * c,
+ ggml_custom3_op_f32_t fun),
+ "use ggml_map_custom3 instead");
+
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ struct ggml_tensor * c,
+ ggml_custom3_op_f32_t fun),
+ "use ggml_map_custom3_inplace instead");
+
+ // custom operators v2
+
+ typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata);
+ typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
+ typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
+
+ #define GGML_N_TASKS_MAX -1
+
+ GGML_API struct ggml_tensor * ggml_map_custom1(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ ggml_custom1_op_t fun,
+ int n_tasks,
+ void * userdata);
+
+ GGML_API struct ggml_tensor * ggml_map_custom1_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ ggml_custom1_op_t fun,
+ int n_tasks,
+ void * userdata);
+
+ GGML_API struct ggml_tensor * ggml_map_custom2(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ ggml_custom2_op_t fun,
+ int n_tasks,
+ void * userdata);
+
+ GGML_API struct ggml_tensor * ggml_map_custom2_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ ggml_custom2_op_t fun,
+ int n_tasks,
+ void * userdata);
+
+ GGML_API struct ggml_tensor * ggml_map_custom3(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ struct ggml_tensor * c,
+ ggml_custom3_op_t fun,
+ int n_tasks,
+ void * userdata);
+
+ GGML_API struct ggml_tensor * ggml_map_custom3_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ struct ggml_tensor * c,
+ ggml_custom3_op_t fun,
+ int n_tasks,
+ void * userdata);
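+    // example (illustrative sketch of a user-defined op for ggml_map_custom1; `my_scale`, `x`
+    // and the contiguous-f32 assumption are hypothetical):
+    //
+    //   static void my_scale(struct ggml_tensor * dst, const struct ggml_tensor * a,
+    //                        int ith, int nth, void * userdata) {
+    //       const float   s  = *(const float *) userdata;
+    //       const int64_t n  = ggml_nelements(dst);
+    //       const int64_t dr = (n + nth - 1)/nth;              // elements per thread
+    //       const int64_t i0 = dr*ith;
+    //       const int64_t i1 = i0 + dr < n ? i0 + dr : n;
+    //       for (int64_t i = i0; i < i1; ++i) {
+    //           ((float *) dst->data)[i] = s*((const float *) a->data)[i];
+    //       }
+    //   }
+    //
+    //   // `s` must remain valid until the graph is computed
+    //   static float s = 2.0f;
+    //   struct ggml_tensor * y = ggml_map_custom1(ctx, x, my_scale, GGML_N_TASKS_MAX, &s);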
+
+ // loss function
+
+ GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ struct ggml_tensor * c);
+
+ //
+ // automatic differentiation
+ //
+
+ GGML_API void ggml_set_param(
+ struct ggml_context * ctx,
+ struct ggml_tensor * tensor);
+
+
+ GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
+ GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
+
+ // graph allocation in a context
+ GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
+ GGML_API struct ggml_cgraph * ggml_new_graph_custom (struct ggml_context * ctx, size_t size, bool grads);
+ GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
+ GGML_API struct ggml_cgraph ggml_graph_view (struct ggml_cgraph * cgraph, int i0, int i1);
+ GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
+ GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // zero grads
+ GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
+
+ GGML_API size_t ggml_graph_overhead(void);
+ GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
+
+ // ggml_graph_plan() has to be called before ggml_graph_compute()
+ // when plan.work_size > 0, caller must allocate memory for plan.work_data
+ GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
+ GGML_API int ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
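+    // example (illustrative sketch; `ctx` and the output tensor `out` are assumed, thread count is hypothetical):
+    //
+    //   struct ggml_cgraph * gf = ggml_new_graph(ctx);
+    //   ggml_build_forward_expand(gf, out);
+    //
+    //   struct ggml_cplan plan = ggml_graph_plan(gf, /*n_threads =*/ 4);
+    //   if (plan.work_size > 0) {
+    //       plan.work_data = malloc(plan.work_size);
+    //   }
+    //   ggml_graph_compute(gf, &plan);
+    //   free(plan.work_data);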
+
+ // same as ggml_graph_compute() but the work data is allocated as a part of the context
+ // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
+ GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
+
+ GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
+
+ GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
+ GGML_API struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
+
+ // print info and performance information for the graph
+ GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
+
+ // dump the graph into a file using the dot format
+ GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
+
+ // build gradient checkpointing backward graph gb for gf using provided checkpoints
+ // gb_tmp will contain original backward graph with rewritten backward process nodes,
+ // but without the second forward pass nodes.
+ GGML_API void ggml_build_backward_gradient_checkpointing(
+ struct ggml_context * ctx,
+ struct ggml_cgraph * gf,
+ struct ggml_cgraph * gb,
+ struct ggml_cgraph * gb_tmp,
+ struct ggml_tensor * * checkpoints,
+ int n_checkpoints);
+ //
+ // optimization
+ //
+
+ // optimization methods
+ enum ggml_opt_type {
+ GGML_OPT_ADAM,
+ GGML_OPT_LBFGS,
+ };
+
+ // linesearch methods
+ enum ggml_linesearch {
+ GGML_LINESEARCH_DEFAULT = 1,
+
+ GGML_LINESEARCH_BACKTRACKING_ARMIJO = 0,
+ GGML_LINESEARCH_BACKTRACKING_WOLFE = 1,
+ GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,
+ };
+
+ // optimization return values
+ enum ggml_opt_result {
+ GGML_OPT_OK = 0,
+ GGML_OPT_DID_NOT_CONVERGE,
+ GGML_OPT_NO_CONTEXT,
+ GGML_OPT_INVALID_WOLFE,
+ GGML_OPT_FAIL,
+ GGML_OPT_CANCEL,
+
+ GGML_LINESEARCH_FAIL = -128,
+ GGML_LINESEARCH_MINIMUM_STEP,
+ GGML_LINESEARCH_MAXIMUM_STEP,
+ GGML_LINESEARCH_MAXIMUM_ITERATIONS,
+ GGML_LINESEARCH_INVALID_PARAMETERS,
+ };
+
+ typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
+ typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
+
+ // optimization parameters
+ //
+ // see ggml.c (ggml_opt_default_params) for default values
+ //
+ struct ggml_opt_params {
+ enum ggml_opt_type type;
+
+ size_t graph_size;
+
+ int n_threads;
+
+ // delta-based convergence test
+ //
+ // if past == 0 - disabled
+ // if past > 0:
+ // stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
+ //
+ int past;
+ float delta;
+
+ // maximum number of iterations without improvement
+ //
+ // if 0 - disabled
+ // if > 0:
+ // assume convergence if no cost improvement in this number of iterations
+ //
+ int max_no_improvement;
+
+ bool print_forward_graph;
+ bool print_backward_graph;
+
+ int n_gradient_accumulation;
+
+ // ADAM parameters
+ struct {
+ int n_iter;
+
+ float sched; // schedule multiplier (fixed, decay or warmup)
+ float decay; // weight decay for AdamW, use 0.0f to disable
+ int decay_min_ndim; // minimum number of tensor dimension to apply weight decay
+ float alpha; // learning rate
+ float beta1;
+ float beta2;
+ float eps; // epsilon for numerical stability
+ float eps_f; // epsilon for convergence test
+ float eps_g; // epsilon for convergence test
+ float gclip; // gradient clipping
+ } adam;
+
+ // LBFGS parameters
+ struct {
+ int m; // number of corrections to approximate the inv. Hessian
+ int n_iter;
+ int max_linesearch;
+
+ float eps; // convergence tolerance
+ float ftol; // line search tolerance
+ float wolfe;
+ float min_step;
+ float max_step;
+
+ enum ggml_linesearch linesearch;
+ } lbfgs;
+ };
+
+ struct ggml_opt_context {
+ struct ggml_context * ctx;
+ struct ggml_opt_params params;
+
+ int iter;
+ int64_t nx; // number of parameter elements
+
+ bool just_initialized;
+
+ float loss_before;
+ float loss_after;
+
+ struct {
+ struct ggml_tensor * g; // current gradient
+ struct ggml_tensor * m; // first moment
+ struct ggml_tensor * v; // second moment
+ struct ggml_tensor * pf; // past function values
+ float fx_best;
+ float fx_prev;
+ int n_no_improvement;
+ } adam;
+
+ struct {
+ struct ggml_tensor * x; // current parameters
+ struct ggml_tensor * xp; // previous parameters
+ struct ggml_tensor * g; // current gradient
+ struct ggml_tensor * gp; // previous gradient
+ struct ggml_tensor * d; // search direction
+ struct ggml_tensor * pf; // past function values
+ struct ggml_tensor * lmal; // the L-BFGS memory alpha
+ struct ggml_tensor * lmys; // the L-BFGS memory ys
+ struct ggml_tensor * lms; // the L-BFGS memory s
+ struct ggml_tensor * lmy; // the L-BFGS memory y
+ float fx_best;
+ float step;
+ int j;
+ int k;
+ int end;
+ int n_no_improvement;
+ } lbfgs;
+ };
+
+ GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
+
+ // optimize the function defined by the tensor f
+ GGML_API enum ggml_opt_result ggml_opt(
+ struct ggml_context * ctx,
+ struct ggml_opt_params params,
+ struct ggml_tensor * f);
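+    // example (illustrative sketch; `ctx` and a scalar loss tensor `f`, built from tensors
+    // marked with ggml_set_param, are assumed; the iteration count is hypothetical):
+    //
+    //   struct ggml_opt_params params = ggml_opt_default_params(GGML_OPT_ADAM);
+    //   params.adam.n_iter = 100;
+    //   enum ggml_opt_result res = ggml_opt(ctx, params, f);
+    //   // res == GGML_OPT_OK on convergence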
+
+ // initialize optimizer context
+ GGML_API void ggml_opt_init(
+ struct ggml_context * ctx,
+ struct ggml_opt_context * opt,
+ struct ggml_opt_params params,
+ int64_t nx);
+
+ // continue optimizing the function defined by the tensor f
+ GGML_API enum ggml_opt_result ggml_opt_resume(
+ struct ggml_context * ctx,
+ struct ggml_opt_context * opt,
+ struct ggml_tensor * f);
+
+ // continue optimizing the function defined by the tensor f
+ GGML_API enum ggml_opt_result ggml_opt_resume_g(
+ struct ggml_context * ctx,
+ struct ggml_opt_context * opt,
+ struct ggml_tensor * f,
+ struct ggml_cgraph * gf,
+ struct ggml_cgraph * gb,
+ ggml_opt_callback callback,
+ void * callback_data);
+
+ //
+ // quantization
+ //
+
+ // - ggml_quantize_init can be called multiple times with the same type
+ // it will only initialize the quantization tables for the first call or after ggml_quantize_free
+ // automatically called by ggml_quantize_chunk for convenience
+ //
+ // - ggml_quantize_free will free any memory allocated by ggml_quantize_init
+ // call this at the end of the program to avoid memory leaks
+ //
+ // note: these are thread-safe
+ //
+ GGML_API void ggml_quantize_init(enum ggml_type type);
+ GGML_API void ggml_quantize_free(void);
+
+ // TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
+ GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
+ GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
+ GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
+ GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
+ GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
+
+ GGML_API size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
+ GGML_API size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
+ GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
+ GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
+ GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
+
+    // some quantization types cannot be used without an importance matrix
+ GGML_API bool ggml_quantize_requires_imatrix(enum ggml_type type);
+
+ // calls ggml_quantize_init internally (i.e. can allocate memory)
+ GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst,
+ int start, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
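+    // example (illustrative sketch; `src` holds nrows*n_per_row f32 values and `dst` is large
+    // enough to hold the quantized rows; sizes are hypothetical):
+    //
+    //   int64_t hist[16] = {0};
+    //   size_t  size = ggml_quantize_chunk(GGML_TYPE_Q4_0, src, dst,
+    //                                      /*start =*/ 0, nrows, n_per_row, hist, NULL);
+    //
+    //   // at the end of the program:
+    //   ggml_quantize_free();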
+
+ //
+ // gguf
+ //
+
+ enum gguf_type {
+ GGUF_TYPE_UINT8 = 0,
+ GGUF_TYPE_INT8 = 1,
+ GGUF_TYPE_UINT16 = 2,
+ GGUF_TYPE_INT16 = 3,
+ GGUF_TYPE_UINT32 = 4,
+ GGUF_TYPE_INT32 = 5,
+ GGUF_TYPE_FLOAT32 = 6,
+ GGUF_TYPE_BOOL = 7,
+ GGUF_TYPE_STRING = 8,
+ GGUF_TYPE_ARRAY = 9,
+ GGUF_TYPE_UINT64 = 10,
+ GGUF_TYPE_INT64 = 11,
+ GGUF_TYPE_FLOAT64 = 12,
+ GGUF_TYPE_COUNT, // marks the end of the enum
+ };
+
+ struct gguf_context;
+
+ struct gguf_init_params {
+ bool no_alloc;
+
+ // if not NULL, create a ggml_context and allocate the tensor data in it
+ struct ggml_context ** ctx;
+ };
+
+ GGML_API struct gguf_context * gguf_init_empty(void);
+ GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
+ //GGML_API struct gguf_context * gguf_init_from_buffer(..);
+
+ GGML_API void gguf_free(struct gguf_context * ctx);
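+    // example (illustrative sketch for reading metadata only; the file name and key are hypothetical):
+    //
+    //   struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ NULL };
+    //   struct gguf_context * gctx = gguf_init_from_file("model.gguf", params);
+    //   const int key_id = gguf_find_key(gctx, "general.architecture");
+    //   if (key_id >= 0) {
+    //       const char * arch = gguf_get_val_str(gctx, key_id);
+    //   }
+    //   gguf_free(gctx);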
+
+ GGML_API const char * gguf_type_name(enum gguf_type type);
+
+ GGML_API int gguf_get_version (const struct gguf_context * ctx);
+ GGML_API size_t gguf_get_alignment (const struct gguf_context * ctx);
+ GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx);
+ GGML_API void * gguf_get_data (const struct gguf_context * ctx);
+
+ GGML_API int gguf_get_n_kv(const struct gguf_context * ctx);
+ GGML_API int gguf_find_key(const struct gguf_context * ctx, const char * key);
+ GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int key_id);
+
+ GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int key_id);
+ GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id);
+
+ // will abort if the wrong type is used for the key
+ GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int key_id);
+ GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int key_id);
+ GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int key_id);
+ GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int key_id);
+ GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int key_id);
+ GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int key_id);
+ GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int key_id);
+ GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int key_id);
+ GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int key_id);
+ GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int key_id);
+ GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id);
+ GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id);
+ GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id);
+ GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int key_id);
+ GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
+ GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
+
+ GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);
+ GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name);
+ GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
+ GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i);
+ GGML_API enum ggml_type gguf_get_tensor_type (const struct gguf_context * ctx, int i);
+
+ // overrides existing values or adds a new one
+ GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
+ GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val);
+ GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);
+ GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t val);
+ GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
+ GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val);
+ GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val);
+ GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val);
+ GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t val);
+ GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double val);
+ GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val);
+ GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
+ GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
+ GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n);
+
+ // set or add KV pairs from another context
+ GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src);
+
+ // manage tensor info
+ GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
+ GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
+ GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
+
+ // writing gguf files can be done in 2 ways:
+ //
+ // - write the entire gguf_context to a binary file in a single pass:
+ //
+ // gguf_write_to_file(ctx, fname);
+ //
+ // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
+ //
+    //   FILE * f = fopen(fname, "wb");
+    //   fseek(f, gguf_get_meta_size(ctx), SEEK_SET);
+    //   fwrite(..., f); // write the tensor data
+    //   void * data = malloc(gguf_get_meta_size(ctx));
+    //   gguf_get_meta_data(ctx, data);
+    //   fseek(f, 0, SEEK_SET);
+    //   fwrite(data, 1, gguf_get_meta_size(ctx), f);
+    //   free(data);
+    //   fclose(f);
+ //
+
+ // write the entire context to a binary file
+ GGML_API void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta);
+
+ // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
+ GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
+ GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);
+
+ //
+ // system info
+ //
+
+ GGML_API int ggml_cpu_has_avx (void);
+ GGML_API int ggml_cpu_has_avx_vnni (void);
+ GGML_API int ggml_cpu_has_avx2 (void);
+ GGML_API int ggml_cpu_has_avx512 (void);
+ GGML_API int ggml_cpu_has_avx512_vbmi(void);
+ GGML_API int ggml_cpu_has_avx512_vnni(void);
+ GGML_API int ggml_cpu_has_fma (void);
+ GGML_API int ggml_cpu_has_neon (void);
+ GGML_API int ggml_cpu_has_arm_fma (void);
+ GGML_API int ggml_cpu_has_metal (void);
+ GGML_API int ggml_cpu_has_f16c (void);
+ GGML_API int ggml_cpu_has_fp16_va (void);
+ GGML_API int ggml_cpu_has_wasm_simd (void);
+ GGML_API int ggml_cpu_has_blas (void);
+ GGML_API int ggml_cpu_has_cublas (void);
+ GGML_API int ggml_cpu_has_clblast (void);
+ GGML_API int ggml_cpu_has_gpublas (void);
+ GGML_API int ggml_cpu_has_sse3 (void);
+ GGML_API int ggml_cpu_has_ssse3 (void);
+ GGML_API int ggml_cpu_has_vsx (void);
+
+ //
+ // Internal types and functions exposed for tests and benchmarks
+ //
+
+#ifdef __cplusplus
+// restrict not standard in C++
+#define GGML_RESTRICT
+#else
+#define GGML_RESTRICT restrict
+#endif
+ typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+ typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+ typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
+
+ typedef struct {
+ const char * type_name;
+ int blck_size;
+ size_t type_size;
+ bool is_quantized;
+ ggml_to_float_t to_float;
+ ggml_from_float_t from_float;
+ ggml_from_float_t from_float_reference;
+ ggml_vec_dot_t vec_dot;
+ enum ggml_type vec_dot_type;
+ } ggml_type_traits_t;
+
+ GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/llama.xcframework/ios-arm64/llama.framework/Headers/grammar-parser.h b/llama.xcframework/ios-arm64/llama.framework/Headers/grammar-parser.h
new file mode 100644
index 000000000..9037d7272
--- /dev/null
+++ b/llama.xcframework/ios-arm64/llama.framework/Headers/grammar-parser.h
@@ -0,0 +1,29 @@
+// Implements a parser for an extended Backus-Naur form (BNF), producing the
+// binary context-free grammar format specified by llama.h. Supports character
+// ranges, grouping, and repetition operators. As an example, a grammar for
+// arithmetic might look like:
+//
+// root ::= expr
+// expr ::= term ([-+*/] term)*
+// term ::= num | "(" space expr ")" space
+// num ::= [0-9]+ space
+// space ::= [ \t\n]*
+
+#pragma once
+#include "llama.h"
+#include <vector>
+#include <map>