This commit is contained in:
Elton Kola 2024-05-23 17:01:03 -04:00
commit 2712886325
246 changed files with 80363 additions and 49232 deletions

View file

@ -10,14 +10,12 @@ WORKDIR /app
COPY . . COPY . .
RUN mkdir build && \ RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
cd build && \
if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
echo "LLAMA_SYCL_F16 is set" && \ echo "LLAMA_SYCL_F16 is set" && \
export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \ export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
fi && \ fi && \
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \ cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
cmake --build . --config Release --target main cmake --build build --config Release --target main
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime

View file

@ -14,10 +14,8 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
# Build it # Build it
WORKDIR /app WORKDIR /app
COPY . . COPY . .
RUN mkdir build && \ RUN cmake -B build -DLLAMA_VULKAN=1 && \
cd build && \ cmake --build build --config Release --target main
cmake .. -DLLAMA_VULKAN=1 && \
cmake --build . --config Release --target main
# Clean up # Clean up
WORKDIR / WORKDIR /

View file

@ -214,7 +214,6 @@ effectiveStdenv.mkDerivation (
(cmakeBool "LLAMA_CUDA" useCuda) (cmakeBool "LLAMA_CUDA" useCuda)
(cmakeBool "LLAMA_HIPBLAS" useRocm) (cmakeBool "LLAMA_HIPBLAS" useRocm)
(cmakeBool "LLAMA_METAL" useMetalKit) (cmakeBool "LLAMA_METAL" useMetalKit)
(cmakeBool "LLAMA_MPI" useMpi)
(cmakeBool "LLAMA_VULKAN" useVulkan) (cmakeBool "LLAMA_VULKAN" useVulkan)
(cmakeBool "LLAMA_STATIC" enableStatic) (cmakeBool "LLAMA_STATIC" enableStatic)
] ]
@ -227,20 +226,20 @@ effectiveStdenv.mkDerivation (
) )
] ]
++ optionals useRocm [ ++ optionals useRocm [
(cmakeFeature "CMAKE_C_COMPILER" "hipcc") (cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
(cmakeFeature "CMAKE_CXX_COMPILER" "hipcc") (cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets))
# Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM
# in https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt
# and select the line that matches the current nixpkgs version of rocBLAS.
# Should likely use `rocmPackages.clr.gpuTargets`.
"-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
] ]
++ optionals useMetalKit [ ++ optionals useMetalKit [
(lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1") (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
(cmakeBool "LLAMA_METAL_EMBED_LIBRARY" (!precompileMetalShaders)) (cmakeBool "LLAMA_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
]; ];
# Environment variables needed for ROCm
env = optionals useRocm {
ROCM_PATH = "${rocmPackages.clr}";
HIP_DEVICE_LIB_PATH = "${rocmPackages.rocm-device-libs}/amdgcn/bitcode";
};
# TODO(SomeoneSerge): It's better to add proper install targets at the CMake level, # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
# if they haven't been added yet. # if they haven't been added yet.
postInstall = '' postInstall = ''

View file

@ -10,14 +10,12 @@ WORKDIR /app
COPY . . COPY . .
RUN mkdir build && \ RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
cd build && \
if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
echo "LLAMA_SYCL_F16 is set" && \ echo "LLAMA_SYCL_F16 is set" && \
export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \ export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
fi && \ fi && \
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \ cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
cmake --build . --config Release --target server cmake --build build --config Release --target server
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime

View file

@ -18,10 +18,8 @@ RUN apt-get update && \
# Build it # Build it
WORKDIR /app WORKDIR /app
COPY . . COPY . .
RUN mkdir build && \ RUN cmake -B build -DLLAMA_VULKAN=1 -DLLAMA_CURL=1 && \
cd build && \ cmake --build build --config Release --target server
cmake .. -DLLAMA_VULKAN=1 -DLLAMA_CURL=1 && \
cmake --build . --config Release --target server
# Clean up # Clean up
WORKDIR / WORKDIR /

16
.flake8
View file

@ -1,3 +1,17 @@
[flake8] [flake8]
max-line-length = 125 max-line-length = 125
ignore = W503 ignore = E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503
exclude =
# Do not traverse examples
examples,
# Do not include package initializers
__init__.py,
# No need to traverse our git directory
.git,
# There's no value in checking cache directories
__pycache__,
# No need to include the build path
build,
# This contains builds that we don't want to check
dist # This is generated with `python build .` for package releases
# max-complexity = 10

78
.github/labeler.yml vendored Normal file
View file

@ -0,0 +1,78 @@
# https://github.com/actions/labeler
SYCL:
- changed-files:
- any-glob-to-any-file:
- ggml-sycl.h
- ggml-sycl.cpp
- README-sycl.md
Nvidia GPU:
- changed-files:
- any-glob-to-any-file:
- ggml-cuda/**
Vulkan:
- changed-files:
- any-glob-to-any-file:
- ggml_vk_generate_shaders.py
- ggml-vulkan*
documentation:
- changed-files:
- any-glob-to-any-file:
- docs/**
- media/**
testing:
- changed-files:
- any-glob-to-any-file:
- tests/**
build:
- changed-files:
- any-glob-to-any-file:
- cmake/**
- CMakeLists.txt
- CMakePresets.json
- codecov.yml
examples:
- changed-files:
- any-glob-to-any-file: examples/**
devops:
- changed-files:
- any-glob-to-any-file:
- .devops/**
- .github/**
- ci/**
python:
- changed-files:
- any-glob-to-any-file:
- "**/*.py"
- requirements/**
- gguf-py/**
- .flake8
script:
- changed-files:
- any-glob-to-any-file:
- scripts/**
android:
- changed-files:
- any-glob-to-any-file:
- examples/llama.android/**
server:
- changed-files:
- any-glob-to-any-file:
- examples/server/**
ggml:
- changed-files:
- any-glob-to-any-file:
- ggml.c
- ggml.h
- ggml-*.c
- ggml-*.h
- ggml-cuda/**
nix:
- changed-files:
- any-glob-to-any-file:
- "**/*.nix"
- .github/workflows/nix-*.yml
- .devops/nix/nixpkgs-instances.nix
embedding:
- changed-files:
- any-glob-to-any-file: examples/embedding/

View file

@ -52,7 +52,19 @@ jobs:
ftype: q4_0 ftype: q4_0
pr_comment_enabled: "true" pr_comment_enabled: "true"
if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.head_ref == 'master' || github.ref_name == 'master' || github.event.push.ref == 'refs/heads/master' }} if: |
inputs.gpu-series == 'Standard_NC4as_T4_v3'
|| (
github.event_name == 'schedule'
&& github.ref_name == 'master'
&& github.repository_owner == 'ggerganov'
)
|| github.event_name == 'pull_request_target'
|| (
github.event_name == 'push'
&& github.event.ref == 'refs/heads/master'
&& github.repository_owner == 'ggerganov'
)
steps: steps:
- name: Clone - name: Clone
id: checkout id: checkout
@ -96,9 +108,7 @@ jobs:
id: cmake_build id: cmake_build
run: | run: |
set -eux set -eux
mkdir build cmake -B build \
cd build
cmake .. \
-DLLAMA_NATIVE=OFF \ -DLLAMA_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \ -DLLAMA_BUILD_SERVER=ON \
-DLLAMA_CURL=ON \ -DLLAMA_CURL=ON \
@ -109,7 +119,7 @@ jobs:
-DLLAMA_FATAL_WARNINGS=OFF \ -DLLAMA_FATAL_WARNINGS=OFF \
-DLLAMA_ALL_WARNINGS=OFF \ -DLLAMA_ALL_WARNINGS=OFF \
-DCMAKE_BUILD_TYPE=Release; -DCMAKE_BUILD_TYPE=Release;
cmake --build . --config Release -j $(nproc) --target server cmake --build build --config Release -j $(nproc) --target server
- name: Download the dataset - name: Download the dataset
id: download_dataset id: download_dataset

View file

@ -271,49 +271,15 @@ jobs:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip
name: llama-bin-ubuntu-x64.zip name: llama-bin-ubuntu-x64.zip
# ubuntu-latest-cmake-sanitizer: ubuntu-latest-cmake-sanitizer:
# runs-on: ubuntu-latest
#
# continue-on-error: true
#
# strategy:
# matrix:
# sanitizer: [ADDRESS, THREAD, UNDEFINED]
# build_type: [Debug, Release]
#
# steps:
# - name: Clone
# id: checkout
# uses: actions/checkout@v4
#
# - name: Dependencies
# id: depends
# run: |
# sudo apt-get update
# sudo apt-get install build-essential
#
# - name: Build
# id: cmake_build
# run: |
# mkdir build
# cd build
# cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
# cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
#
# - name: Test
# id: cmake_test
# run: |
# cd build
# ctest -L main --verbose --timeout 900
ubuntu-latest-cmake-mpi:
runs-on: ubuntu-latest runs-on: ubuntu-latest
continue-on-error: true continue-on-error: true
strategy: strategy:
matrix: matrix:
mpi_library: [mpich, libopenmpi-dev] sanitizer: [ADDRESS, THREAD, UNDEFINED]
build_type: [Debug, Release]
steps: steps:
- name: Clone - name: Clone
@ -324,14 +290,44 @@ jobs:
id: depends id: depends
run: | run: |
sudo apt-get update sudo apt-get update
sudo apt-get install build-essential ${{ matrix.mpi_library }} sudo apt-get install build-essential
- name: Build - name: Build
id: cmake_build id: cmake_build
run: | run: |
mkdir build mkdir build
cd build cd build
cmake -DLLAMA_MPI=ON .. cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
- name: Test
id: cmake_test
run: |
cd build
ctest -L main --verbose --timeout 900
ubuntu-latest-cmake-rpc:
runs-on: ubuntu-latest
continue-on-error: true
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential
- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake -DLLAMA_RPC=ON ..
cmake --build . --config Release -j $(nproc) cmake --build . --config Release -j $(nproc)
- name: Test - name: Test
@ -362,6 +358,33 @@ jobs:
cmake -DLLAMA_VULKAN=ON .. cmake -DLLAMA_VULKAN=ON ..
cmake --build . --config Release -j $(nproc) cmake --build . --config Release -j $(nproc)
ubuntu-22-cmake-hip:
runs-on: ubuntu-22.04
container: rocm/dev-ubuntu-22.04:6.0.2
steps:
- name: Clone
id: checkout
uses: actions/checkout@v3
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev
- name: Build with native CMake HIP support
id: cmake_build
run: |
cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DLLAMA_HIPBLAS=ON
cmake --build build --config Release -j $(nproc)
- name: Build with legacy HIP support
id: cmake_build_legacy_hip
run: |
cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DLLAMA_HIPBLAS=ON
cmake --build build2 --config Release -j $(nproc)
ubuntu-22-cmake-sycl: ubuntu-22-cmake-sycl:
runs-on: ubuntu-22.04 runs-on: ubuntu-22.04
@ -593,6 +616,63 @@ jobs:
run: | run: |
make swift make swift
windows-msys2:
runs-on: windows-latest
strategy:
fail-fast: false
matrix:
include:
- { sys: UCRT64, env: ucrt-x86_64, build: Release }
- { sys: CLANG64, env: clang-x86_64, build: Release }
steps:
- name: Clone
uses: actions/checkout@v4
- name: Setup ${{ matrix.sys }}
uses: msys2/setup-msys2@v2
with:
update: true
msystem: ${{matrix.sys}}
install: >-
base-devel
mingw-w64-${{matrix.env}}-toolchain
mingw-w64-${{matrix.env}}-cmake
mingw-w64-${{matrix.env}}-openblas
- name: Build using make
shell: msys2 {0}
run: |
make -j $(nproc)
- name: Clean after building using make
shell: msys2 {0}
run: |
make clean
- name: Build using make w/ OpenBLAS
shell: msys2 {0}
run: |
make LLAMA_OPENBLAS=1 -j $(nproc)
- name: Build using CMake
shell: msys2 {0}
run: |
cmake -B build
cmake --build build --config ${{ matrix.build }} -j $(nproc)
- name: Clean after building using CMake
shell: msys2 {0}
run: |
rm -rf build
- name: Build using CMake w/ OpenBLAS
shell: msys2 {0}
run: |
cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
cmake --build build --config ${{ matrix.build }} -j $(nproc)
windows-latest-cmake: windows-latest-cmake:
runs-on: windows-latest runs-on: windows-latest
@ -606,24 +686,28 @@ jobs:
strategy: strategy:
matrix: matrix:
include: include:
- build: 'noavx' - build: 'rpc-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_RPC=ON -DBUILD_SHARED_LIBS=ON'
- build: 'noavx-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON' defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
- build: 'avx2' - build: 'avx2-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
- build: 'avx' - build: 'avx-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON' defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
- build: 'avx512' - build: 'avx512-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON' defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
- build: 'clblast' - build: 'clblast-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"' defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
- build: 'openblas' - build: 'openblas-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
- build: 'kompute' - build: 'kompute-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON' defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
- build: 'vulkan' - build: 'vulkan-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON' defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
- build: 'arm64' - build: 'llvm-arm64'
defines: '-A ARM64 -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
- build: 'msvc-arm64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
steps: steps:
- name: Clone - name: Clone
@ -634,13 +718,13 @@ jobs:
- name: Clone Kompute submodule - name: Clone Kompute submodule
id: clone_kompute id: clone_kompute
if: ${{ matrix.build == 'kompute' }} if: ${{ matrix.build == 'kompute-x64' }}
run: | run: |
git submodule update --init kompute git submodule update --init kompute
- name: Download OpenCL SDK - name: Download OpenCL SDK
id: get_opencl id: get_opencl
if: ${{ matrix.build == 'clblast' }} if: ${{ matrix.build == 'clblast-x64' }}
run: | run: |
curl.exe -o $env:RUNNER_TEMP/opencl.zip -L "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/v${env:OPENCL_VERSION}/OpenCL-SDK-v${env:OPENCL_VERSION}-Win-x64.zip" curl.exe -o $env:RUNNER_TEMP/opencl.zip -L "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/v${env:OPENCL_VERSION}/OpenCL-SDK-v${env:OPENCL_VERSION}-Win-x64.zip"
mkdir $env:RUNNER_TEMP/opencl mkdir $env:RUNNER_TEMP/opencl
@ -648,7 +732,7 @@ jobs:
- name: Download CLBlast - name: Download CLBlast
id: get_clblast id: get_clblast
if: ${{ matrix.build == 'clblast' }} if: ${{ matrix.build == 'clblast-x64' }}
run: | run: |
curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z" curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE" curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
@ -661,7 +745,7 @@ jobs:
- name: Download OpenBLAS - name: Download OpenBLAS
id: get_openblas id: get_openblas
if: ${{ matrix.build == 'openblas' }} if: ${{ matrix.build == 'openblas-x64' }}
run: | run: |
curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip" curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE" curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
@ -674,38 +758,41 @@ jobs:
- name: Install Vulkan SDK - name: Install Vulkan SDK
id: get_vulkan id: get_vulkan
if: ${{ matrix.build == 'kompute' || matrix.build == 'vulkan' }} if: ${{ matrix.build == 'kompute-x64' || matrix.build == 'vulkan-x64' }}
run: | run: |
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe" curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}" Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin" Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
- name: Install Ninja
id: install_ninja
run: |
choco install ninja
- name: Build - name: Build
id: cmake_build id: cmake_build
run: | run: |
mkdir build cmake -S . -B build ${{ matrix.defines }}
cd build cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
cmake .. ${{ matrix.defines }}
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
- name: Add clblast.dll - name: Add clblast.dll
id: add_clblast_dll id: add_clblast_dll
if: ${{ matrix.build == 'clblast' }} if: ${{ matrix.build == 'clblast-x64' }}
run: | run: |
cp $env:RUNNER_TEMP/clblast/lib/clblast.dll ./build/bin/Release cp $env:RUNNER_TEMP/clblast/lib/clblast.dll ./build/bin/Release
cp $env:RUNNER_TEMP/CLBlast.LICENSE.txt ./build/bin/Release/CLBlast-${env:CLBLAST_VERSION}.txt cp $env:RUNNER_TEMP/CLBlast.LICENSE.txt ./build/bin/Release/CLBlast-${env:CLBLAST_VERSION}.txt
- name: Add libopenblas.dll - name: Add libopenblas.dll
id: add_libopenblas_dll id: add_libopenblas_dll
if: ${{ matrix.build == 'openblas' }} if: ${{ matrix.build == 'openblas-x64' }}
run: | run: |
cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
- name: Check AVX512F support - name: Check AVX512F support
id: check_avx512f id: check_avx512f
if: ${{ matrix.build == 'avx512' }} if: ${{ matrix.build == 'avx512-x64' }}
continue-on-error: true continue-on-error: true
run: | run: |
cd build cd build
@ -719,14 +806,14 @@ jobs:
- name: Test - name: Test
id: cmake_test id: cmake_test
# not all machines have native AVX-512 # not all machines have native AVX-512
if: ${{ matrix.build != 'arm64' && matrix.build != 'clblast' && matrix.build != 'kompute' && matrix.build != 'vulkan' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'clblast-x64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
run: | run: |
cd build cd build
ctest -L main -C Release --verbose --timeout 900 ctest -L main -C Release --verbose --timeout 900
- name: Test (Intel SDE) - name: Test (Intel SDE)
id: cmake_test_sde id: cmake_test_sde
if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation if: ${{ matrix.build == 'avx512-x64' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
run: | run: |
curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz" curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
# for some weird reason windows tar doesn't like sde tar.xz # for some weird reason windows tar doesn't like sde tar.xz
@ -754,14 +841,14 @@ jobs:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: | run: |
Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\* 7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
- name: Upload artifacts - name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4 uses: actions/upload-artifact@v4
with: with:
path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
name: llama-bin-win-${{ matrix.build }}-x64.zip name: llama-bin-win-${{ matrix.build }}.zip
windows-latest-cmake-cuda: windows-latest-cmake-cuda:
runs-on: windows-latest runs-on: windows-latest
@ -841,9 +928,9 @@ jobs:
shell: bash shell: bash
env: env:
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/62641e01-1e8d-4ace-91d6-ae03f7f8a71f/w_BaseKit_p_2024.0.0.49563_offline.exe WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7dff44ba-e3af-4448-841c-0d616c8da6e7/w_BaseKit_p_2024.1.0.595_offline.exe
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
steps: steps:
- name: Clone - name: Clone
id: checkout id: checkout
@ -875,6 +962,17 @@ jobs:
id: pack_artifacts id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: | run: |
echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.4.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_win_proxy_loader.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_level_zero.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl7.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
echo "cp oneAPI running time dll files to ./build/bin done"
7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/* 7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
- name: Upload artifacts - name: Upload artifacts
@ -884,6 +982,37 @@ jobs:
path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
name: llama-bin-win-sycl-x64.zip name: llama-bin-win-sycl-x64.zip
windows-latest-cmake-hip:
runs-on: windows-latest
steps:
- name: Clone
id: checkout
uses: actions/checkout@v3
- name: Install
id: depends
run: |
$ErrorActionPreference = "Stop"
write-host "Downloading AMD HIP SDK Installer"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-23.Q4-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
write-host "Installing AMD HIP SDK"
Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
write-host "Completed AMD HIP SDK installation"
- name: Verify ROCm
id: verify
run: |
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
- name: Build
id: cmake_build
run: |
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DLLAMA_HIPBLAS=ON
cmake --build build --config Release
ios-xcode-build: ios-xcode-build:
runs-on: macos-latest runs-on: macos-latest

View file

@ -12,7 +12,7 @@ jobs:
steps: steps:
- uses: actions/stale@v5 - uses: actions/stale@v5
with: with:
exempt-issue-labels: "refactor,help wanted,good first issue,research" exempt-issue-labels: "refactor,help wanted,good first issue,research,bug"
days-before-issue-stale: 30 days-before-issue-stale: 30
days-before-issue-close: 14 days-before-issue-close: 14
stale-issue-label: "stale" stale-issue-label: "stale"

17
.github/workflows/labeler.yml vendored Normal file
View file

@ -0,0 +1,17 @@
name: "Pull Request Labeler"
on:
- pull_request_target
jobs:
labeler:
permissions:
contents: read
pull-requests: write
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
repository: "ggerganov/llama.cpp"
- uses: actions/labeler@v5
with:
configuration-path: '.github/labeler.yml'

View file

@ -20,5 +20,4 @@ jobs:
- name: flake8 Lint - name: flake8 Lint
uses: py-actions/flake8@v2 uses: py-actions/flake8@v2
with: with:
ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503" plugins: "flake8-no-print"
exclude: "examples/*,examples/*/**,*/**/__init__.py"

View file

@ -32,10 +32,8 @@ jobs:
strategy: strategy:
matrix: matrix:
# TODO: temporary disabled due to linux kernel issues sanitizer: [ADDRESS, THREAD, UNDEFINED]
#sanitizer: [ADDRESS, THREAD, UNDEFINED] build_type: [RelWithDebInfo]
sanitizer: [UNDEFINED]
build_type: [Debug]
include: include:
- build_type: Release - build_type: Release
sanitizer: "" sanitizer: ""
@ -94,20 +92,16 @@ jobs:
- name: Build - name: Build
id: cmake_build id: cmake_build
run: | run: |
mkdir build cmake -B build \
cd build
cmake .. \
-DLLAMA_NATIVE=OFF \ -DLLAMA_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \ -DLLAMA_BUILD_SERVER=ON \
-DLLAMA_CURL=ON \ -DLLAMA_CURL=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ; -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
cmake --build . --config ${{ matrix.build_type }} -j $(nproc) --target server cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target server
- name: Tests - name: Tests
id: server_integration_tests id: server_integration_tests
if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
run: | run: |
cd examples/server/tests cd examples/server/tests
PORT=8888 ./tests.sh PORT=8888 ./tests.sh
@ -143,10 +137,8 @@ jobs:
- name: Build - name: Build
id: cmake_build id: cmake_build
run: | run: |
mkdir build cmake -B build -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
cd build cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server
cmake .. -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server
- name: Python setup - name: Python setup
id: setup_python id: setup_python

View file

@ -1,29 +0,0 @@
name: Zig CI
on:
pull_request:
push:
branches:
- master
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
jobs:
build:
strategy:
fail-fast: false
matrix:
runs-on: [ubuntu-latest, macos-latest, windows-latest]
runs-on: ${{ matrix.runs-on }}
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
fetch-depth: 0
- uses: goto-bus-stop/setup-zig@v2
with:
version: 0.11.0
- name: Build Summary
run: zig build --summary all -freference-trace

16
.gitignore vendored
View file

@ -2,6 +2,7 @@
*.a *.a
*.so *.so
*.gguf *.gguf
*.gguf.json
*.bin *.bin
*.exe *.exe
*.dll *.dll
@ -108,3 +109,18 @@ examples/server/*.mjs.hpp
poetry.lock poetry.lock
poetry.toml poetry.toml
nppBackup nppBackup
# Test binaries
/tests/test-grammar-parser
/tests/test-llama-grammar
/tests/test-double-float
/tests/test-grad0
/tests/test-opt
/tests/test-quantize-fns
/tests/test-quantize-perf
/tests/test-sampling
/tests/test-tokenizer-0
/tests/test-tokenizer-1-spm
/tests/test-tokenizer-1-bpe
/tests/test-rope
/tests/test-backend-ops

View file

@ -3,13 +3,14 @@
exclude: prompts/.*.txt exclude: prompts/.*.txt
repos: repos:
- repo: https://github.com/pre-commit/pre-commit-hooks - repo: https://github.com/pre-commit/pre-commit-hooks
rev: v3.2.0 rev: v4.6.0
hooks: hooks:
- id: trailing-whitespace - id: trailing-whitespace
- id: end-of-file-fixer - id: end-of-file-fixer
- id: check-yaml - id: check-yaml
- id: check-added-large-files - id: check-added-large-files
- repo: https://github.com/PyCQA/flake8 - repo: https://github.com/PyCQA/flake8
rev: 6.0.0 rev: 7.0.0
hooks: hooks:
- id: flake8 - id: flake8
additional_dependencies: [flake8-no-print]

View file

@ -77,6 +77,7 @@ option(LLAMA_AVX2 "llama: enable AVX2"
option(LLAMA_AVX512 "llama: enable AVX512" OFF) option(LLAMA_AVX512 "llama: enable AVX512" OFF)
option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF) option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF)
option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF) option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF)
option(LLAMA_AVX512_BF16 "llama: enable AVX512-BF16" OFF)
option(LLAMA_FMA "llama: enable FMA" ${INS_ENB}) option(LLAMA_FMA "llama: enable FMA" ${INS_ENB})
# in MSVC F16C is implied with AVX2/AVX512 # in MSVC F16C is implied with AVX2/AVX512
if (NOT MSVC) if (NOT MSVC)
@ -103,6 +104,8 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for
set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
"llama: max. batch size for using peer access") "llama: max. batch size for using peer access")
option(LLAMA_CUDA_NO_PEER_COPY "llama: do not use peer to peer copies" OFF) option(LLAMA_CUDA_NO_PEER_COPY "llama: do not use peer to peer copies" OFF)
option(LLAMA_CUDA_NO_VMM "llama: do not try to use CUDA VMM" OFF)
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF) option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF) option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF) option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
@ -120,8 +123,7 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
"llama: metal minimum macOS version") "llama: metal minimum macOS version")
set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)") set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)")
option(LLAMA_KOMPUTE "llama: use Kompute" OFF) option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
option(LLAMA_MPI "llama: use MPI" OFF) option(LLAMA_RPC "llama: use RPC" OFF)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
option(LLAMA_SYCL "llama: use SYCL" OFF) option(LLAMA_SYCL "llama: use SYCL" OFF)
option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF) option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
set(LLAMA_SYCL_TARGET "INTEL" CACHE STRING "llama: sycl target device") set(LLAMA_SYCL_TARGET "INTEL" CACHE STRING "llama: sycl target device")
@ -131,6 +133,8 @@ set(LLAMA_SCHED_MAX_COPIES "4" CACHE STRING "llama: max input copies for pipeli
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_SERVER "llama: build server example" ON) option(LLAMA_BUILD_SERVER "llama: build server example" ON)
option(LLAMA_LASX "llama: enable lasx" ON)
option(LLAMA_LSX "llama: enable lsx" ON)
# add perf arguments # add perf arguments
option(LLAMA_PERF "llama: enable perf" OFF) option(LLAMA_PERF "llama: enable perf" OFF)
@ -294,7 +298,7 @@ if (LLAMA_BLAS)
if (LLAMA_STATIC) if (LLAMA_STATIC)
set(BLA_STATIC ON) set(BLA_STATIC ON)
endif() endif()
if ($(CMAKE_VERSION) VERSION_GREATER_EQUAL 3.22) if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22)
set(BLA_SIZEOF_INTEGER 8) set(BLA_SIZEOF_INTEGER 8)
endif() endif()
@ -379,10 +383,6 @@ if (LLAMA_LLAMAFILE)
set(GGML_SOURCES_LLAMAFILE sgemm.cpp) set(GGML_SOURCES_LLAMAFILE sgemm.cpp)
endif() endif()
if (LLAMA_QKK_64)
add_compile_definitions(GGML_QKK_64)
endif()
if (LLAMA_CUBLAS) if (LLAMA_CUBLAS)
message(WARNING "LLAMA_CUBLAS is deprecated and will be removed in the future.\nUse LLAMA_CUDA instead") message(WARNING "LLAMA_CUBLAS is deprecated and will be removed in the future.\nUse LLAMA_CUDA instead")
set(LLAMA_CUDA ON) set(LLAMA_CUDA ON)
@ -403,12 +403,16 @@ if (LLAMA_CUDA)
list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu") list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")
add_compile_definitions(GGML_USE_CUDA) add_compile_definitions(GGML_USE_CUDA)
add_compile_definitions(GGML_CUDA_USE_GRAPHS)
if (LLAMA_CUDA_FORCE_DMMV) if (LLAMA_CUDA_FORCE_DMMV)
add_compile_definitions(GGML_CUDA_FORCE_DMMV) add_compile_definitions(GGML_CUDA_FORCE_DMMV)
endif() endif()
if (LLAMA_CUDA_FORCE_MMQ) if (LLAMA_CUDA_FORCE_MMQ)
add_compile_definitions(GGML_CUDA_FORCE_MMQ) add_compile_definitions(GGML_CUDA_FORCE_MMQ)
endif() endif()
if (LLAMA_CUDA_NO_VMM)
add_compile_definitions(GGML_CUDA_NO_VMM)
endif()
add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
if (DEFINED LLAMA_CUDA_DMMV_Y) if (DEFINED LLAMA_CUDA_DMMV_Y)
@ -425,7 +429,7 @@ if (LLAMA_CUDA)
if (LLAMA_STATIC) if (LLAMA_STATIC)
if (WIN32) if (WIN32)
# As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt) set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
else () else ()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
@ -434,7 +438,11 @@ if (LLAMA_CUDA)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt) set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
endif() endif()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver) if (LLAMA_CUDA_NO_VMM)
# No VMM requested, no need to link directly with the cuda driver lib (libcuda.so)
else()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
endif()
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES) if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
# 52 == lowest CUDA 12 standard # 52 == lowest CUDA 12 standard
@ -455,33 +463,15 @@ if (LLAMA_CUDA)
endif() endif()
endif() endif()
if (LLAMA_MPI) if (LLAMA_RPC)
cmake_minimum_required(VERSION 3.10) add_compile_definitions(GGML_USE_RPC)
find_package(MPI)
if (MPI_C_FOUND)
message(STATUS "MPI found")
set(GGML_HEADERS_MPI ggml-mpi.h) if (WIN32)
set(GGML_SOURCES_MPI ggml-mpi.c) set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ws2_32)
add_compile_definitions(GGML_USE_MPI)
add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
if (NOT MSVC)
add_compile_options(-Wno-cast-qual)
endif() endif()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_C_LIBRARIES}) set(GGML_HEADERS_RPC ggml-rpc.h)
set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS}) set(GGML_SOURCES_RPC ggml-rpc.cpp)
# Even if you're only using the C header, C++ programs may bring in MPI
# C++ functions, so more linkage is needed
if (MPI_CXX_FOUND)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_CXX_LIBRARIES})
endif()
else()
message(WARNING "MPI not found")
endif()
endif() endif()
if (LLAMA_CLBLAST) if (LLAMA_CLBLAST)
@ -510,6 +500,12 @@ if (LLAMA_VULKAN)
add_compile_definitions(GGML_USE_VULKAN) add_compile_definitions(GGML_USE_VULKAN)
# Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build
# Posssibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector
if (MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
add_compile_definitions(_ITERATOR_DEBUG_LEVEL=0)
endif()
if (LLAMA_VULKAN_CHECK_RESULTS) if (LLAMA_VULKAN_CHECK_RESULTS)
add_compile_definitions(GGML_VULKAN_CHECK_RESULTS) add_compile_definitions(GGML_VULKAN_CHECK_RESULTS)
endif() endif()
@ -533,16 +529,37 @@ if (LLAMA_VULKAN)
endif() endif()
if (LLAMA_HIPBLAS) if (LLAMA_HIPBLAS)
list(APPEND CMAKE_PREFIX_PATH /opt/rocm) if ($ENV{ROCM_PATH})
set(ROCM_PATH $ENV{ROCM_PATH})
else()
set(ROCM_PATH /opt/rocm)
endif()
list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH})
if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang") # CMake on Windows doesn't support the HIP language yet
message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang") if(WIN32)
set(CXX_IS_HIPCC TRUE)
else()
string(REGEX MATCH "hipcc(\.bat)?$" CXX_IS_HIPCC "${CMAKE_CXX_COMPILER}")
endif() endif()
if(CXX_IS_HIPCC)
if(LINUX)
if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang") if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++") message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
endif() endif()
message(WARNING "Setting hipcc as the C++ compiler is legacy behavior."
" Prefer setting the HIP compiler directly. See README for details.")
endif()
else()
# Forward AMDGPU_TARGETS to CMAKE_HIP_ARCHITECTURES.
if(AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS})
endif()
cmake_minimum_required(VERSION 3.21)
enable_language(HIP)
endif()
find_package(hip REQUIRED) find_package(hip REQUIRED)
find_package(hipblas REQUIRED) find_package(hipblas REQUIRED)
find_package(rocblas REQUIRED) find_package(rocblas REQUIRED)
@ -576,13 +593,18 @@ if (LLAMA_HIPBLAS)
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
if (CXX_IS_HIPCC)
set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX) set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::device)
else()
set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP)
endif()
if (LLAMA_STATIC) if (LLAMA_STATIC)
message(FATAL_ERROR "Static linking not supported for HIP/ROCm") message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
endif() endif()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::device PUBLIC hip::host roc::rocblas roc::hipblas) set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} PUBLIC hip::host roc::rocblas roc::hipblas)
endif() endif()
if (LLAMA_SYCL) if (LLAMA_SYCL)
@ -985,6 +1007,11 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR
if (GGML_COMPILER_SUPPORT_DOTPROD) if (GGML_COMPILER_SUPPORT_DOTPROD)
add_compile_definitions(__ARM_FEATURE_DOTPROD) add_compile_definitions(__ARM_FEATURE_DOTPROD)
endif () endif ()
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
endif ()
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
@ -1037,6 +1064,10 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>) add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>) add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
endif() endif()
if (LLAMA_AVX512_BF16)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
endif()
elseif (LLAMA_AVX2) elseif (LLAMA_AVX2)
list(APPEND ARCH_FLAGS /arch:AVX2) list(APPEND ARCH_FLAGS /arch:AVX2)
elseif (LLAMA_AVX) elseif (LLAMA_AVX)
@ -1068,6 +1099,9 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
if (LLAMA_AVX512_VNNI) if (LLAMA_AVX512_VNNI)
list(APPEND ARCH_FLAGS -mavx512vnni) list(APPEND ARCH_FLAGS -mavx512vnni)
endif() endif()
if (LLAMA_AVX512_BF16)
list(APPEND ARCH_FLAGS -mavx512bf16)
endif()
endif() endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64") elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
message(STATUS "PowerPC detected") message(STATUS "PowerPC detected")
@ -1077,6 +1111,17 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
list(APPEND ARCH_FLAGS -mcpu=native -mtune=native) list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
#TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be) #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
endif() endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
message(STATUS "loongarch64 detected")
list(APPEND ARCH_FLAGS -march=loongarch64)
if (LLAMA_LASX)
list(APPEND ARCH_FLAGS -mlasx)
endif()
if (LLAMA_LSX)
list(APPEND ARCH_FLAGS -mlsx)
endif()
else() else()
message(STATUS "Unknown architecture") message(STATUS "Unknown architecture")
endif() endif()
@ -1165,7 +1210,7 @@ add_library(ggml OBJECT
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA} ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL} ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL} ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI} ${GGML_SOURCES_RPC} ${GGML_HEADERS_RPC}
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA} ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL} ${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE} ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
@ -1252,7 +1297,7 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h" set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h"
"${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}" "${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
"${GGML_HEADERS_METAL}" "${GGML_HEADERS_MPI}" "${GGML_HEADERS_EXTRA}") "${GGML_HEADERS_METAL}" "${GGML_HEADERS_EXTRA}")
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
install(TARGETS ggml PUBLIC_HEADER) install(TARGETS ggml PUBLIC_HEADER)
@ -1271,17 +1316,6 @@ install(
WORLD_READ WORLD_READ
WORLD_EXECUTE WORLD_EXECUTE
DESTINATION ${CMAKE_INSTALL_BINDIR}) DESTINATION ${CMAKE_INSTALL_BINDIR})
install(
FILES convert-lora-to-ggml.py
PERMISSIONS
OWNER_READ
OWNER_WRITE
OWNER_EXECUTE
GROUP_READ
GROUP_EXECUTE
WORLD_READ
WORLD_EXECUTE
DESTINATION ${CMAKE_INSTALL_BINDIR})
if (LLAMA_METAL) if (LLAMA_METAL)
install( install(
FILES ggml-metal.metal FILES ggml-metal.metal

45
CMakePresets.json Normal file
View file

@ -0,0 +1,45 @@
{
"version": 4,
"configurePresets": [
{
"name": "base",
"hidden": true,
"generator": "Ninja",
"binaryDir": "${sourceDir}/build-${presetName}",
"cacheVariables": {
"CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
"CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
}
},
{ "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
{ "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
{ "name": "static", "hidden": true, "cacheVariables": { "LLAMA_STATIC": "ON" } },
{
"name": "arm64-windows-msvc", "hidden": true,
"architecture": { "value": "arm64", "strategy": "external" },
"toolset": { "value": "host=x86_64", "strategy": "external" },
"cacheVariables": {
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
}
},
{
"name": "arm64-windows-llvm", "hidden": true,
"architecture": { "value": "arm64", "strategy": "external" },
"toolset": { "value": "host=x86_64", "strategy": "external" },
"cacheVariables": {
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake"
}
},
{ "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
{ "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "release" ] },
{ "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "release", "static" ] },
{ "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "release" ] },
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] }
]
}

View file

@ -6,11 +6,23 @@ BUILD_TARGETS = \
# Binaries only useful for tests # Binaries only useful for tests
TEST_TARGETS = \ TEST_TARGETS = \
tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \ tests/test-autorelease \
tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \ tests/test-backend-ops \
tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \ tests/test-double-float \
tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease \ tests/test-grad0 \
tests/test-json-schema-to-grammar tests/test-grammar-integration tests/test-grammar-integration \
tests/test-grammar-parser \
tests/test-json-schema-to-grammar \
tests/test-llama-grammar \
tests/test-model-load-cancel \
tests/test-opt \
tests/test-quantize-fns \
tests/test-quantize-perf \
tests/test-rope \
tests/test-sampling \
tests/test-tokenizer-0 \
tests/test-tokenizer-1-bpe \
tests/test-tokenizer-1-spm
# Code coverage output files # Code coverage output files
COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@ -27,6 +39,17 @@ ifndef UNAME_M
UNAME_M := $(shell uname -m) UNAME_M := $(shell uname -m)
endif endif
# In GNU make default CXX is g++ instead of c++. Let's fix that so that users
# of non-gcc compilers don't have to provide g++ alias or wrapper.
DEFCC := cc
DEFCXX := c++
ifeq ($(origin CC),default)
CC := $(DEFCC)
endif
ifeq ($(origin CXX),default)
CXX := $(DEFCXX)
endif
# Mac OS + Arm can report x86_64 # Mac OS + Arm can report x86_64
# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789 # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
ifeq ($(UNAME_S),Darwin) ifeq ($(UNAME_S),Darwin)
@ -49,11 +72,16 @@ default: $(BUILD_TARGETS)
test: $(TEST_TARGETS) test: $(TEST_TARGETS)
@failures=0; \ @failures=0; \
for test_target in $(TEST_TARGETS); do \ for test_target in $(TEST_TARGETS); do \
if [ "$$test_target" = "tests/test-tokenizer-0-llama" ]; then \ if [ "$$test_target" = "tests/test-tokenizer-0" ]; then \
./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \ ./$$test_target $(CURDIR)/models/ggml-vocab-llama-spm.gguf; \
elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \ ./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \ ./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \ ./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-refact.gguf; \
elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
continue; \ continue; \
elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \ elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
continue; \ continue; \
@ -351,15 +379,16 @@ ifneq ($(filter ppc64le%,$(UNAME_M)),)
CUDA_POWER_ARCH = 1 CUDA_POWER_ARCH = 1
endif endif
ifneq ($(filter loongarch64%,$(UNAME_M)),)
MK_CFLAGS += -mlasx
MK_CXXFLAGS += -mlasx
endif
else else
MK_CFLAGS += -march=rv64gcv -mabi=lp64d MK_CFLAGS += -march=rv64gcv -mabi=lp64d
MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
endif endif
ifdef LLAMA_QKK_64
MK_CPPFLAGS += -DGGML_QKK_64
endif
ifndef LLAMA_NO_ACCELERATE ifndef LLAMA_NO_ACCELERATE
# Mac OS - include Accelerate framework. # Mac OS - include Accelerate framework.
# `-framework Accelerate` works both with Apple Silicon and Mac Intel # `-framework Accelerate` works both with Apple Silicon and Mac Intel
@ -371,13 +400,6 @@ ifndef LLAMA_NO_ACCELERATE
endif endif
endif # LLAMA_NO_ACCELERATE endif # LLAMA_NO_ACCELERATE
ifdef LLAMA_MPI
MK_CPPFLAGS += -DGGML_USE_MPI
MK_CFLAGS += -Wno-cast-qual
MK_CXXFLAGS += -Wno-cast-qual
OBJS += ggml-mpi.o
endif # LLAMA_MPI
ifdef LLAMA_OPENBLAS ifdef LLAMA_OPENBLAS
MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas) MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas) MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
@ -405,7 +427,7 @@ ifdef LLAMA_CUDA
else else
CUDA_PATH ?= /usr/local/cuda CUDA_PATH ?= /usr/local/cuda
endif endif
MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
OBJS += ggml-cuda.o OBJS += ggml-cuda.o
OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu)) OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
@ -532,10 +554,10 @@ endif # LLAMA_VULKAN
ifdef LLAMA_HIPBLAS ifdef LLAMA_HIPBLAS
ifeq ($(wildcard /opt/rocm),) ifeq ($(wildcard /opt/rocm),)
ROCM_PATH ?= /usr ROCM_PATH ?= /usr
GPU_TARGETS ?= $(shell $(shell which amdgpu-arch)) AMDGPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
else else
ROCM_PATH ?= /opt/rocm ROCM_PATH ?= /opt/rocm
GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch) AMDGPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
endif endif
HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc
LLAMA_CUDA_DMMV_X ?= 32 LLAMA_CUDA_DMMV_X ?= 32
@ -547,7 +569,7 @@ ifdef LLAMA_HIP_UMA
endif # LLAMA_HIP_UMA endif # LLAMA_HIP_UMA
MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
HIPFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) HIPFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
@ -601,11 +623,6 @@ ggml-metal-embed.o: ggml-metal.metal ggml-common.h
endif endif
endif # LLAMA_METAL endif # LLAMA_METAL
ifdef LLAMA_MPI
ggml-mpi.o: ggml-mpi.c ggml-mpi.h
$(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_MPI
ifndef LLAMA_NO_LLAMAFILE ifndef LLAMA_NO_LLAMAFILE
sgemm.o: sgemm.cpp sgemm.h ggml.h sgemm.o: sgemm.cpp sgemm.h ggml.h
$(CXX) $(CXXFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
@ -971,11 +988,7 @@ tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS) tests/test-tokenizer-0: tests/test-tokenizer-0.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@ -983,7 +996,7 @@ tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMM
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS) tests/test-tokenizer-1-spm: tests/test-tokenizer-1-spm.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

View file

@ -185,9 +185,8 @@ Upon a successful installation, SYCL is enabled for the available intel devices,
```sh ```sh
git clone https://github.com/oneapi-src/oneMKL git clone https://github.com/oneapi-src/oneMKL
cd oneMKL cd oneMKL
mkdir -p buildWithCublas && cd buildWithCublas cmake -B buildWithCublas -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON -DTARGET_DOMAINS=blas
cmake ../ -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON -DTARGET_DOMAINS=blas cmake --build buildWithCublas --config Release
make
``` ```
@ -227,16 +226,15 @@ Similarly, user targeting Nvidia GPUs should expect at least one SYCL-CUDA devic
source /opt/intel/oneapi/setvars.sh source /opt/intel/oneapi/setvars.sh
# Build LLAMA with MKL BLAS acceleration for intel GPU # Build LLAMA with MKL BLAS acceleration for intel GPU
mkdir -p build && cd build
# Option 1: Use FP32 (recommended for better performance in most cases) # Option 1: Use FP32 (recommended for better performance in most cases)
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
# Option 2: Use FP16 # Option 2: Use FP16
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
# build all binary # build all binary
cmake --build . --config Release -j -v cmake --build build --config Release -j -v
``` ```
#### Nvidia GPU #### Nvidia GPU
@ -248,16 +246,15 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
# Build LLAMA with Nvidia BLAS acceleration through SYCL # Build LLAMA with Nvidia BLAS acceleration through SYCL
mkdir -p build && cd build
# Option 1: Use FP32 (recommended for better performance in most cases) # Option 1: Use FP32 (recommended for better performance in most cases)
cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx cmake -B build -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
# Option 2: Use FP16 # Option 2: Use FP16
cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON cmake -B build -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
# build all binary # build all binary
cmake --build . --config Release -j -v cmake --build build --config Release -j -v
``` ```
@ -412,17 +409,15 @@ b. Download & install mingw-w64 make for Windows provided by w64devkit
On the oneAPI command line window, step into the llama.cpp main directory and run the following: On the oneAPI command line window, step into the llama.cpp main directory and run the following:
``` ```
mkdir -p build
cd build
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force @call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
# Option 1: Use FP32 (recommended for better performance in most cases) # Option 1: Use FP32 (recommended for better performance in most cases)
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
# Option 2: Or FP16 # Option 2: Or FP16
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
make -j cmake --build build --config Release -j
``` ```
Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former instructions: Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former instructions:

251
README.md
View file

@ -2,7 +2,7 @@
![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png) ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) [![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg?branch=master&event=schedule)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml) [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
@ -20,7 +20,9 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
### Hot topics ### Hot topics
- **MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387** - **Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021**
- BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920
- MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387
- Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404 - Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404
- Fix major bug in Metal batched inference https://github.com/ggerganov/llama.cpp/pull/6225 - Fix major bug in Metal batched inference https://github.com/ggerganov/llama.cpp/pull/6225
- Multi-GPU pipeline parallelism support https://github.com/ggerganov/llama.cpp/pull/6017 - Multi-GPU pipeline parallelism support https://github.com/ggerganov/llama.cpp/pull/6017
@ -105,7 +107,6 @@ Typically finetunes of the base models below are supported as well.
- [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila) - [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)
- [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187) - [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187)
- [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim) - [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim)
- [X] [Persimmon 8B](https://github.com/ggerganov/llama.cpp/pull/3410)
- [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417) - [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417)
- [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553) - [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553)
- [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi) - [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi)
@ -126,6 +127,7 @@ Typically finetunes of the base models below are supported as well.
- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion) - [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B) - [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
- [x] [OLMo](https://allenai.org/olmo) - [x] [OLMo](https://allenai.org/olmo)
- [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)
(instructions for supporting more models: [HOWTO-add-model.md](./docs/HOWTO-add-model.md)) (instructions for supporting more models: [HOWTO-add-model.md](./docs/HOWTO-add-model.md))
@ -139,6 +141,7 @@ Typically finetunes of the base models below are supported as well.
- [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL) - [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
- [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM) - [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2) - [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
- [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)
**HTTP server** **HTTP server**
@ -174,6 +177,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [nat/openplayground](https://github.com/nat/openplayground) - [nat/openplayground](https://github.com/nat/openplayground)
- [Faraday](https://faraday.dev/) (proprietary) - [Faraday](https://faraday.dev/) (proprietary)
- [LMStudio](https://lmstudio.ai/) (proprietary) - [LMStudio](https://lmstudio.ai/) (proprietary)
- [Layla](https://play.google.com/store/apps/details?id=com.laylalite) (proprietary)
- [LocalAI](https://github.com/mudler/LocalAI) (MIT) - [LocalAI](https://github.com/mudler/LocalAI) (MIT)
- [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL) - [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile) - [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)
@ -298,7 +302,7 @@ cd llama.cpp
### Build ### Build
In order to build llama.cpp you have three different options. In order to build llama.cpp you have four different options.
- Using `make`: - Using `make`:
- On Linux or MacOS: - On Linux or MacOS:
@ -307,6 +311,8 @@ In order to build llama.cpp you have three different options.
make make
``` ```
**Note**: for `Debug` builds, run `make LLAMA_DEBUG=1`
- On Windows: - On Windows:
1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases). 1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
@ -321,10 +327,24 @@ In order to build llama.cpp you have three different options.
- Using `CMake`: - Using `CMake`:
```bash ```bash
mkdir build cmake -B build
cd build cmake --build build --config Release
cmake .. ```
cmake --build . --config Release
**Note**: for `Debug` builds, there are two cases:
- Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
```bash
cmake -B build -DCMAKE_BUILD_TYPE=Debug
cmake --build build
```
- Multi-config generators (`-G` param set to Visual Studio, XCode...):
```bash
cmake -B build -G "Xcode"
cmake --build build --config Debug
``` ```
- Using `Zig` (version 0.11 or later): - Using `Zig` (version 0.11 or later):
@ -363,45 +383,6 @@ To disable the Metal build at compile time use the `LLAMA_NO_METAL=1` flag or th
When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line
argument. argument.
### MPI Build
MPI lets you distribute the computation over a cluster of machines. Because of the serial nature of LLM prediction, this won't yield any end-to-end speed-ups, but it will let you run larger models than would otherwise fit into RAM on a single machine.
First you will need MPI libraries installed on your system. The two most popular (only?) options are [MPICH](https://www.mpich.org) and [OpenMPI](https://www.open-mpi.org). Either can be installed with a package manager (`apt`, Homebrew, MacPorts, etc).
Next you will need to build the project with `LLAMA_MPI` set to true on all machines; if you're building with `make`, you will also need to specify an MPI-capable compiler (when building with CMake, this is configured automatically):
- Using `make`:
```bash
make CC=mpicc CXX=mpicxx LLAMA_MPI=1
```
- Using `CMake`:
```bash
cmake -S . -B build -DLLAMA_MPI=ON
```
Once the programs are built, download/convert the weights on all of the machines in your cluster. The paths to the weights and programs should be identical on all machines.
Next, ensure password-less SSH access to each machine from the primary host, and create a `hostfile` with a list of the hostnames and their relative "weights" (slots). If you want to use localhost for computation, use its local subnet IP address rather than the loopback address or "localhost".
Here is an example hostfile:
```
192.168.0.1:2
malvolio.local:1
```
The above will distribute the computation across 2 processes on the first host and 1 process on the second host. Each process will use roughly an equal amount of RAM. Try to keep these numbers small, as inter-process (intra-host) communication is expensive.
Finally, you're ready to run a computation using `mpirun`:
```bash
mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
```
### BLAS Build ### BLAS Build
Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS and CLBlast. There are currently several different BLAS implementations available for build and use: Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS and CLBlast. There are currently several different BLAS implementations available for build and use:
@ -438,10 +419,8 @@ Building the program with BLAS support may lead to some performance improvements
- Using `CMake` on Linux: - Using `CMake` on Linux:
```bash ```bash
mkdir build cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
cd build cmake --build build --config Release
cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
cmake --build . --config Release
``` ```
- #### BLIS - #### BLIS
@ -461,11 +440,9 @@ Building the program with BLAS support may lead to some performance improvements
- Using manual oneAPI installation: - Using manual oneAPI installation:
By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps: By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps:
```bash ```bash
mkdir build
cd build
source /opt/intel/oneapi/setvars.sh # You can skip this step if in oneapi-basekit docker image, only required for manual installation source /opt/intel/oneapi/setvars.sh # You can skip this step if in oneapi-basekit docker image, only required for manual installation
cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
cmake --build . --config Release cmake --build build --config Release
``` ```
- Using oneAPI docker image: - Using oneAPI docker image:
@ -486,10 +463,8 @@ Building the program with BLAS support may lead to some performance improvements
- Using `CMake`: - Using `CMake`:
```bash ```bash
mkdir build cmake -B build -DLLAMA_CUDA=ON
cd build cmake --build build --config Release
cmake .. -DLLAMA_CUDA=ON
cmake --build . --config Release
``` ```
The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance: The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
@ -515,12 +490,27 @@ Building the program with BLAS support may lead to some performance improvements
``` ```
- Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU): - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
```bash ```bash
CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ \ HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
cmake -H. -Bbuild -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \ cmake -S . -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
&& cmake --build build --config Release -- -j 16
```
On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON`.
However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
Note that if you get the following error:
```
clang: error: cannot find ROCm device library; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library
```
Try searching for a directory under `HIP_PATH` that contains the file
`oclc_abi_version_400.bc`. Then, add the following to the start of the
command: `HIP_DEVICE_LIB_PATH=<directory-you-just-found>`, so something
like:
```bash
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
cmake -S . -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
&& cmake --build build -- -j 16 && cmake --build build -- -j 16
``` ```
On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON"`.
However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
- Using `make` (example for target gfx1030, build with 16 CPU threads): - Using `make` (example for target gfx1030, build with 16 CPU threads):
```bash ```bash
@ -530,10 +520,8 @@ Building the program with BLAS support may lead to some performance improvements
- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU): - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
```bash ```bash
set PATH=%HIP_PATH%\bin;%PATH% set PATH=%HIP_PATH%\bin;%PATH%
mkdir build cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
cd build cmake --build build
cmake -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release ..
cmake --build .
``` ```
Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors) Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
Find your gpu version string by matching the most significant version information from `rocminfo | grep gfx | head -1 | awk '{print $2}'` with the list of processors, e.g. `gfx1035` maps to `gfx1030`. Find your gpu version string by matching the most significant version information from `rocminfo | grep gfx | head -1 | awk '{print $2}'` with the list of processors, e.g. `gfx1035` maps to `gfx1030`.
@ -563,15 +551,14 @@ Building the program with BLAS support may lead to some performance improvements
```sh ```sh
git clone --recurse-submodules https://github.com/KhronosGroup/OpenCL-SDK.git git clone --recurse-submodules https://github.com/KhronosGroup/OpenCL-SDK.git
mkdir OpenCL-SDK/build cd OpenCL-SDK
cd OpenCL-SDK/build cmake -B build -DBUILD_DOCS=OFF \
cmake .. -DBUILD_DOCS=OFF \
-DBUILD_EXAMPLES=OFF \ -DBUILD_EXAMPLES=OFF \
-DBUILD_TESTING=OFF \ -DBUILD_TESTING=OFF \
-DOPENCL_SDK_BUILD_SAMPLES=OFF \ -DOPENCL_SDK_BUILD_SAMPLES=OFF \
-DOPENCL_SDK_TEST_SAMPLES=OFF -DOPENCL_SDK_TEST_SAMPLES=OFF
cmake --build . --config Release cmake --build build
cmake --install . --prefix /some/path cmake --install build --prefix /some/path
``` ```
</details> </details>
@ -593,23 +580,23 @@ Building the program with BLAS support may lead to some performance improvements
```cmd ```cmd
set OPENCL_SDK_ROOT="C:/OpenCL-SDK-v2023.04.17-Win-x64" set OPENCL_SDK_ROOT="C:/OpenCL-SDK-v2023.04.17-Win-x64"
git clone https://github.com/CNugteren/CLBlast.git git clone https://github.com/CNugteren/CLBlast.git
mkdir CLBlast\build cd CLBlast
cd CLBlast\build cmake -B build -DBUILD_SHARED_LIBS=OFF -DOVERRIDE_MSVC_FLAGS_TO_MT=OFF -DTUNERS=OFF -DOPENCL_ROOT=%OPENCL_SDK_ROOT% -G "Visual Studio 17 2022" -A x64
cmake .. -DBUILD_SHARED_LIBS=OFF -DOVERRIDE_MSVC_FLAGS_TO_MT=OFF -DTUNERS=OFF -DOPENCL_ROOT=%OPENCL_SDK_ROOT% -G "Visual Studio 17 2022" -A x64 cmake --build build --config Release
cmake --build . --config Release cmake --install build --prefix C:/CLBlast
cmake --install . --prefix C:/CLBlast
``` ```
(note: `--config Release` at build time is the default and only relevant for Visual Studio builds - or multi-config Ninja builds)
- <details> - <details>
<summary>Unix:</summary> <summary>Unix:</summary>
```sh ```sh
git clone https://github.com/CNugteren/CLBlast.git git clone https://github.com/CNugteren/CLBlast.git
mkdir CLBlast/build cd CLBlast
cd CLBlast/build cmake -B build -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF
cmake .. -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF cmake --build build --config Release
cmake --build . --config Release cmake --install build --prefix /some/path
cmake --install . --prefix /some/path
``` ```
Where `/some/path` is where the built library will be installed (default is `/usr/local`). Where `/some/path` is where the built library will be installed (default is `/usr/local`).
@ -623,21 +610,17 @@ Building the program with BLAS support may lead to some performance improvements
``` ```
- CMake (Unix): - CMake (Unix):
```sh ```sh
mkdir build cmake -B build -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
cd build cmake --build build --config Release
cmake .. -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
cmake --build . --config Release
``` ```
- CMake (Windows): - CMake (Windows):
```cmd ```cmd
set CL_BLAST_CMAKE_PKG="C:/CLBlast/lib/cmake/CLBlast" set CL_BLAST_CMAKE_PKG="C:/CLBlast/lib/cmake/CLBlast"
git clone https://github.com/ggerganov/llama.cpp git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp cd llama.cpp
mkdir build cmake -B build -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=%CL_BLAST_CMAKE_PKG% -G "Visual Studio 17 2022" -A x64
cd build cmake --build build --config Release
cmake .. -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=%CL_BLAST_CMAKE_PKG% -G "Visual Studio 17 2022" -A x64 cmake --install build --prefix C:/LlamaCPP
cmake --build . --config Release
cmake --install . --prefix C:/LlamaCPP
``` ```
##### Running Llama with CLBlast ##### Running Llama with CLBlast
@ -693,10 +676,8 @@ Building the program with BLAS support may lead to some performance improvements
Then, build llama.cpp using the cmake command below: Then, build llama.cpp using the cmake command below:
```bash ```bash
mkdir -p build cmake -B build -DLLAMA_VULKAN=1
cd build cmake --build build --config Release
cmake .. -DLLAMA_VULKAN=1
cmake --build . --config Release
# Test the output binary (with "-ngl 33" to offload all layers to GPU) # Test the output binary (with "-ngl 33" to offload all layers to GPU)
./bin/main -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4 ./bin/main -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
@ -706,8 +687,13 @@ Building the program with BLAS support may lead to some performance improvements
### Prepare and Quantize ### Prepare and Quantize
> [!NOTE]
> You can use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to quantise your model weights without any setup too. It is synced from `llama.cpp` main every 6 hours.
To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face. To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
Note: `convert.py` does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.
```bash ```bash
# obtain the official LLaMA model weights and place them in ./models # obtain the official LLaMA model weights and place them in ./models
ls ./models ls ./models
@ -929,17 +915,25 @@ If your issue is with model generation quality, then please at least scan the fo
### Android ### Android
#### Build on Android using Termux
[Termux](https://github.com/termux/termux-app#installation) is a method to execute `llama.cpp` on an Android device (no root required).
```
apt update && apt upgrade -y
apt install git make cmake
```
It's recommended to move your model inside the `~/` directory for best performance:
```
cd storage/downloads
mv model.gguf ~/
```
[Get the code](https://github.com/ggerganov/llama.cpp#get-the-code) & [follow the Linux build instructions](https://github.com/ggerganov/llama.cpp#build) to build `llama.cpp`.
#### Building the Project using Android NDK #### Building the Project using Android NDK
You can easily run `llama.cpp` on Android device with [termux](https://termux.dev/). Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
First, install the essential packages for termux:
```
pkg install clang wget git cmake
```
Second, obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake:
You can execute the following commands on your computer to avoid downloading the NDK to your mobile. Of course, you can also do this in Termux.
Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:
``` ```
$ mkdir build-android $ mkdir build-android
$ cd build-android $ cd build-android
@ -947,7 +941,9 @@ $ export NDK=<your_ndk_directory>
$ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod .. $ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
$ make $ make
``` ```
Install [termux](https://termux.dev/) on your device and run `termux-setup-storage` to get access to your SD card.
Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice).
Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission: Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:
(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`) (Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`)
@ -969,53 +965,10 @@ $cd /data/data/com.termux/files/home/bin
$./main -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml $./main -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
``` ```
Here is a demo of an interactive session running on Pixel 5 phone: Here's a demo of an interactive session running on Pixel 5 phone:
https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4 https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
#### Building the Project using Termux (F-Droid)
Termux from F-Droid offers an alternative route to execute the project on an Android device. This method empowers you to construct the project right from within the terminal, negating the requirement for a rooted device or SD Card.
Outlined below are the directives for installing the project using OpenBLAS and CLBlast. This combination is specifically designed to deliver peak performance on recent devices that feature a GPU.
If you opt to utilize OpenBLAS, you'll need to install the corresponding package.
```
apt install libopenblas
```
Subsequently, if you decide to incorporate CLBlast, you'll first need to install the requisite OpenCL packages:
```
apt install ocl-icd opencl-headers opencl-clhpp clinfo
```
In order to compile CLBlast, you'll need to first clone the respective Git repository, which can be found at this URL: https://github.com/CNugteren/CLBlast. Alongside this, clone this repository into your home directory. Once this is done, navigate to the CLBlast folder and execute the commands detailed below:
```
cmake .
make
cp libclblast.so* $PREFIX/lib
cp ./include/clblast.h ../llama.cpp
```
Following the previous steps, navigate to the LlamaCpp directory. To compile it with OpenBLAS and CLBlast, execute the command provided below:
```
cp /data/data/com.termux/files/usr/include/openblas/cblas.h .
cp /data/data/com.termux/files/usr/include/openblas/openblas_config.h .
make LLAMA_CLBLAST=1 //(sometimes you need to run this command twice)
```
Upon completion of the aforementioned steps, you will have successfully compiled the project. To run it using CLBlast, a slight adjustment is required: a command must be issued to direct the operations towards your device's physical GPU, rather than the virtual one. The necessary command is detailed below:
```
GGML_OPENCL_PLATFORM=0
GGML_OPENCL_DEVICE=0
export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH
```
(Note: some Android devices, like the Zenfone 8, need the following command instead - "export LD_LIBRARY_PATH=/system/vendor/lib64:$LD_LIBRARY_PATH". Source: https://www.reddit.com/r/termux/comments/kc3ynp/opencl_working_in_termux_more_in_comments/ )
For easy and swift re-execution, consider documenting this final part in a .sh script file. This will enable you to rerun the process with minimal hassle.
Place your desired model into the `~/llama.cpp/models/` directory and execute the `./main (...)` script.
### Docker ### Docker
#### Prerequisites #### Prerequisites

172
build.zig
View file

@ -1,172 +0,0 @@
// Compatible with Zig Version 0.11.0
const std = @import("std");
const ArrayList = std.ArrayList;
const Compile = std.Build.Step.Compile;
const ConfigHeader = std.Build.Step.ConfigHeader;
const Mode = std.builtin.Mode;
const CrossTarget = std.zig.CrossTarget;
const Maker = struct {
builder: *std.build.Builder,
target: CrossTarget,
optimize: Mode,
enable_lto: bool,
include_dirs: ArrayList([]const u8),
cflags: ArrayList([]const u8),
cxxflags: ArrayList([]const u8),
objs: ArrayList(*Compile),
fn addInclude(m: *Maker, dir: []const u8) !void {
try m.include_dirs.append(dir);
}
fn addProjectInclude(m: *Maker, path: []const []const u8) !void {
try m.addInclude(try m.builder.build_root.join(m.builder.allocator, path));
}
fn addCFlag(m: *Maker, flag: []const u8) !void {
try m.cflags.append(flag);
}
fn addCxxFlag(m: *Maker, flag: []const u8) !void {
try m.cxxflags.append(flag);
}
fn addFlag(m: *Maker, flag: []const u8) !void {
try m.addCFlag(flag);
try m.addCxxFlag(flag);
}
fn init(builder: *std.build.Builder) !Maker {
const target = builder.standardTargetOptions(.{});
const zig_version = @import("builtin").zig_version_string;
const commit_hash = try std.ChildProcess.exec(
.{ .allocator = builder.allocator, .argv = &.{ "git", "rev-parse", "HEAD" } },
);
try std.fs.cwd().writeFile("common/build-info.cpp", builder.fmt(
\\int LLAMA_BUILD_NUMBER = {};
\\char const *LLAMA_COMMIT = "{s}";
\\char const *LLAMA_COMPILER = "Zig {s}";
\\char const *LLAMA_BUILD_TARGET = "{s}";
\\
, .{ 0, commit_hash.stdout[0 .. commit_hash.stdout.len - 1], zig_version, try target.allocDescription(builder.allocator) }));
var m = Maker{
.builder = builder,
.target = target,
.optimize = builder.standardOptimizeOption(.{}),
.enable_lto = false,
.include_dirs = ArrayList([]const u8).init(builder.allocator),
.cflags = ArrayList([]const u8).init(builder.allocator),
.cxxflags = ArrayList([]const u8).init(builder.allocator),
.objs = ArrayList(*Compile).init(builder.allocator),
};
try m.addCFlag("-std=c11");
try m.addCxxFlag("-std=c++11");
try m.addProjectInclude(&.{});
try m.addProjectInclude(&.{"common"});
return m;
}
fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile {
const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
if (o.target.getAbi() != .msvc)
o.defineCMacro("_GNU_SOURCE", null);
if (std.mem.endsWith(u8, src, ".c")) {
o.addCSourceFiles(&.{src}, m.cflags.items);
o.linkLibC();
} else {
o.addCSourceFiles(&.{src}, m.cxxflags.items);
if (o.target.getAbi() == .msvc) {
o.linkLibC(); // need winsdk + crt
} else {
// linkLibCpp already add (libc++ + libunwind + libc)
o.linkLibCpp();
}
}
for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
o.want_lto = m.enable_lto;
return o;
}
fn exe(m: *const Maker, name: []const u8, src: []const u8, deps: []const *Compile) *Compile {
const e = m.builder.addExecutable(.{ .name = name, .target = m.target, .optimize = m.optimize });
e.addCSourceFiles(&.{src}, m.cxxflags.items);
for (deps) |d| e.addObject(d);
for (m.objs.items) |o| e.addObject(o);
for (m.include_dirs.items) |i| e.addIncludePath(.{ .path = i });
// https://github.com/ziglang/zig/issues/15448
if (e.target.getAbi() == .msvc) {
e.linkLibC(); // need winsdk + crt
} else {
// linkLibCpp already add (libc++ + libunwind + libc)
e.linkLibCpp();
}
m.builder.installArtifact(e);
e.want_lto = m.enable_lto;
return e;
}
};
pub fn build(b: *std.build.Builder) !void {
var make = try Maker.init(b);
make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;
const ggml = make.obj("ggml", "ggml.c");
const sgemm = make.obj("sgemm", "sgemm.cpp");
const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
const unicode = make.obj("unicode", "unicode.cpp");
const unicode_data = make.obj("unicode-data", "unicode-data.cpp");
const llama = make.obj("llama", "llama.cpp");
const buildinfo = make.obj("common", "common/build-info.cpp");
const common = make.obj("common", "common/common.cpp");
const console = make.obj("console", "common/console.cpp");
const sampling = make.obj("sampling", "common/sampling.cpp");
const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
const json_schema_to_grammar = make.obj("json-schema-to-grammar", "common/json-schema-to-grammar.cpp");
const train = make.obj("train", "common/train.cpp");
const clip = make.obj("clip", "examples/llava/clip.cpp");
const llava = make.obj("llava", "examples/llava/llava.cpp");
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, console, grammar_parser });
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, grammar_parser, clip, llava });
if (server.target.isWindows()) {
server.linkSystemLibrary("ws2_32");
}
const server_assets = [_][]const u8{ "index.html", "index.js", "completion.js", "json-schema-to-grammar.mjs" };
for (server_assets) |asset| {
const input_path = b.fmt("examples/server/public/{s}", .{asset});
const output_path = b.fmt("examples/server/{s}.hpp", .{asset});
// Portable equivalent of `b.addSystemCommand(&.{ "xxd", "-n", asset, "-i", input_path, output_path }) })`:
const input = try std.fs.cwd().readFileAlloc(b.allocator, input_path, std.math.maxInt(usize));
defer b.allocator.free(input);
var buf = std.ArrayList(u8).init(b.allocator);
defer buf.deinit();
for (input) |byte| {
try std.fmt.format(buf.writer(), "0x{X:0>2}, ", .{byte});
}
var name = try std.mem.replaceOwned(u8, b.allocator, asset, "-", "_");
defer b.allocator.free(name);
std.mem.replaceScalar(u8, name, '.', '_');
try std.fs.cwd().writeFile(output_path, b.fmt(
"unsigned char {s}[] = {{{s}}};\nunsigned int {s}_len = {d};\n",
.{ name, buf.items, name, input.len },
));
std.debug.print("Dumped hex of \"{s}\" ({s}) to {s}\n", .{ input_path, name, output_path });
}
}

321
ci/run.sh
View file

@ -160,9 +160,8 @@ function gg_run_test_scripts_debug {
set -e set -e
# TODO: too slow, run on dedicated node
(cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
#(cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log (cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
set +e set +e
} }
@ -203,12 +202,15 @@ function gg_sum_test_scripts_release {
} }
function gg_get_model { function gg_get_model {
local gguf_3b="$MNT/models/open-llama/3B-v2/ggml-model-f16.gguf" local gguf_0="$MNT/models/pythia/1.4B/ggml-model-f16.gguf"
local gguf_7b="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf" local gguf_1="$MNT/models/pythia/2.8B/ggml-model-f16.gguf"
if [[ -s $gguf_3b ]]; then local gguf_2="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
echo -n "$gguf_3b" if [[ -s $gguf_0 ]]; then
elif [[ -s $gguf_7b ]]; then echo -n "$gguf_0"
echo -n "$gguf_7b" elif [[ -s $gguf_1 ]]; then
echo -n "$gguf_1"
elif [[ -s $gguf_2 ]]; then
echo -n "$gguf_2"
else else
echo >&2 "No model found. Can't run gg_run_ctest_with_model." echo >&2 "No model found. Can't run gg_run_ctest_with_model."
exit 1 exit 1
@ -257,33 +259,169 @@ function gg_sum_ctest_with_model_release {
gg_printf '```\n' gg_printf '```\n'
} }
# open_llama_3b_v2 # open_llama_7b_v2
# requires: GG_BUILD_CUDA
function gg_run_open_llama_3b_v2 { function gg_run_open_llama_7b_v2 {
cd ${SRC} cd ${SRC}
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/config.json
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/tokenizer.model
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/tokenizer_config.json
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/special_tokens_map.json
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/pytorch_model.bin.index.json
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00001-of-00002.bin
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/generation_config.json
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/ unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
path_models="../models-mnt/open-llama/3B-v2" path_models="../models-mnt/open-llama/7B-v2"
path_wiki="../models-mnt/wikitext/wikitext-2-raw" path_wiki="../models-mnt/wikitext/wikitext-2-raw"
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
set -e set -e
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert.py ${path_models} python3 ../convert.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
model_q4_0="${path_models}/ggml-model-q4_0.gguf"
model_q4_1="${path_models}/ggml-model-q4_1.gguf"
model_q5_0="${path_models}/ggml-model-q5_0.gguf"
model_q5_1="${path_models}/ggml-model-q5_1.gguf"
model_q2_k="${path_models}/ggml-model-q2_k.gguf"
model_q3_k="${path_models}/ggml-model-q3_k.gguf"
model_q4_k="${path_models}/ggml-model-q4_k.gguf"
model_q5_k="${path_models}/ggml-model-q5_k.gguf"
model_q6_k="${path_models}/ggml-model-q6_k.gguf"
wiki_test="${path_wiki}/wiki.test.raw"
./bin/quantize ${model_f16} ${model_q8_0} q8_0
./bin/quantize ${model_f16} ${model_q4_0} q4_0
./bin/quantize ${model_f16} ${model_q4_1} q4_1
./bin/quantize ${model_f16} ${model_q5_0} q5_0
./bin/quantize ${model_f16} ${model_q5_1} q5_1
./bin/quantize ${model_f16} ${model_q2_k} q2_k
./bin/quantize ${model_f16} ${model_q3_k} q3_k
./bin/quantize ${model_f16} ${model_q4_k} q4_k
./bin/quantize ${model_f16} ${model_q5_k} q5_k
./bin/quantize ${model_f16} ${model_q6_k} q6_k
(time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl {
qnt="$1"
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
return 20
fi
printf ' - %s @ %s OK\n' "$qnt" "$ppl"
return 0
}
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
set +e
}
function gg_sum_open_llama_7b_v2 {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'OpenLLaMA 7B-v2:\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
}
# pythia_1.4b
function gg_run_pythia_1_4b {
cd ${SRC}
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/config.json
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer.json
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer_config.json
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/special_tokens_map.json
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/resolve/main/pytorch_model.bin
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
path_models="../models-mnt/pythia/1.4B"
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf" model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf" model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@ -337,6 +475,7 @@ function gg_run_open_llama_3b_v2 {
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log (time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl { function check_ppl {
qnt="$1" qnt="$1"
@ -357,7 +496,7 @@ function gg_run_open_llama_3b_v2 {
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log #check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
@ -365,58 +504,16 @@ function gg_run_open_llama_3b_v2 {
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
# lora
function compare_ppl {
qnt="$1"
ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
printf ' - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2"
return 20
fi
printf ' - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
return 0
}
path_lora="../models-mnt/open-llama/3B-v2/lora"
path_shakespeare="../models-mnt/shakespeare"
shakespeare="${path_shakespeare}/shakespeare.txt"
lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_config.json
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_model.bin
gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/shakespeare.txt
python3 ../convert-lora-to-ggml.py ${path_lora}
# f16
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
# q8_0
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
# q8_0 + f16 lora-base
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
set +e set +e
} }
function gg_sum_open_llama_3b_v2 { function gg_sum_pythia_1_4b {
gg_printf '### %s\n\n' "${ci}" gg_printf '### %s\n\n' "${ci}"
gg_printf 'OpenLLaMA 3B-v2:\n' gg_printf 'Pythia 1.4B:\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)" gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)" gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)" gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)" gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)" gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
@ -429,32 +526,24 @@ function gg_sum_open_llama_3b_v2 {
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)" gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)" gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)" gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
} }
# open_llama_7b_v2 # pythia_2_8b
# requires: GG_BUILD_CUDA # requires: GG_BUILD_CUDA
function gg_run_open_llama_7b_v2 { function gg_run_pythia_2_8b {
cd ${SRC} cd ${SRC}
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/config.json gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/config.json
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/tokenizer.model gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer.json
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/tokenizer_config.json gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer_config.json
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/special_tokens_map.json gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/special_tokens_map.json
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/pytorch_model.bin.index.json gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/resolve/main/pytorch_model.bin
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00001-of-00002.bin
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/generation_config.json
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/ unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
path_models="../models-mnt/open-llama/7B-v2" path_models="../models-mnt/pythia/2.8B"
path_wiki="../models-mnt/wikitext/wikitext-2-raw" path_wiki="../models-mnt/wikitext/wikitext-2-raw"
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
@ -464,7 +553,7 @@ function gg_run_open_llama_7b_v2 {
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert.py ${path_models} python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf" model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf" model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@ -517,7 +606,10 @@ function gg_run_open_llama_7b_v2 {
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log (time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log (time ./bin/save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl { function check_ppl {
qnt="$1" qnt="$1"
@ -538,7 +630,7 @@ function gg_run_open_llama_7b_v2 {
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log #check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
@ -546,59 +638,16 @@ function gg_run_open_llama_7b_v2 {
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
# lora
function compare_ppl {
qnt="$1"
ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
printf ' - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2"
return 20
fi
printf ' - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
return 0
}
path_lora="../models-mnt/open-llama/7B-v2/lora"
path_shakespeare="../models-mnt/shakespeare"
shakespeare="${path_shakespeare}/shakespeare.txt"
lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_config.json
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_model.bin
gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/shakespeare.txt
python3 ../convert-lora-to-ggml.py ${path_lora}
# f16
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
# currently not supported by the CUDA backend
# q8_0
#(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
#(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
#compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
# q8_0 + f16 lora-base
#(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
#compare_ppl "q8_0 / f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
set +e set +e
} }
function gg_sum_open_llama_7b_v2 { function gg_sum_pythia_2_8b {
gg_printf '### %s\n\n' "${ci}" gg_printf '### %s\n\n' "${ci}"
gg_printf 'OpenLLaMA 7B-v2:\n' gg_printf 'Pythia 2.8B:\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)" gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)" gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)" gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)" gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)" gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
@ -611,11 +660,6 @@ function gg_sum_open_llama_7b_v2 {
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)" gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)" gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)" gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
#gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
#gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
#gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
} }
# bge-small # bge-small
@ -644,7 +688,7 @@ function gg_run_embd_bge_small {
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert-hf-to-gguf.py ${path_models} python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf" model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf" model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@ -691,14 +735,17 @@ test $ret -eq 0 && gg_run ctest_release
if [ -z ${GG_BUILD_LOW_PERF} ]; then if [ -z ${GG_BUILD_LOW_PERF} ]; then
test $ret -eq 0 && gg_run embd_bge_small test $ret -eq 0 && gg_run embd_bge_small
if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
test $ret -eq 0 && gg_run test_scripts_debug test $ret -eq 0 && gg_run test_scripts_debug
test $ret -eq 0 && gg_run test_scripts_release test $ret -eq 0 && gg_run test_scripts_release
fi
if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
if [ -z ${GG_BUILD_CUDA} ]; then if [ -z ${GG_BUILD_CUDA} ]; then
test $ret -eq 0 && gg_run open_llama_3b_v2 test $ret -eq 0 && gg_run pythia_1_4b
else else
test $ret -eq 0 && gg_run open_llama_7b_v2 test $ret -eq 0 && gg_run pythia_2_8b
#test $ret -eq 0 && gg_run open_llama_7b_v2
fi fi
test $ret -eq 0 && gg_run ctest_with_model_debug test $ret -eq 0 && gg_run ctest_with_model_debug
test $ret -eq 0 && gg_run ctest_with_model_release test $ret -eq 0 && gg_run ctest_with_model_release

View file

@ -0,0 +1,16 @@
set( CMAKE_SYSTEM_NAME Windows )
set( CMAKE_SYSTEM_PROCESSOR arm64 )
set( target arm64-pc-windows-msvc )
set( CMAKE_C_COMPILER clang )
set( CMAKE_CXX_COMPILER clang++ )
set( CMAKE_C_COMPILER_TARGET ${target} )
set( CMAKE_CXX_COMPILER_TARGET ${target} )
set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast" )
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )
set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )

View file

@ -0,0 +1,6 @@
set( CMAKE_SYSTEM_NAME Windows )
set( CMAKE_SYSTEM_PROCESSOR arm64 )
set( target arm64-pc-windows-msvc )
set( CMAKE_C_COMPILER_TARGET ${target} )
set( CMAKE_CXX_COMPILER_TARGET ${target} )

File diff suppressed because it is too large Load diff

View file

@ -31,6 +31,8 @@
fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \ fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
} while(0) } while(0)
#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
// build info // build info
extern int LLAMA_BUILD_NUMBER; extern int LLAMA_BUILD_NUMBER;
extern char const * LLAMA_COMMIT; extern char const * LLAMA_COMMIT;
@ -39,8 +41,12 @@ extern char const *LLAMA_BUILD_TARGET;
struct llama_control_vector_load_info; struct llama_control_vector_load_info;
int get_math_cpu_count(); //
int32_t get_num_physical_cores(); // CPU utils
//
int32_t cpu_get_num_physical_cores();
int32_t cpu_get_num_math();
// //
// CLI argument parsing // CLI argument parsing
@ -49,7 +55,7 @@ int32_t get_num_physical_cores();
struct gpt_params { struct gpt_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
int32_t n_threads = get_math_cpu_count(); int32_t n_threads = cpu_get_num_math();
int32_t n_threads_draft = -1; int32_t n_threads_draft = -1;
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads) int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_threads_batch_draft = -1; int32_t n_threads_batch_draft = -1;
@ -80,6 +86,7 @@ struct gpt_params {
float yarn_beta_slow = 1.0f; // YaRN high correction dim float yarn_beta_slow = 1.0f; // YaRN high correction dim
int32_t yarn_orig_ctx = 0; // YaRN original context length int32_t yarn_orig_ctx = 0; // YaRN original context length
float defrag_thold = -1.0f; // KV cache defragmentation threshold float defrag_thold = -1.0f; // KV cache defragmentation threshold
std::string rpc_servers = ""; // comma separated list of RPC servers
ggml_backend_sched_eval_callback cb_eval = nullptr; ggml_backend_sched_eval_callback cb_eval = nullptr;
void * cb_eval_user_data = nullptr; void * cb_eval_user_data = nullptr;
@ -92,7 +99,7 @@ struct gpt_params {
// // sampling parameters // // sampling parameters
struct llama_sampling_params sparams; struct llama_sampling_params sparams;
std::string model = "models/7B/ggml-model-f16.gguf"; // model path std::string model = ""; // model path
std::string model_draft = ""; // draft model for speculative decoding std::string model_draft = ""; // draft model for speculative decoding
std::string model_alias = "unknown"; // model alias std::string model_alias = "unknown"; // model alias
std::string model_url = ""; // model url to download std::string model_url = ""; // model url to download
@ -133,11 +140,13 @@ struct gpt_params {
bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
bool kl_divergence = false; // compute KL-divergence bool kl_divergence = false; // compute KL divergence
bool random_prompt = false; // do not randomize prompt if none provided bool random_prompt = false; // do not randomize prompt if none provided
bool use_color = false; // use color to distinguish generations and inputs bool use_color = false; // use color to distinguish generations and inputs
bool interactive = false; // interactive mode bool interactive = false; // interactive mode
bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
bool chatml = false; // chatml mode (used for models trained on chatml syntax) bool chatml = false; // chatml mode (used for models trained on chatml syntax)
bool prompt_cache_all = false; // save user input and generations to prompt cache bool prompt_cache_all = false; // save user input and generations to prompt cache
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
@ -148,6 +157,7 @@ struct gpt_params {
bool multiline_input = false; // reverse the usage of `\` bool multiline_input = false; // reverse the usage of `\`
bool simple_io = false; // improves compatibility with subprocesses and limited consoles bool simple_io = false; // improves compatibility with subprocesses and limited consoles
bool cont_batching = true; // insert new sequences for decoding on-the-fly bool cont_batching = true; // insert new sequences for decoding on-the-fly
bool flash_attn = false; // flash attention
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool ignore_eos = false; // ignore generated EOS tokens bool ignore_eos = false; // ignore generated EOS tokens
@ -168,35 +178,39 @@ struct gpt_params {
// multimodal models (see examples/llava) // multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector std::string mmproj = ""; // path to multimodal projector
std::string image = ""; // path to an image file std::vector<std::string> image; // path to image file(s)
}; };
bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides); void gpt_params_handle_model_default(gpt_params & params);
bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params); bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);
bool gpt_params_parse (int argc, char ** argv, gpt_params & params); bool gpt_params_parse (int argc, char ** argv, gpt_params & params);
void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
bool gpt_params_find_arg (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param); bool gpt_params_find_arg (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
std::string get_system_info(const gpt_params & params); std::string gpt_params_get_system_info(const gpt_params & params);
std::string gpt_random_prompt(std::mt19937 & rng);
void process_escapes(std::string& input);
bool validate_file_name(const std::string & filename);
// //
// String utils // String utils
// //
std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
std::vector<std::string> string_split(std::string input, char separator); std::vector<std::string> string_split(std::string input, char separator);
std::string sampler_type_to_name_string(llama_sampler_type sampler_type);
std::string string_strip(const std::string & str);
std::string string_get_sortable_timestamp();
std::string string_random_prompt(std::mt19937 & rng);
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
void string_process_escapes(std::string & input);
//
// Filesystem utils
//
bool fs_validate_filename(const std::string & filename);
bool fs_create_directory_with_parents(const std::string & path);
std::string fs_get_cache_directory();
// //
// Model utils // Model utils
@ -267,29 +281,15 @@ std::string llama_detokenize_bpe(
// defaults to true when model type is SPM, otherwise false. // defaults to true when model type is SPM, otherwise false.
bool llama_should_add_bos_token(const llama_model * model); bool llama_should_add_bos_token(const llama_model * model);
//
// YAML utils
//
bool create_directory_with_parents(const std::string & path);
void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data);
void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data);
void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data);
std::string get_sortable_timestamp();
void dump_non_result_info_yaml(
FILE * stream, const gpt_params & params, const llama_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
// //
// KV cache utils // KV cache utils
// //
// Dump the KV cache view with the number of sequences per cell. // Dump the KV cache view with the number of sequences per cell.
void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80); void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
// Dump the KV cache view showing individual sequences in each cell (long output). // Dump the KV cache view showing individual sequences in each cell (long output).
void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40); void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
// //
// Embedding utils // Embedding utils
@ -323,6 +323,20 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
// //
// Split utils // Split utils
// //
static const char * const LLM_KV_SPLIT_NO = "split.no"; static const char * const LLM_KV_SPLIT_NO = "split.no";
static const char * const LLM_KV_SPLIT_COUNT = "split.count"; static const char * const LLM_KV_SPLIT_COUNT = "split.count";
static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count"; static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
//
// YAML utils
//
void yaml_dump_vector_float (FILE * stream, const char * prop_name, const std::vector<float> & data);
void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std::vector<int> & data);
void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
void yaml_dump_non_result_info(
FILE * stream, const gpt_params & params, const llama_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);

View file

@ -26,7 +26,7 @@ namespace grammar_parser {
static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) { static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size()); uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id)); auto result = state.symbol_ids.emplace(std::string(src, len), next_id);
return result.first->second; return result.first->second;
} }
@ -142,6 +142,9 @@ namespace grammar_parser {
pos++; pos++;
last_sym_start = out_elements.size(); last_sym_start = out_elements.size();
while (*pos != '"') { while (*pos != '"') {
if (!*pos) {
throw std::runtime_error("unexpected end of input");
}
auto char_pair = parse_char(pos); auto char_pair = parse_char(pos);
pos = char_pair.second; pos = char_pair.second;
out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first}); out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
@ -156,6 +159,9 @@ namespace grammar_parser {
} }
last_sym_start = out_elements.size(); last_sym_start = out_elements.size();
while (*pos != ']') { while (*pos != ']') {
if (!*pos) {
throw std::runtime_error("unexpected end of input");
}
auto char_pair = parse_char(pos); auto char_pair = parse_char(pos);
pos = char_pair.second; pos = char_pair.second;
enum llama_gretype type = last_sym_start < out_elements.size() enum llama_gretype type = last_sym_start < out_elements.size()
@ -164,6 +170,9 @@ namespace grammar_parser {
out_elements.push_back({type, char_pair.first}); out_elements.push_back({type, char_pair.first});
if (pos[0] == '-' && pos[1] != ']') { if (pos[0] == '-' && pos[1] != ']') {
if (!pos[1]) {
throw std::runtime_error("unexpected end of input");
}
auto endchar_pair = parse_char(pos + 1); auto endchar_pair = parse_char(pos + 1);
pos = endchar_pair.second; pos = endchar_pair.second;
out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first}); out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});

View file

@ -272,7 +272,7 @@ private:
if (literal.empty()) { if (literal.empty()) {
return false; return false;
} }
ret.push_back(std::make_pair(literal, true)); ret.emplace_back(literal, true);
literal.clear(); literal.clear();
return true; return true;
}; };
@ -298,7 +298,7 @@ private:
while (i < length) { while (i < length) {
char c = sub_pattern[i]; char c = sub_pattern[i];
if (c == '.') { if (c == '.') {
seq.push_back(std::make_pair(get_dot(), false)); seq.emplace_back(get_dot(), false);
i++; i++;
} else if (c == '(') { } else if (c == '(') {
i++; i++;
@ -307,7 +307,7 @@ private:
_warnings.push_back("Unsupported pattern syntax"); _warnings.push_back("Unsupported pattern syntax");
} }
} }
seq.push_back(std::make_pair("(" + to_rule(transform()) + ")", false)); seq.emplace_back("(" + to_rule(transform()) + ")", false);
} else if (c == ')') { } else if (c == ')') {
i++; i++;
if (start > 0 && sub_pattern[start - 1] != '(') { if (start > 0 && sub_pattern[start - 1] != '(') {
@ -331,9 +331,9 @@ private:
} }
square_brackets += ']'; square_brackets += ']';
i++; i++;
seq.push_back(std::make_pair(square_brackets, false)); seq.emplace_back(square_brackets, false);
} else if (c == '|') { } else if (c == '|') {
seq.push_back(std::make_pair("|", false)); seq.emplace_back("|", false);
i++; i++;
} else if (c == '*' || c == '+' || c == '?') { } else if (c == '*' || c == '+' || c == '?') {
seq.back() = std::make_pair(to_rule(seq.back()) + c, false); seq.back() = std::make_pair(to_rule(seq.back()) + c, false);
@ -417,7 +417,7 @@ private:
} }
} }
if (!literal.empty()) { if (!literal.empty()) {
seq.push_back(std::make_pair(literal, true)); seq.emplace_back(literal, true);
} }
} }
} }

View file

@ -1,4 +1,8 @@
#pragma once #pragma once
#include "ggml.h"
// Change JSON_ASSERT from assert() to GGML_ASSERT:
#define JSON_ASSERT GGML_ASSERT
#include "json.hpp" #include "json.hpp"
std::string json_schema_to_grammar(const nlohmann::ordered_json& schema); std::string json_schema_to_grammar(const nlohmann::ordered_json& schema);

View file

@ -211,7 +211,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__ #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
#else #else
#define LOG_FLF_FMT "[%24s:%5ld][%24s] " #define LOG_FLF_FMT "[%24s:%5ld][%24s] "
#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__ #define LOG_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
#endif #endif
#else #else
#define LOG_FLF_FMT "%s" #define LOG_FLF_FMT "%s"
@ -224,7 +224,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__ #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
#else #else
#define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] " #define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] "
#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__ #define LOG_TEE_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
#endif #endif
#else #else
#define LOG_TEE_FLF_FMT "%s" #define LOG_TEE_FLF_FMT "%s"
@ -234,7 +234,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
// INTERNAL, DO NOT USE // INTERNAL, DO NOT USE
// USE LOG() INSTEAD // USE LOG() INSTEAD
// //
#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) #if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
#define LOG_IMPL(str, ...) \ #define LOG_IMPL(str, ...) \
do { \ do { \
if (LOG_TARGET != nullptr) \ if (LOG_TARGET != nullptr) \
@ -257,7 +257,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
// INTERNAL, DO NOT USE // INTERNAL, DO NOT USE
// USE LOG_TEE() INSTEAD // USE LOG_TEE() INSTEAD
// //
#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) #if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
#define LOG_TEE_IMPL(str, ...) \ #define LOG_TEE_IMPL(str, ...) \
do { \ do { \
if (LOG_TARGET != nullptr) \ if (LOG_TARGET != nullptr) \
@ -294,7 +294,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
// Main LOG macro. // Main LOG macro.
// behaves like printf, and supports arguments the exact same way. // behaves like printf, and supports arguments the exact same way.
// //
#ifndef _MSC_VER #if !defined(_MSC_VER) || defined(__clang__)
#define LOG(...) LOG_IMPL(__VA_ARGS__, "") #define LOG(...) LOG_IMPL(__VA_ARGS__, "")
#else #else
#define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "") #define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "")
@ -308,14 +308,14 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
// Secondary target can be changed just like LOG_TARGET // Secondary target can be changed just like LOG_TARGET
// by defining LOG_TEE_TARGET // by defining LOG_TEE_TARGET
// //
#ifndef _MSC_VER #if !defined(_MSC_VER) || defined(__clang__)
#define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "") #define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
#else #else
#define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "") #define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "")
#endif #endif
// LOG macro variants with auto endline. // LOG macro variants with auto endline.
#ifndef _MSC_VER #if !defined(_MSC_VER) || defined(__clang__)
#define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n") #define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
#define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n") #define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
#else #else

View file

@ -35,6 +35,8 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
result->prev.resize(params.n_prev); result->prev.resize(params.n_prev);
result->n_valid = 0;
llama_sampling_set_rng_seed(result, params.seed); llama_sampling_set_rng_seed(result, params.seed);
return result; return result;
@ -64,11 +66,12 @@ void llama_sampling_reset(llama_sampling_context * ctx) {
std::fill(ctx->prev.begin(), ctx->prev.end(), 0); std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
ctx->cur.clear(); ctx->cur.clear();
ctx->n_valid = 0;
} }
void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) { void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {
if (seed == LLAMA_DEFAULT_SEED) { if (seed == LLAMA_DEFAULT_SEED) {
seed = time(NULL); seed = std::random_device{}();
} }
ctx->rng.seed(seed); ctx->rng.seed(seed);
} }
@ -122,7 +125,7 @@ std::string llama_sampling_order_print(const llama_sampling_params & params) {
std::string result = "CFG -> Penalties "; std::string result = "CFG -> Penalties ";
if (params.mirostat == 0) { if (params.mirostat == 0) {
for (auto sampler_type : params.samplers_sequence) { for (auto sampler_type : params.samplers_sequence) {
const auto sampler_type_name = sampler_type_to_name_string(sampler_type); const auto sampler_type_name = llama_sampling_type_to_str(sampler_type);
if (!sampler_type_name.empty()) { if (!sampler_type_name.empty()) {
result += "-> " + sampler_type_name + " "; result += "-> " + sampler_type_name + " ";
} }
@ -134,6 +137,87 @@ std::string llama_sampling_order_print(const llama_sampling_params & params) {
return result; return result;
} }
std::string llama_sampling_type_to_str(llama_sampler_type sampler_type) {
switch (sampler_type) {
case llama_sampler_type::TOP_K: return "top_k";
case llama_sampler_type::TFS_Z: return "tfs_z";
case llama_sampler_type::TYPICAL_P: return "typical_p";
case llama_sampler_type::TOP_P: return "top_p";
case llama_sampler_type::MIN_P: return "min_p";
case llama_sampler_type::TEMPERATURE: return "temperature";
default : return "";
}
}
std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
{"top_k", llama_sampler_type::TOP_K},
{"top_p", llama_sampler_type::TOP_P},
{"typical_p", llama_sampler_type::TYPICAL_P},
{"min_p", llama_sampler_type::MIN_P},
{"tfs_z", llama_sampler_type::TFS_Z},
{"temperature", llama_sampler_type::TEMPERATURE}
};
// since samplers names are written multiple ways
// make it ready for both system names and input names
std::unordered_map<std::string, llama_sampler_type> sampler_alt_name_map {
{"top-k", llama_sampler_type::TOP_K},
{"top-p", llama_sampler_type::TOP_P},
{"nucleus", llama_sampler_type::TOP_P},
{"typical-p", llama_sampler_type::TYPICAL_P},
{"typical", llama_sampler_type::TYPICAL_P},
{"min-p", llama_sampler_type::MIN_P},
{"tfs-z", llama_sampler_type::TFS_Z},
{"tfs", llama_sampler_type::TFS_Z},
{"temp", llama_sampler_type::TEMPERATURE}
};
std::vector<llama_sampler_type> sampler_types;
sampler_types.reserve(names.size());
for (const auto & name : names)
{
auto sampler_item = sampler_canonical_name_map.find(name);
if (sampler_item != sampler_canonical_name_map.end())
{
sampler_types.push_back(sampler_item->second);
}
else
{
if (allow_alt_names)
{
sampler_item = sampler_alt_name_map.find(name);
if (sampler_item != sampler_alt_name_map.end())
{
sampler_types.push_back(sampler_item->second);
}
}
}
}
return sampler_types;
}
std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string) {
std::unordered_map<char, llama_sampler_type> sampler_name_map {
{'k', llama_sampler_type::TOP_K},
{'p', llama_sampler_type::TOP_P},
{'y', llama_sampler_type::TYPICAL_P},
{'m', llama_sampler_type::MIN_P},
{'f', llama_sampler_type::TFS_Z},
{'t', llama_sampler_type::TEMPERATURE}
};
std::vector<llama_sampler_type> sampler_types;
sampler_types.reserve(names_string.size());
for (const auto & c : names_string) {
const auto sampler_item = sampler_name_map.find(c);
if (sampler_item != sampler_name_map.end()) {
sampler_types.push_back(sampler_item->second);
}
}
return sampler_types;
}
// no reasons to expose this function in header // no reasons to expose this function in header
static void sampler_queue( static void sampler_queue(
struct llama_context * ctx_main, struct llama_context * ctx_main,
@ -176,7 +260,7 @@ static llama_token llama_sampling_sample_impl(
struct llama_context * ctx_main, struct llama_context * ctx_main,
struct llama_context * ctx_cfg, struct llama_context * ctx_cfg,
const int idx, const int idx,
bool is_resampling) { // Add a parameter to indicate if we are resampling bool is_resampling) {
const llama_sampling_params & params = ctx_sampling->params; const llama_sampling_params & params = ctx_sampling->params;
const float temp = params.temp; const float temp = params.temp;
@ -185,8 +269,8 @@ static llama_token llama_sampling_sample_impl(
const float mirostat_eta = params.mirostat_eta; const float mirostat_eta = params.mirostat_eta;
std::vector<float> original_logits; std::vector<float> original_logits;
auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, !is_resampling, &original_logits); auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, /* apply_grammar= */ is_resampling, &original_logits);
if (!is_resampling) { if (ctx_sampling->grammar != NULL && !is_resampling) {
GGML_ASSERT(!original_logits.empty()); GGML_ASSERT(!original_logits.empty());
} }
llama_token id = 0; llama_token id = 0;
@ -249,10 +333,12 @@ static llama_token llama_sampling_sample_impl(
// Restore logits from the copy // Restore logits from the copy
std::copy(original_logits.begin(), original_logits.end(), logits); std::copy(original_logits.begin(), original_logits.end(), logits);
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, true); // Pass true for is_resampling return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ true);
} }
} }
ctx_sampling->n_valid = temp == 0.0f ? 0 : cur_p.size;
return id; return id;
} }
@ -280,7 +366,8 @@ static llama_token_data_array llama_sampling_prepare_impl(
// Get a pointer to the logits // Get a pointer to the logits
float * logits = llama_get_logits_ith(ctx_main, idx); float * logits = llama_get_logits_ith(ctx_main, idx);
if (apply_grammar && original_logits != NULL) { if (ctx_sampling->grammar != NULL && !apply_grammar) {
GGML_ASSERT(original_logits != NULL);
// Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this. // Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.
*original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))}; *original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))};
} }
@ -337,7 +424,7 @@ llama_token llama_sampling_sample(
struct llama_context * ctx_cfg, struct llama_context * ctx_cfg,
const int idx) { const int idx) {
// Call the implementation function with is_resampling set to false by default // Call the implementation function with is_resampling set to false by default
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false); return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ false);
} }
llama_token_data_array llama_sampling_prepare( llama_token_data_array llama_sampling_prepare(

View file

@ -81,6 +81,7 @@ struct llama_sampling_context {
// TODO: replace with ring-buffer // TODO: replace with ring-buffer
std::vector<llama_token> prev; std::vector<llama_token> prev;
std::vector<llama_token_data> cur; std::vector<llama_token_data> cur;
size_t n_valid; // Number of correct top tokens with correct probabilities.
std::mt19937 rng; std::mt19937 rng;
}; };
@ -115,6 +116,11 @@ std::string llama_sampling_print(const llama_sampling_params & params);
// Print sampling order into a string // Print sampling order into a string
std::string llama_sampling_order_print(const llama_sampling_params & params); std::string llama_sampling_order_print(const llama_sampling_params & params);
std::string llama_sampling_type_to_str(llama_sampler_type sampler_type);
std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string);
// this is a common sampling function used across the examples for convenience // this is a common sampling function used across the examples for convenience
// it can serve as a starting point for implementing your own sampling function // it can serve as a starting point for implementing your own sampling function
// Note: When using multiple sequences, it is the caller's responsibility to call // Note: When using multiple sequences, it is the caller's responsibility to call

View file

@ -1380,7 +1380,7 @@ bool consume_common_train_arg(
void finish_processing_train_args(struct train_params_common * params) { void finish_processing_train_args(struct train_params_common * params) {
if (params->escape) { if (params->escape) {
process_escapes(params->sample_start); string_process_escapes(params->sample_start);
} }
} }

326
convert-hf-to-gguf-update.py Executable file
View file

@ -0,0 +1,326 @@
#!/usr/bin/env python3
# This script downloads the tokenizer models of the specified models from Huggingface and
# generates the get_vocab_base_pre() function for convert-hf-to-gguf.py
#
# This is necessary in order to analyze the type of pre-tokenizer used by the model and
# provide the necessary information to llama.cpp via the GGUF header in order to implement
# the same pre-tokenizer.
#
# ref: https://github.com/ggerganov/llama.cpp/pull/6920
#
# Instructions:
#
# - Add a new model to the "models" list
# - Run the script with your huggingface token:
#
# python3 convert-hf-to-gguf-update.py <huggingface_token>
#
# - Copy-paste the generated get_vocab_base_pre() function into convert-hf-to-gguf.py
# - Update llama.cpp with the new pre-tokenizer if necessary
#
# TODO: generate tokenizer tests for llama.cpp
#
import logging
import os
import pathlib
import re
import requests
import sys
import json
from hashlib import sha256
from enum import IntEnum, auto
from transformers import AutoTokenizer
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("convert-hf-to-gguf-update")
sess = requests.Session()
class TOKENIZER_TYPE(IntEnum):
SPM = auto()
BPE = auto()
WPM = auto()
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
# will be updated with time - contributions welcome
chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
if len(sys.argv) == 2:
token = sys.argv[1]
if not token.startswith("hf_"):
logger.info("Huggingface token seems invalid")
logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
sys.exit(1)
else:
logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
sys.exit(1)
# TODO: add models here, base models preferred
models = [
{"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
{"name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
{"name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
{"name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
{"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
{"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
{"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
{"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
{"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
{"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
{"name": "stablelm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", },
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
{"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
{"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
{"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
]
def download_file_with_auth(url, token, save_path):
headers = {"Authorization": f"Bearer {token}"}
response = sess.get(url, headers=headers)
response.raise_for_status()
os.makedirs(os.path.dirname(save_path), exist_ok=True)
with open(save_path, 'wb') as f:
f.write(response.content)
logger.info(f"File {save_path} downloaded successfully")
def download_model(model):
name = model["name"]
repo = model["repo"]
tokt = model["tokt"]
os.makedirs(f"models/tokenizers/{name}", exist_ok=True)
files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
if tokt == TOKENIZER_TYPE.SPM:
files.append("tokenizer.model")
for file in files:
save_path = f"models/tokenizers/{name}/{file}"
if os.path.isfile(save_path):
logger.info(f"{name}: File {save_path} already exists - skipping")
continue
download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
for model in models:
try:
download_model(model)
except Exception as e:
logger.error(f"Failed to download model {model['name']}. Error: {e}")
# generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function:
src_ifs = ""
for model in models:
name = model["name"]
tokt = model["tokt"]
if tokt == TOKENIZER_TYPE.SPM:
continue
# Skip if the tokenizer folder does not exist or there are other download issues previously
if not os.path.exists(f"models/tokenizers/{name}"):
logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
continue
# create the tokenizer
try:
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
except OSError as e:
logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
continue # Skip to the next model if the tokenizer can't be loaded
chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()
logger.info(f"model: {name}")
logger.info(f"tokt: {tokt}")
logger.info(f"repo: {model['repo']}")
logger.info(f"chktok: {chktok}")
logger.info(f"chkhsh: {chkhsh}")
# print the "pre_tokenizer" content from the tokenizer.json
with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
cfg = json.load(f)
normalizer = cfg["normalizer"]
logger.info("normalizer: " + json.dumps(normalizer, indent=4))
pre_tokenizer = cfg["pre_tokenizer"]
logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
if "ignore_merges" in cfg["model"]:
logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))
logger.info("")
src_ifs += f" if chkhsh == \"{chkhsh}\":\n"
src_ifs += f" # ref: {model['repo']}\n"
src_ifs += f" res = \"{name}\"\n"
src_func = f"""
def get_vocab_base_pre(self, tokenizer) -> str:
# encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
# is specific for the BPE pre-tokenizer used by the model
# we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
# use in llama.cpp to implement the same pre-tokenizer
chktxt = {repr(chktxt)}
chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()
logger.debug(f"chktok: {{chktok}}")
logger.debug(f"chkhsh: {{chkhsh}}")
res = None
# NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script
# or pull the latest version of the model from Huggingface
# don't edit the hashes manually!
{src_ifs}
if res is None:
logger.warning("\\n")
logger.warning("**************************************************************************************")
logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
logger.warning("** There are 2 possible reasons for this:")
logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet")
logger.warning("** - the pre-tokenization config has changed upstream")
logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
logger.warning("**")
logger.warning(f"** chkhsh: {{chkhsh}}")
logger.warning("**************************************************************************************")
logger.warning("\\n")
raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
logger.debug(f"tokenizer.ggml.pre: {{repr(res)}}")
logger.debug(f"chkhsh: {{chkhsh}}")
return res
"""
convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
convert_py = convert_py_pth.read_text()
convert_py = re.sub(
r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
lambda m: m.group(1) + src_func + m.group(3),
convert_py,
flags=re.DOTALL | re.MULTILINE,
)
convert_py_pth.write_text(convert_py)
logger.info("+++ convert-hf-to-gguf.py was updated")
# generate tests for each tokenizer model
tests = [
"ied 4 ½ months",
"Führer",
"",
" ",
" ",
" ",
"\t",
"\n",
"\n\n",
"\n\n\n",
"\t\n",
"Hello world",
" Hello world",
"Hello World",
" Hello World",
" Hello World!",
"Hello, world!",
" Hello, world!",
" this is 🦙.cpp",
"w048 7tuijk dsdfhu",
"нещо на Български",
"កាន់តែពិសេសអាចខលចេញ",
"🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
"Hello",
" Hello",
" Hello",
" Hello",
" Hello",
" Hello\n Hello",
" (",
"\n =",
"' era",
"Hello, y'all! How are you 😁 ?我想在apple工作1314151天",
"3",
"33",
"333",
"3333",
"33333",
"333333",
"3333333",
"33333333",
"333333333",
# "Cửa Việt", # llama-bpe fails on this
chktxt,
]
# write the tests to ./models/ggml-vocab-{name}.gguf.inp
# the format is:
#
# test0
# __ggml_vocab_test__
# test1
# __ggml_vocab_test__
# ...
#
# with each model, encode all tests and write the results in ./models/ggml-vocab-{name}.gguf.out
# for each test, write the resulting tokens on a separate line
for model in models:
name = model["name"]
tokt = model["tokt"]
# Skip if the tokenizer folder does not exist or there are other download issues previously
if not os.path.exists(f"models/tokenizers/{name}"):
logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
continue
# create the tokenizer
try:
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
except OSError as e:
logger.error(f"Failed to load tokenizer for model {name}. Error: {e}")
continue # Skip this model and continue with the next one in the loop
with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
for text in tests:
f.write(f"{text}")
f.write("\n__ggml_vocab_test__\n")
with open(f"models/ggml-vocab-{name}.gguf.out", "w") as f:
for text in tests:
res = tokenizer.encode(text, add_special_tokens=False)
for r in res:
f.write(f" {r}")
f.write("\n")
logger.info(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")
# generate commands for creating vocab files
logger.info("\nRun the following commands to generate the vocab files for testing:\n")
for model in models:
name = model["name"]
print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100
logger.info("\n")

File diff suppressed because it is too large Load diff

View file

@ -1,6 +1,7 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
from __future__ import annotations from __future__ import annotations
import logging
import argparse import argparse
import os import os
import struct import struct
@ -14,6 +15,8 @@ if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf import gguf
logger = logging.getLogger("ggml-to-gguf")
class GGMLFormat(IntEnum): class GGMLFormat(IntEnum):
GGML = 0 GGML = 0
@ -125,7 +128,6 @@ class Tensor:
self.start_offset = offset self.start_offset = offset
self.len_bytes = n_bytes self.len_bytes = n_bytes
offset += n_bytes offset += n_bytes
# print(n_dims, name_len, dtype, self.dims, self.name, pad)
return offset - orig_offset return offset - orig_offset
@ -175,7 +177,7 @@ class GGMLModel:
offset += self.validate_header(data, offset) offset += self.validate_header(data, offset)
hp = Hyperparameters() hp = Hyperparameters()
offset += hp.load(data, offset) offset += hp.load(data, offset)
print(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}') logger.info(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}')
self.validate_conversion(hp.ftype) self.validate_conversion(hp.ftype)
vocab = Vocab(load_scores = self.file_format > GGMLFormat.GGML) vocab = Vocab(load_scores = self.file_format > GGMLFormat.GGML)
offset += vocab.load(data, offset, hp.n_vocab) offset += vocab.load(data, offset, hp.n_vocab)
@ -215,12 +217,12 @@ class GGMLToGGUF:
if float(hp.n_head) / float(x) == gqa: if float(hp.n_head) / float(x) == gqa:
n_kv_head = x n_kv_head = x
assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param" assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param"
print(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}') logger.info(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
self.n_kv_head = n_kv_head self.n_kv_head = n_kv_head
self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer) self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer)
def save(self): def save(self):
print('* Preparing to save GGUF file') logger.info('* Preparing to save GGUF file')
gguf_writer = gguf.GGUFWriter( gguf_writer = gguf.GGUFWriter(
self.cfg.output, self.cfg.output,
gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
@ -230,11 +232,11 @@ class GGMLToGGUF:
if self.special_vocab is not None: if self.special_vocab is not None:
self.special_vocab.add_to_gguf(gguf_writer) self.special_vocab.add_to_gguf(gguf_writer)
self.add_tensors(gguf_writer) self.add_tensors(gguf_writer)
print(" gguf: write header") logger.info(" gguf: write header")
gguf_writer.write_header_to_file() gguf_writer.write_header_to_file()
print(" gguf: write metadata") logger.info(" gguf: write metadata")
gguf_writer.write_kv_data_to_file() gguf_writer.write_kv_data_to_file()
print(" gguf: write tensors") logger.info(" gguf: write tensors")
gguf_writer.write_tensors_to_file() gguf_writer.write_tensors_to_file()
gguf_writer.close() gguf_writer.close()
@ -250,7 +252,7 @@ class GGMLToGGUF:
name = cfg.name if cfg.name is not None else cfg.input.name name = cfg.name if cfg.name is not None else cfg.input.name
except UnicodeDecodeError: except UnicodeDecodeError:
name = None name = None
print('* Adding model parameters and KV items') logger.info('* Adding model parameters and KV items')
if name is not None: if name is not None:
gguf_writer.add_name(name) gguf_writer.add_name(name)
gguf_writer.add_description(desc) gguf_writer.add_description(desc)
@ -281,12 +283,13 @@ class GGMLToGGUF:
def add_vocab(self, gguf_writer): def add_vocab(self, gguf_writer):
hp = self.model.hyperparameters hp = self.model.hyperparameters
gguf_writer.add_tokenizer_model('llama') gguf_writer.add_tokenizer_model('llama')
gguf_writer.add_tokenizer_pre('default')
tokens = [] tokens = []
scores = [] scores = []
toktypes = [] toktypes = []
if self.vocab_override is not None: if self.vocab_override is not None:
vo = self.vocab_override vo = self.vocab_override
print('* Adding vocab item(s)') logger.info('* Adding vocab item(s)')
for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()): for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
tokens.append(vbytes) tokens.append(vbytes)
scores.append(score) scores.append(score)
@ -298,7 +301,7 @@ class GGMLToGGUF:
if len(toktypes) > 0: if len(toktypes) > 0:
gguf_writer.add_token_types(toktypes) gguf_writer.add_token_types(toktypes)
return return
print(f'* Adding {hp.n_vocab} vocab item(s)') logger.info(f'* Adding {hp.n_vocab} vocab item(s)')
assert len(self.model.vocab.items) >= 3, 'Cannot handle unexpectedly short model vocab' assert len(self.model.vocab.items) >= 3, 'Cannot handle unexpectedly short model vocab'
for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items): for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
tt = 1 # Normal tt = 1 # Normal
@ -333,7 +336,7 @@ class GGMLToGGUF:
def add_tensors(self, gguf_writer): def add_tensors(self, gguf_writer):
tensor_map = self.name_map tensor_map = self.name_map
data = self.data data = self.data
print(f'* Adding {len(self.model.tensors)} tensor(s)') logger.info(f'* Adding {len(self.model.tensors)} tensor(s)')
for tensor in self.model.tensors: for tensor in self.model.tensors:
name = str(tensor.name, 'UTF-8') name = str(tensor.name, 'UTF-8')
mapped_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) mapped_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
@ -343,7 +346,6 @@ class GGMLToGGUF:
temp = tempdims[1] temp = tempdims[1]
tempdims[1] = tempdims[0] tempdims[1] = tempdims[0]
tempdims[0] = temp tempdims[0] = temp
# print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}')
gguf_writer.add_tensor( gguf_writer.add_tensor(
mapped_name, mapped_name,
data[tensor.start_offset:tensor.start_offset + tensor.len_bytes], data[tensor.start_offset:tensor.start_offset + tensor.len_bytes],
@ -400,33 +402,35 @@ def handle_args():
help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir") help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
parser.add_argument("--vocabtype", default="spm,hfft", parser.add_argument("--vocabtype", default="spm,hfft",
help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm,hfft)") help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm,hfft)")
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
return parser.parse_args() return parser.parse_args()
def main(): def main():
cfg = handle_args() cfg = handle_args()
print(f'* Using config: {cfg}') logging.basicConfig(level=logging.DEBUG if cfg.verbose else logging.INFO)
print('\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n') logger.info(f'* Using config: {cfg}')
logger.warning('=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===')
if cfg.model_metadata_dir is None and (cfg.gqa == 1 or cfg.eps == '5.0e-06'): if cfg.model_metadata_dir is None and (cfg.gqa == 1 or cfg.eps == '5.0e-06'):
print('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".') logger.info('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".')
data = np.memmap(cfg.input, mode = 'r') data = np.memmap(cfg.input, mode = 'r')
model = GGMLModel() model = GGMLModel()
print('* Scanning GGML input file') logger.info('* Scanning GGML input file')
offset = model.load(data, 0) # noqa offset = model.load(data, 0) # noqa
print(f'* GGML model hyperparameters: {model.hyperparameters}') logger.info(f'* GGML model hyperparameters: {model.hyperparameters}')
vocab_override = None vocab_override = None
params_override = None params_override = None
special_vocab = None special_vocab = None
if cfg.model_metadata_dir is not None: if cfg.model_metadata_dir is not None:
(params_override, vocab_override, special_vocab) = handle_metadata(cfg, model.hyperparameters) (params_override, vocab_override, special_vocab) = handle_metadata(cfg, model.hyperparameters)
print('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.') logger.info('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
print(f'* Overriding params: {params_override}') logger.info(f'* Overriding params: {params_override}')
print(f'* Overriding vocab: {vocab_override}') logger.info(f'* Overriding vocab: {vocab_override}')
print(f'* Special vocab: {special_vocab}') logger.info(f'* Special vocab: {special_vocab}')
else: else:
print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n') logger.warning('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
if model.file_format == GGMLFormat.GGML: if model.file_format == GGMLFormat.GGML:
print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!') logger.info('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
converter = GGMLToGGUF( converter = GGMLToGGUF(
model, data, cfg, model, data, cfg,
params_override = params_override, params_override = params_override,
@ -434,7 +438,7 @@ def main():
special_vocab = special_vocab special_vocab = special_vocab
) )
converter.save() converter.save()
print(f'* Successful completion. Output saved to: {cfg.output}') logger.info(f'* Successful completion. Output saved to: {cfg.output}')
if __name__ == '__main__': if __name__ == '__main__':

View file

@ -1,148 +0,0 @@
#!/usr/bin/env python3
from __future__ import annotations
import json
import os
import struct
import sys
from pathlib import Path
from typing import Any, BinaryIO, Sequence
import numpy as np
import torch
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf
NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}
def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
fout.write(b"ggla"[::-1]) # magic (ggml lora)
fout.write(struct.pack("i", 1)) # file version
fout.write(struct.pack("i", params["r"]))
# https://opendelta.readthedocs.io/en/latest/modules/deltas.html says that `lora_alpha` is an int
# but some models ship a float value instead
# let's convert to int, but fail if lossless conversion is not possible
assert (
int(params["lora_alpha"]) == params["lora_alpha"]
), "cannot convert float to int losslessly"
fout.write(struct.pack("i", int(params["lora_alpha"])))
def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_type: np.dtype[Any]) -> None:
sname = name.encode("utf-8")
fout.write(
struct.pack(
"iii",
len(shape),
len(sname),
NUMPY_TYPE_TO_FTYPE[data_type.name],
)
)
fout.write(struct.pack("i" * len(shape), *shape[::-1]))
fout.write(sname)
fout.seek((fout.tell() + 31) & -32)
if __name__ == '__main__':
if len(sys.argv) < 2:
print(f"Usage: python {sys.argv[0]} <path> [arch]")
print(
"Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
)
print(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")
sys.exit(1)
input_json = os.path.join(sys.argv[1], "adapter_config.json")
input_model = os.path.join(sys.argv[1], "adapter_model.bin")
output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")
if os.path.exists(input_model):
model = torch.load(input_model, map_location="cpu")
else:
input_model = os.path.join(sys.argv[1], "adapter_model.safetensors")
# lazy import load_file only if lora is in safetensors format.
from safetensors.torch import load_file
model = load_file(input_model, device="cpu")
arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"
if arch_name not in gguf.MODEL_ARCH_NAMES.values():
print(f"Error: unsupported architecture {arch_name}")
sys.exit(1)
arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)]
name_map = gguf.TensorNameMap(arch, 200) # 200 layers ought to be enough for anyone
with open(input_json, "r") as f:
params = json.load(f)
if params["peft_type"] != "LORA":
print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
sys.exit(1)
if params["fan_in_fan_out"] is True:
print("Error: param fan_in_fan_out is not supported")
sys.exit(1)
if params["bias"] is not None and params["bias"] != "none":
print("Error: param bias is not supported")
sys.exit(1)
# TODO: these seem to be layers that have been trained but without lora.
# doesn't seem widely used but eventually should be supported
if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0:
print("Error: param modules_to_save is not supported")
sys.exit(1)
with open(output_path, "wb") as fout:
fout.truncate()
write_file_header(fout, params)
for k, v in model.items():
orig_k = k
if k.endswith(".default.weight"):
k = k.replace(".default.weight", ".weight")
if k in ["llama_proj.weight", "llama_proj.bias"]:
continue
if k.endswith("lora_A.weight"):
if v.dtype != torch.float16 and v.dtype != torch.float32:
v = v.float()
v = v.T
else:
v = v.float()
t = v.detach().numpy()
prefix = "base_model.model."
if k.startswith(prefix):
k = k[len(prefix) :]
lora_suffixes = (".lora_A.weight", ".lora_B.weight")
if k.endswith(lora_suffixes):
suffix = k[-len(lora_suffixes[0]):]
k = k[: -len(lora_suffixes[0])]
else:
print(f"Error: unrecognized tensor name {orig_k}")
sys.exit(1)
tname = name_map.get_name(k)
if tname is None:
print(f"Error: could not map tensor name {orig_k}")
print(" Note: the arch parameter must be specified if the model is not llama")
sys.exit(1)
if suffix == ".lora_A.weight":
tname += ".weight.loraA"
elif suffix == ".lora_B.weight":
tname += ".weight.loraB"
else:
assert False
print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
write_tensor_header(fout, tname, t.shape, t.dtype)
t.tofile(fout)
print(f"Converted {input_json} and {input_model} to {output_path}")

View file

@ -1,138 +0,0 @@
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import os
import sys
from pathlib import Path
from pprint import pprint
import torch
from sentencepiece import SentencePieceProcessor
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf
def _flatten_dict(dct, tensors, prefix=None):
assert isinstance(dct, dict)
for key in dct.keys():
new_prefix = prefix + '.' + key if prefix is not None else key
if isinstance(dct[key], torch.Tensor):
tensors[new_prefix] = dct[key]
elif isinstance(dct[key], dict):
_flatten_dict(dct[key], tensors, new_prefix)
else:
raise ValueError(type(dct[key]))
return None
def _get_sentencepiece_tokenizer_info(dir_model: Path):
tokenizer_path = dir_model / 'adept_vocab.model'
print('gguf: getting sentencepiece tokenizer from', tokenizer_path)
tokenizer = SentencePieceProcessor(str(tokenizer_path))
print('gguf: adding tokens')
tokens: list[bytes] = []
scores: list[float] = []
toktypes: list[int] = []
for i in range(tokenizer.vocab_size()):
text: bytes
score: float
piece = tokenizer.id_to_piece(i)
text = piece.encode("utf-8")
score = tokenizer.get_score(i)
toktype = 1
if tokenizer.is_unknown(i):
toktype = 2
if tokenizer.is_control(i):
toktype = 3
if tokenizer.is_unused(i):
toktype = 5
if tokenizer.is_byte(i):
toktype = 6
tokens.append(text)
scores.append(score)
toktypes.append(toktype)
pass
return tokens, scores, toktypes
def main():
parser = argparse.ArgumentParser(description="Convert a Persimmon model from Adept (e.g. Persimmon 8b chat) to a GGML compatible file")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("--ckpt-path", type=Path, help="path to persimmon checkpoint .pt file")
parser.add_argument("--model-dir", type=Path, help="directory containing model e.g. 8b_chat_model_release")
parser.add_argument("--adept-inference-dir", type=str, help="path to adept-inference code directory")
args = parser.parse_args()
sys.path.append(str(args.adept_inference_dir))
persimmon_model = torch.load(args.ckpt_path)
hparams = persimmon_model['args']
pprint(hparams)
tensors: dict[str, torch.Tensor] = {}
_flatten_dict(persimmon_model['model'], tensors, None)
arch = gguf.MODEL_ARCH.PERSIMMON
gguf_writer = gguf.GGUFWriter(args.outfile, gguf.MODEL_ARCH_NAMES[arch])
block_count = hparams.num_layers
head_count = hparams.num_attention_heads
head_count_kv = head_count
ctx_length = hparams.seq_length
hidden_size = hparams.hidden_size
gguf_writer.add_name('persimmon-8b-chat')
gguf_writer.add_context_length(ctx_length)
gguf_writer.add_embedding_length(hidden_size)
gguf_writer.add_block_count(block_count)
gguf_writer.add_feed_forward_length(hparams.ffn_hidden_size)
# ref: https://github.com/ggerganov/llama.cpp/pull/4889/commits/eea19039fc52ea2dbd1aab45b59ab4e3e29a3443
gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2)
gguf_writer.add_head_count(head_count)
gguf_writer.add_head_count_kv(head_count_kv)
gguf_writer.add_rope_freq_base(hparams.rotary_emb_base)
gguf_writer.add_layer_norm_eps(hparams.layernorm_epsilon)
tokens, scores, toktypes = _get_sentencepiece_tokenizer_info(args.model_dir)
gguf_writer.add_tokenizer_model('llama')
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
gguf_writer.add_bos_token_id(71013)
gguf_writer.add_eos_token_id(71013)
tensor_map = gguf.get_tensor_name_map(arch, block_count)
print(tensor_map)
for name in tensors.keys():
data_torch = tensors[name]
if name.endswith(".self_attention.rotary_emb.inv_freq"):
continue
old_dtype = data_torch.dtype
# TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
data = data_torch.to(torch.float32).squeeze().numpy()
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
if new_name is None:
print("Can not map tensor '" + name + "'")
sys.exit()
n_dims = len(data.shape)
print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.add_tensor(new_name, data)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
print(f"gguf: model successfully exported to '{args.outfile}'")
print("")
if __name__ == '__main__':
main()

View file

@ -1,6 +1,7 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
from __future__ import annotations from __future__ import annotations
import logging
import argparse import argparse
import concurrent.futures import concurrent.futures
import enum import enum
@ -23,7 +24,7 @@ from abc import ABC, abstractmethod
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from dataclasses import dataclass from dataclasses import dataclass
from pathlib import Path from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable, Optional
import numpy as np import numpy as np
from sentencepiece import SentencePieceProcessor from sentencepiece import SentencePieceProcessor
@ -35,6 +36,8 @@ import gguf
if TYPE_CHECKING: if TYPE_CHECKING:
from typing_extensions import Self, TypeAlias from typing_extensions import Self, TypeAlias
logger = logging.getLogger("convert")
if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'): if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
faulthandler.register(signal.SIGUSR1) faulthandler.register(signal.SIGUSR1)
@ -281,6 +284,7 @@ class Params:
n_experts = None n_experts = None
n_experts_used = None n_experts_used = None
f_rope_freq_base = None f_rope_freq_base = None
n_ff = None
# hack to determine LLaMA v1 vs v2 vs CodeLlama # hack to determine LLaMA v1 vs v2 vs CodeLlama
if config.get("moe"): if config.get("moe"):
@ -305,6 +309,8 @@ class Params:
n_experts_used = config["moe"]["num_experts_per_tok"] n_experts_used = config["moe"]["num_experts_per_tok"]
f_rope_freq_base = 1e6 f_rope_freq_base = 1e6
assert n_ff is not None
return Params( return Params(
n_vocab = model["tok_embeddings.weight"].shape[0], n_vocab = model["tok_embeddings.weight"].shape[0],
n_embd = config["dim"], n_embd = config["dim"],
@ -338,10 +344,47 @@ class Params:
return params return params
@dataclass
class Metadata:
name: Optional[str] = None
author: Optional[str] = None
version: Optional[str] = None
url: Optional[str] = None
description: Optional[str] = None
licence: Optional[str] = None
source_url: Optional[str] = None
source_hf_repo: Optional[str] = None
@staticmethod
def load(metadata_path: Path) -> Metadata:
if metadata_path is None or not metadata_path.exists():
return Metadata()
with open(metadata_path, 'r') as file:
data = json.load(file)
# Create a new Metadata instance
metadata = Metadata()
# Assigning values to Metadata attributes if they exist in the JSON file
# This is based on LLM_KV_NAMES mapping in llama.cpp
metadata.name = data.get("general.name")
metadata.author = data.get("general.author")
metadata.version = data.get("general.version")
metadata.url = data.get("general.url")
metadata.description = data.get("general.description")
metadata.license = data.get("general.license")
metadata.source_url = data.get("general.source.url")
metadata.source_hf_repo = data.get("general.source.huggingface.repository")
return metadata
# #
# vocab # vocab
# #
@runtime_checkable @runtime_checkable
class BaseVocab(Protocol): class BaseVocab(Protocol):
tokenizer_model: ClassVar[str] tokenizer_model: ClassVar[str]
@ -459,7 +502,8 @@ class SentencePieceVocab(Vocab):
# not found in alternate location either # not found in alternate location either
raise FileNotFoundError('Cannot find tokenizer.model') raise FileNotFoundError('Cannot find tokenizer.model')
self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer)) self.sentencepiece_tokenizer = SentencePieceProcessor()
self.sentencepiece_tokenizer.LoadFromFile(str(fname_tokenizer))
vocab_size = self.sentencepiece_tokenizer.vocab_size() vocab_size = self.sentencepiece_tokenizer.vocab_size()
new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size} new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
@ -479,23 +523,23 @@ class SentencePieceVocab(Vocab):
def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
tokenizer = self.sentencepiece_tokenizer tokenizer = self.sentencepiece_tokenizer
for i in range(tokenizer.vocab_size()): for i in range(tokenizer.vocab_size()):
piece = tokenizer.id_to_piece(i) piece = tokenizer.IdToPiece(i)
text = piece.encode("utf-8") text = piece.encode("utf-8")
score: float = tokenizer.get_score(i) score: float = tokenizer.GetScore(i)
toktype = gguf.TokenType.NORMAL toktype = gguf.TokenType.NORMAL
if tokenizer.is_unknown(i): if tokenizer.IsUnknown(i):
toktype = gguf.TokenType.UNKNOWN toktype = gguf.TokenType.UNKNOWN
if tokenizer.is_control(i): if tokenizer.IsControl(i):
toktype = gguf.TokenType.CONTROL toktype = gguf.TokenType.CONTROL
# NOTE: I think added_tokens are user defined. # NOTE: I think added_tokens are user defined.
# ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
# if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
if tokenizer.is_unused(i): if tokenizer.IsUnused(i):
toktype = gguf.TokenType.UNUSED toktype = gguf.TokenType.UNUSED
if tokenizer.is_byte(i): if tokenizer.IsByte(i):
toktype = gguf.TokenType.BYTE toktype = gguf.TokenType.BYTE
yield text, score, toktype yield text, score, toktype
@ -643,7 +687,6 @@ class LlamaHfVocab(Vocab):
def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray: def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
# print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
if n_head_kv is not None and n_head != n_head_kv: if n_head_kv is not None and n_head != n_head_kv:
n_head = n_head_kv n_head = n_head_kv
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
@ -904,7 +947,7 @@ class LazyUnpickler(pickle.Unpickler):
def rebuild_from_type_v2(func, new_type, args, state): def rebuild_from_type_v2(func, new_type, args, state):
return func(*args) return func(*args)
CLASSES = { CLASSES: dict[tuple[str, str], type[LazyTensor] | LazyStorageKind] = {
# getattr used here as a workaround for mypy not being smart enough to determine # getattr used here as a workaround for mypy not being smart enough to determine
# the staticmethods have a __func__ attribute. # the staticmethods have a __func__ attribute.
('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'), ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
@ -1033,12 +1076,12 @@ def check_vocab_size(params: Params, vocab: BaseVocab, pad_vocab: bool = False)
# Check for a vocab size mismatch # Check for a vocab size mismatch
if params.n_vocab == vocab.vocab_size: if params.n_vocab == vocab.vocab_size:
print("Ignoring added_tokens.json since model matches vocab size without it.") logger.warning("Ignoring added_tokens.json since model matches vocab size without it.")
return return
if pad_vocab and params.n_vocab > vocab.vocab_size: if pad_vocab and params.n_vocab > vocab.vocab_size:
pad_count = params.n_vocab - vocab.vocab_size pad_count = params.n_vocab - vocab.vocab_size
print( logger.debug(
f"Padding vocab with {pad_count} token(s) - <dummy00001> through <dummy{pad_count:05}>" f"Padding vocab with {pad_count} token(s) - <dummy00001> through <dummy{pad_count:05}>"
) )
for i in range(1, pad_count + 1): for i in range(1, pad_count + 1):
@ -1060,16 +1103,37 @@ class OutputFile:
def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE): def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess) self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
def add_meta_arch(self, params: Params) -> None: def add_meta_model(self, params: Params, metadata: Metadata) -> None:
# Metadata About The Model And Its Provenence
name = "LLaMA" name = "LLaMA"
if metadata is not None and metadata.name is not None:
# TODO: better logic to determine model name name = metadata.name
if params.n_ctx == 4096:
name = "LLaMA v2"
elif params.path_model is not None: elif params.path_model is not None:
name = str(params.path_model.parent).split('/')[-1] name = params.path_model.name
elif params.n_ctx == 4096:
# Heuristic detection of LLaMA v2 model
name = "LLaMA v2"
self.gguf.add_name(name) self.gguf.add_name(name)
if metadata is not None:
if metadata.author is not None:
self.gguf.add_author(metadata.author)
if metadata.version is not None:
self.gguf.add_version(metadata.version)
if metadata.url is not None:
self.gguf.add_url(metadata.url)
if metadata.description is not None:
self.gguf.add_description(metadata.description)
if metadata.licence is not None:
self.gguf.add_licence(metadata.licence)
if metadata.source_url is not None:
self.gguf.add_source_url(metadata.source_url)
if metadata.source_hf_repo is not None:
self.gguf.add_source_hf_repo(metadata.source_hf_repo)
def add_meta_arch(self, params: Params) -> None:
# Metadata About The Neural Architecture Itself
self.gguf.add_vocab_size(params.n_vocab) self.gguf.add_vocab_size(params.n_vocab)
self.gguf.add_context_length(params.n_ctx) self.gguf.add_context_length(params.n_ctx)
self.gguf.add_embedding_length(params.n_embd) self.gguf.add_embedding_length(params.n_embd)
@ -1166,7 +1230,7 @@ class OutputFile:
elapsed = time.time() - start elapsed = time.time() - start
size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape) size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
padi = len(str(len(model))) padi = len(str(len(model)))
print( logger.info(
f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}" f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
) )
self.gguf.write_tensor_data(ndarray) self.gguf.write_tensor_data(ndarray)
@ -1177,13 +1241,14 @@ class OutputFile:
@staticmethod @staticmethod
def write_vocab_only( def write_vocab_only(
fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: Metadata = None,
) -> None: ) -> None:
check_vocab_size(params, vocab, pad_vocab=pad_vocab) check_vocab_size(params, vocab, pad_vocab=pad_vocab)
of = OutputFile(fname_out, endianess=endianess) of = OutputFile(fname_out, endianess=endianess)
# meta data # meta data
of.add_meta_model(params, metadata)
of.add_meta_arch(params) of.add_meta_arch(params)
of.add_meta_vocab(vocab) of.add_meta_vocab(vocab)
of.add_meta_special_vocab(svocab) of.add_meta_special_vocab(svocab)
@ -1210,12 +1275,14 @@ class OutputFile:
fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab, fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
pad_vocab: bool = False, pad_vocab: bool = False,
metadata: Metadata = None,
) -> None: ) -> None:
check_vocab_size(params, vocab, pad_vocab=pad_vocab) check_vocab_size(params, vocab, pad_vocab=pad_vocab)
of = OutputFile(fname_out, endianess=endianess) of = OutputFile(fname_out, endianess=endianess)
# meta data # meta data
of.add_meta_model(params, metadata)
of.add_meta_arch(params) of.add_meta_arch(params)
if isinstance(vocab, Vocab): if isinstance(vocab, Vocab):
of.add_meta_vocab(vocab) of.add_meta_vocab(vocab)
@ -1251,6 +1318,37 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT
raise ValueError(f"Unexpected combination of types: {name_to_type}") raise ValueError(f"Unexpected combination of types: {name_to_type}")
def model_parameter_count(model: LazyModel) -> int:
total_model_parameters = 0
for i, (name, lazy_tensor) in enumerate(model.items()):
sum_weights_in_tensor = 1
for dim in lazy_tensor.shape:
sum_weights_in_tensor *= dim
total_model_parameters += sum_weights_in_tensor
return total_model_parameters
def model_parameter_count_rounded_notation(model_params_count: int) -> str:
if model_params_count > 1e12 :
# Trillions Of Parameters
scaled_model_params = model_params_count * 1e-12
scale_suffix = "T"
elif model_params_count > 1e9 :
# Billions Of Parameters
scaled_model_params = model_params_count * 1e-9
scale_suffix = "B"
elif model_params_count > 1e6 :
# Millions Of Parameters
scaled_model_params = model_params_count * 1e-6
scale_suffix = "M"
else:
# Thousands Of Parameters
scaled_model_params = model_params_count * 1e-3
scale_suffix = "K"
return f"{round(scaled_model_params)}{scale_suffix}"
def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel: def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
return {name: tensor.astype(output_type.type_for_tensor(name, tensor)) return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
for (name, tensor) in model.items()} for (name, tensor) in model.items()}
@ -1281,12 +1379,12 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) ->
# HF models permut or pack some of the tensors, so we need to undo that # HF models permut or pack some of the tensors, so we need to undo that
for i in itertools.count(): for i in itertools.count():
if f"model.layers.{i}.self_attn.q_proj.weight" in model: if f"model.layers.{i}.self_attn.q_proj.weight" in model:
print(f"Permuting layer {i}") logger.debug(f"Permuting layer {i}")
tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head) tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head)
tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv) tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv)
# tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"] # tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
elif f"model.layers.{i}.self_attn.W_pack.weight" in model: elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
print(f"Unpacking and permuting layer {i}") logger.debug(f"Unpacking and permuting layer {i}")
tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head) tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head)
tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv) tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv)
tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy (model[f"model.layers.{i}.self_attn.W_pack.weight"], 2) tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy (model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
@ -1299,15 +1397,15 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) ->
tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None) tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None)
if name_new is None: if name_new is None:
if skip_unknown: if skip_unknown:
print(f"Unexpected tensor name: {name} - skipping") logger.warning(f"Unexpected tensor name: {name} - skipping")
continue continue
raise ValueError(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)") raise ValueError(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)")
if tensor_type in should_skip: if tensor_type in should_skip:
print(f"skipping tensor {name_new}") logger.debug(f"skipping tensor {name_new}")
continue continue
print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}") logger.debug(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}")
out[name_new] = lazy_tensor out[name_new] = lazy_tensor
return out return out
@ -1372,7 +1470,7 @@ def load_some_model(path: Path) -> ModelPlus:
paths = find_multifile_paths(path) paths = find_multifile_paths(path)
models_plus: list[ModelPlus] = [] models_plus: list[ModelPlus] = []
for path in paths: for path in paths:
print(f"Loading model file {path}") logger.info(f"Loading model file {path}")
models_plus.append(lazy_load_file(path)) models_plus.append(lazy_load_file(path))
model_plus = merge_multifile_models(models_plus) model_plus = merge_multifile_models(models_plus)
@ -1413,7 +1511,7 @@ class VocabFactory:
else: else:
raise FileNotFoundError(f"Could not find a tokenizer matching any of {vocab_types}") raise FileNotFoundError(f"Could not find a tokenizer matching any of {vocab_types}")
print(f"Loaded vocab file {vocab.fname_tokenizer!r}, type {vocab.name!r}") logger.info(f"Loaded vocab file {vocab.fname_tokenizer!r}, type {vocab.name!r}")
return vocab return vocab
def load_vocab(self, vocab_types: list[str] | None, model_parent_path: Path) -> tuple[BaseVocab, gguf.SpecialVocab]: def load_vocab(self, vocab_types: list[str] | None, model_parent_path: Path) -> tuple[BaseVocab, gguf.SpecialVocab]:
@ -1430,27 +1528,49 @@ class VocabFactory:
return vocab, special_vocab return vocab, special_vocab
def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path: def default_convention_outfile(file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> str:
namestr = { quantization = {
GGMLFileType.AllF32: "f32", GGMLFileType.AllF32: "F32",
GGMLFileType.MostlyF16: "f16", GGMLFileType.MostlyF16: "F16",
GGMLFileType.MostlyQ8_0:"q8_0", GGMLFileType.MostlyQ8_0: "Q8_0",
}[file_type] }[file_type]
ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf"
parameters = model_parameter_count_rounded_notation(model_params_count)
expert_count = ""
if params.n_experts is not None:
expert_count = f"{params.n_experts}x"
version = ""
if metadata is not None and metadata.version is not None:
version = f"-{metadata.version}"
name = "ggml-model"
if metadata is not None and metadata.name is not None:
name = metadata.name
elif params.path_model is not None:
name = params.path_model.name
return f"{name}{version}-{expert_count}{parameters}-{quantization}"
def default_outfile(model_paths: list[Path], file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> Path:
default_filename = default_convention_outfile(file_type, params, model_params_count, metadata)
ret = model_paths[0].parent / f"{default_filename}.gguf"
if ret in model_paths: if ret in model_paths:
sys.stderr.write( logger.error(
f"Error: Default output path ({ret}) would overwrite the input. " f"Error: Default output path ({ret}) would overwrite the input. "
"Please explicitly specify a path using --outfile.\n") "Please explicitly specify a path using --outfile.")
sys.exit(1) sys.exit(1)
return ret return ret
def do_dump_model(model_plus: ModelPlus) -> None: def do_dump_model(model_plus: ModelPlus) -> None:
print(f"model_plus.paths = {model_plus.paths!r}") print(f"model_plus.paths = {model_plus.paths!r}") # noqa: NP100
print(f"model_plus.format = {model_plus.format!r}") print(f"model_plus.format = {model_plus.format!r}") # noqa: NP100
print(f"model_plus.vocab = {model_plus.vocab!r}") print(f"model_plus.vocab = {model_plus.vocab!r}") # noqa: NP100
for name, lazy_tensor in model_plus.model.items(): for name, lazy_tensor in model_plus.model.items():
print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}") print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}") # noqa: NP100
def main(args_in: list[str] | None = None) -> None: def main(args_in: list[str] | None = None) -> None:
@ -1473,8 +1593,31 @@ def main(args_in: list[str] | None = None) -> None:
parser.add_argument("--big-endian", action="store_true", help="model is executed on big endian machine") parser.add_argument("--big-endian", action="store_true", help="model is executed on big endian machine")
parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides") parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing") parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
parser.add_argument("--metadata", type=Path, help="Specify the path for a metadata file")
parser.add_argument("--get-outfile", action="store_true", help="get calculated default outfile name")
args = parser.parse_args(args_in) args = parser.parse_args(args_in)
if args.verbose:
logging.basicConfig(level=logging.DEBUG)
elif args.dump_single or args.dump or args.get_outfile:
# Avoid printing anything besides the dump output
logging.basicConfig(level=logging.WARNING)
else:
logging.basicConfig(level=logging.INFO)
metadata = Metadata.load(args.metadata)
if args.get_outfile:
model_plus = load_some_model(args.model)
params = Params.load(model_plus)
model = convert_model_names(model_plus.model, params, args.skip_unknown)
model_params_count = model_parameter_count(model_plus.model)
ftype = pick_output_type(model, args.outtype)
print(f"{default_convention_outfile(ftype, params, model_params_count, metadata)}") # noqa: NP100
return
if args.no_vocab and args.vocab_only: if args.no_vocab and args.vocab_only:
raise ValueError("--vocab-only does not make sense with --no-vocab") raise ValueError("--vocab-only does not make sense with --no-vocab")
@ -1488,13 +1631,19 @@ def main(args_in: list[str] | None = None) -> None:
else: else:
model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None) model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)
model_params_count = model_parameter_count(model_plus.model)
logger.info(f"model parameters count : {model_params_count} ({model_parameter_count_rounded_notation(model_params_count)})")
if args.dump: if args.dump:
do_dump_model(model_plus) do_dump_model(model_plus)
return return
endianess = gguf.GGUFEndian.LITTLE endianess = gguf.GGUFEndian.LITTLE
if args.big_endian: if args.big_endian:
endianess = gguf.GGUFEndian.BIG endianess = gguf.GGUFEndian.BIG
params = None
if args.pad_vocab or not args.vocab_only:
params = Params.load(model_plus) params = Params.load(model_plus)
if params.n_ctx == -1: if params.n_ctx == -1:
if args.ctx is None: if args.ctx is None:
@ -1513,7 +1662,7 @@ def main(args_in: list[str] | None = None) -> None:
"q8_0": GGMLFileType.MostlyQ8_0, "q8_0": GGMLFileType.MostlyQ8_0,
}[args.outtype] }[args.outtype]
print(f"params = {params}") logger.info(f"params = {params}")
model_parent_path = model_plus.paths[0].parent model_parent_path = model_plus.paths[0].parent
vocab_path = Path(args.vocab_dir or args.model or model_parent_path) vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
@ -1526,29 +1675,39 @@ def main(args_in: list[str] | None = None) -> None:
if not args.outfile: if not args.outfile:
raise ValueError("need --outfile if using --vocab-only") raise ValueError("need --outfile if using --vocab-only")
outfile = args.outfile outfile = args.outfile
if params is None:
params = Params(
n_vocab = vocab.vocab_size,
n_embd = 1,
n_layer = 1,
n_ctx = 1,
n_ff = 1,
n_head = 1,
n_head_kv = 1,
f_norm_eps = 1e-5,
)
OutputFile.write_vocab_only(outfile, params, vocab, special_vocab, OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
endianess=endianess, pad_vocab=args.pad_vocab) endianess=endianess, pad_vocab=args.pad_vocab, metadata=metadata)
print(f"Wrote {outfile}") logger.info(f"Wrote {outfile}")
return return
if model_plus.vocab is not None and args.vocab_dir is None and not args.no_vocab: if model_plus.vocab is not None and args.vocab_dir is None and not args.no_vocab:
vocab = model_plus.vocab vocab = model_plus.vocab
print(f"Vocab info: {vocab}") logger.info(f"Vocab info: {vocab}")
print(f"Special vocab info: {special_vocab}") logger.info(f"Special vocab info: {special_vocab}")
model = model_plus.model model = model_plus.model
model = convert_model_names(model, params, args.skip_unknown) model = convert_model_names(model, params, args.skip_unknown)
ftype = pick_output_type(model, args.outtype) ftype = pick_output_type(model, args.outtype)
model = convert_to_output_type(model, ftype) model = convert_to_output_type(model, ftype)
outfile = args.outfile or default_outfile(model_plus.paths, ftype) outfile = args.outfile or default_outfile(model_plus.paths, ftype, params, model_params_count, metadata)
params.ftype = ftype params.ftype = ftype
print(f"Writing {outfile}, format {ftype}") logger.info(f"Writing {outfile}, format {ftype}")
OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab) concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab, metadata=metadata)
print(f"Wrote {outfile}") logger.info(f"Wrote {outfile}")
if __name__ == '__main__': if __name__ == '__main__':

View file

@ -23,7 +23,7 @@ Install BLIS:
sudo make install sudo make install
``` ```
We recommend using openmp since it's easier to modify the cores been used. We recommend using openmp since it's easier to modify the cores being used.
### llama.cpp compilation ### llama.cpp compilation

View file

@ -96,9 +96,9 @@ NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorc
This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`. This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`.
Have a look to existing implementation like `build_llama`, `build_dbrx` or `build_bert`. Have a look at existing implementation like `build_llama`, `build_dbrx` or `build_bert`.
When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support of missing backend operations can be added in another PR. When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support for missing backend operations can be added in another PR.
Note: to debug the inference graph: you can use [eval-callback](../examples/eval-callback). Note: to debug the inference graph: you can use [eval-callback](../examples/eval-callback).

104
docs/debugging-tests.md Normal file
View file

@ -0,0 +1,104 @@
# Debugging Tests Tips
## How to run & execute or debug a specific test without anything else to keep the feedback loop short?
There is a script called debug-test.sh in the scripts folder whose parameter takes a REGEX and an optional test number.
For example, running the following command will output an interactive list from which you can select a test. It takes this form:
`debug-test.sh [OPTION]... <test_regex> <test_number>`
It will then build & run in the debugger for you.
To just execute a test and get back a PASS or FAIL message run:
```bash
./scripts/debug-test.sh test-tokenizer
```
To test in GDB use the `-g` flag to enable gdb test mode.
```bash
./scripts/debug-test.sh -g test-tokenizer
# Once in the debugger, i.e. at the chevrons prompt, setting a breakpoint could be as follows:
>>> b main
```
To speed up the testing loop, if you know your test number you can just run it similar to below:
```bash
./scripts/debug-test.sh test 23
```
For further reference use `debug-test.sh -h` to print help.
&nbsp;
### How does the script work?
If you want to be able to use the concepts contained in the script separately, the important ones are briefly outlined below.
#### Step 1: Reset and Setup folder context
From base of this repository, let's create `build-ci-debug` as our build context.
```bash
rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug
```
#### Step 2: Setup Build Environment and Compile Test Binaries
Setup and trigger a build under debug mode. You may adapt the arguments as needed, but in this case these are sane defaults.
```bash
cmake -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_FATAL_WARNINGS=ON ..
make -j
```
#### Step 3: Find all tests available that matches REGEX
The output of this command will give you the command & arguments needed to run GDB.
* `-R test-tokenizer` : looks for all the test files named `test-tokenizer*` (R=Regex)
* `-N` : "show-only" disables test execution & shows test commands that you can feed to GDB.
* `-V` : Verbose Mode
```bash
ctest -R "test-tokenizer" -V -N
```
This may return output similar to below (focusing on key lines to pay attention to):
```bash
...
1: Test command: ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf"
1: Working Directory: .
Labels: main
Test #1: test-tokenizer-0-llama-spm
...
4: Test command: ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-falcon.gguf"
4: Working Directory: .
Labels: main
Test #4: test-tokenizer-0-falcon
...
```
#### Step 4: Identify Test Command for Debugging
So for test #1 above we can tell these two pieces of relevant information:
* Test Binary: `~/llama.cpp/build-ci-debug/bin/test-tokenizer-0`
* Test GGUF Model: `~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf`
#### Step 5: Run GDB on test command
Based on the ctest 'test command' report above we can then run a gdb session via this command below:
```bash
gdb --args ${Test Binary} ${Test GGUF Model}
```
Example:
```bash
gdb --args ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf"
```

View file

@ -49,4 +49,7 @@ else()
add_subdirectory(server) add_subdirectory(server)
endif() endif()
add_subdirectory(export-lora) add_subdirectory(export-lora)
if (LLAMA_RPC)
add_subdirectory(rpc)
endif()
endif() endif()

View file

@ -32,7 +32,7 @@ int main(int argc, char ** argv) {
gpt_params params; gpt_params params;
if (argc == 1 || argv[1][0] == '-') { if (argc == 1 || argv[1][0] == '-') {
printf("usage: %s MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]); printf("usage: %s MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [FATTN] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
printf(" <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n"); printf(" <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
printf(" example: %s ggml-model-f16.gguf 2048 2048 512 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]); printf(" example: %s ggml-model-f16.gguf 2048 2048 512 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
return 1 ; return 1 ;
@ -41,6 +41,7 @@ int main(int argc, char ** argv) {
int n_kv_max = 2048; int n_kv_max = 2048;
int n_batch = 2048; int n_batch = 2048;
int n_ubatch = 512; int n_ubatch = 512;
bool flash_attn = false;
int is_pp_shared = 0; int is_pp_shared = 0;
int n_gpu_layers = 0; int n_gpu_layers = 0;
@ -66,23 +67,27 @@ int main(int argc, char ** argv) {
} }
if (argc >= 6) { if (argc >= 6) {
is_pp_shared = std::atoi(argv[5]); flash_attn = std::atoi(argv[5]);
} }
if (argc >= 7) { if (argc >= 7) {
n_gpu_layers = std::atoi(argv[6]); is_pp_shared = std::atoi(argv[6]);
} }
if (argc >= 8) { if (argc >= 8) {
n_pp = parse_list(argv[7]); n_gpu_layers = std::atoi(argv[7]);
} }
if (argc >= 9) { if (argc >= 9) {
n_tg = parse_list(argv[8]); n_pp = parse_list(argv[8]);
} }
if (argc >= 10) { if (argc >= 10) {
n_pl = parse_list(argv[9]); n_tg = parse_list(argv[9]);
}
if (argc >= 11) {
n_pl = parse_list(argv[10]);
} }
// init LLM // init LLM
@ -112,6 +117,7 @@ int main(int argc, char ** argv) {
ctx_params.n_ctx = n_kv_max; ctx_params.n_ctx = n_kv_max;
ctx_params.n_batch = n_batch; ctx_params.n_batch = n_batch;
ctx_params.n_ubatch = n_ubatch; ctx_params.n_ubatch = n_ubatch;
ctx_params.flash_attn = flash_attn;
ctx_params.n_threads = params.n_threads; ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
@ -169,7 +175,7 @@ int main(int argc, char ** argv) {
} }
LOG_TEE("\n"); LOG_TEE("\n");
LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, n_batch, n_ubatch, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch); LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, n_batch, n_ubatch, flash_attn, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
LOG_TEE("\n"); LOG_TEE("\n");
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s"); LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");

View file

@ -48,7 +48,7 @@ int main(int argc, char ** argv) {
params.prompt = "Hello my name is"; params.prompt = "Hello my name is";
} }
process_escapes(params.prompt); string_process_escapes(params.prompt);
// init LLM // init LLM

View file

@ -2,7 +2,7 @@
This example reads weights from project [llama2.c](https://github.com/karpathy/llama2.c) and saves them in ggml compatible format. The vocab that is available in `models/ggml-vocab.bin` is used by default. This example reads weights from project [llama2.c](https://github.com/karpathy/llama2.c) and saves them in ggml compatible format. The vocab that is available in `models/ggml-vocab.bin` is used by default.
To convert the model first download the models from the [llma2.c](https://github.com/karpathy/llama2.c) repository: To convert the model first download the models from the [llama2.c](https://github.com/karpathy/llama2.c) repository:
`$ make -j` `$ make -j`

View file

@ -49,6 +49,12 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
} }
float * out = output + batch.seq_id[i][0] * n_embd; float * out = output + batch.seq_id[i][0] * n_embd;
//TODO: I would also add a parameter here to enable normalization or not.
/*fprintf(stdout, "unnormalized_embedding:");
for (int hh = 0; hh < n_embd; hh++) {
fprintf(stdout, "%9.6f ", embd[hh]);
}
fprintf(stdout, "\n");*/
llama_embd_normalize(embd, out, n_embd); llama_embd_normalize(embd, out, n_embd);
} }
} }
@ -74,7 +80,7 @@ int main(int argc, char ** argv) {
std::mt19937 rng(params.seed); std::mt19937 rng(params.seed);
if (params.random_prompt) { if (params.random_prompt) {
params.prompt = gpt_random_prompt(rng); params.prompt = string_random_prompt(rng);
} }
llama_backend_init(); llama_backend_init();
@ -101,7 +107,7 @@ int main(int argc, char ** argv) {
// print system information // print system information
{ {
fprintf(stderr, "\n"); fprintf(stderr, "\n");
fprintf(stderr, "%s\n", get_system_info(params).c_str()); fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
} }
// split the prompt into lines // split the prompt into lines
@ -123,10 +129,12 @@ int main(int argc, char ** argv) {
inputs.push_back(inp); inputs.push_back(inp);
} }
// add SEP if not present // check if the last token is SEP
// it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
for (auto & inp : inputs) { for (auto & inp : inputs) {
if (inp.empty() || inp.back() != llama_token_sep(model)) { if (inp.empty() || inp.back() != llama_token_sep(model)) {
inp.push_back(llama_token_sep(model)); fprintf(stderr, "%s: warning: last token in the prompt is not SEP\n", __func__);
fprintf(stderr, "%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
} }
} }
@ -203,6 +211,7 @@ int main(int argc, char ** argv) {
// clean up // clean up
llama_print_timings(ctx); llama_print_timings(ctx);
llama_batch_free(batch);
llama_free(ctx); llama_free(ctx);
llama_free_model(model); llama_free_model(model);
llama_backend_free(); llama_backend_free();

View file

@ -52,15 +52,15 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0]; size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
float v; float v;
if (type == GGML_TYPE_F16) { if (type == GGML_TYPE_F16) {
v = ggml_fp16_to_fp32(*(ggml_fp16_t *) data + i); v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
} else if (type == GGML_TYPE_F32) { } else if (type == GGML_TYPE_F32) {
v = *(float *) data + i; v = *(float *) &data[i];
} else if (type == GGML_TYPE_I32) { } else if (type == GGML_TYPE_I32) {
v = (float) *(int32_t *) data + i; v = (float) *(int32_t *) &data[i];
} else if (type == GGML_TYPE_I16) { } else if (type == GGML_TYPE_I16) {
v = (float) *(int16_t *) data + i; v = (float) *(int16_t *) &data[i];
} else if (type == GGML_TYPE_I8) { } else if (type == GGML_TYPE_I8) {
v = (float) *(int8_t *) data + i; v = (float) *(int8_t *) &data[i];
} else { } else {
GGML_ASSERT(false); GGML_ASSERT(false);
} }
@ -152,7 +152,7 @@ int main(int argc, char ** argv) {
std::mt19937 rng(params.seed); std::mt19937 rng(params.seed);
if (params.random_prompt) { if (params.random_prompt) {
params.prompt = gpt_random_prompt(rng); params.prompt = string_random_prompt(rng);
} }
llama_backend_init(); llama_backend_init();
@ -176,7 +176,7 @@ int main(int argc, char ** argv) {
// print system information // print system information
{ {
fprintf(stderr, "\n"); fprintf(stderr, "\n");
fprintf(stderr, "%s\n", get_system_info(params).c_str()); fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
} }
bool OK = run(ctx, params); bool OK = run(ctx, params);

View file

@ -563,8 +563,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
// not capturing these, to silcence warnings // not capturing these, to silcence warnings
const int rope_mode = 0; const int rope_mode = 0;
return ggml_rope_custom(ctx, return ggml_rope_ext(ctx,
t, KQ_pos, n_rot, rope_mode, n_ctx, 0, t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, 0,
rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
); );
}; };
@ -575,7 +575,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
GGML_ASSERT(tokens_input->type == GGML_TYPE_I32); GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);
auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) { auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16) { if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16 || a->type == GGML_TYPE_BF16) {
return ggml_add_cast(ctx, a, b, GGML_TYPE_F32); return ggml_add_cast(ctx, a, b, GGML_TYPE_F32);
} else if (a->type == GGML_TYPE_F32) { } else if (a->type == GGML_TYPE_F32) {
return ggml_add(ctx, a, b); return ggml_add(ctx, a, b);
@ -643,7 +643,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
struct ggml_tensor * t15 = ggml_permute (ctx, t12, 0, 3, 1, 2); set_name(t15, "t15"); assert_shape_4d(t15, N, n_embd_head, n_head_kv, n_batch); struct ggml_tensor * t15 = ggml_permute (ctx, t12, 0, 3, 1, 2); set_name(t15, "t15"); assert_shape_4d(t15, N, n_embd_head, n_head_kv, n_batch);
struct ggml_tensor * t16; struct ggml_tensor * t16;
if (enable_flash_attn) { if (enable_flash_attn) {
t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd_head, N, n_head, n_batch); GGML_ASSERT(false && "TODO: ggml_flash_attn_ext() not yet supported");
//t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd_head, N, n_head, n_batch);
} else { } else {
struct ggml_tensor * t16_0 = ggml_mul_mat (ctx, t14, t13); set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch); struct ggml_tensor * t16_0 = ggml_mul_mat (ctx, t14, t13); set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch);
struct ggml_tensor * t16_1 = ggml_scale_inplace (ctx, t16_0, kv_scale); set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch); struct ggml_tensor * t16_1 = ggml_scale_inplace (ctx, t16_0, kv_scale); set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch);

View file

@ -32,6 +32,7 @@ struct split_params {
int n_split_tensors = 128; int n_split_tensors = 128;
std::string input; std::string input;
std::string output; std::string output;
bool no_tensor_first_split = false;
bool dry_run = false; bool dry_run = false;
}; };
@ -49,6 +50,7 @@ static void split_print_usage(const char * executable) {
printf(" --merge merge multiple GGUF to a single GGUF\n"); printf(" --merge merge multiple GGUF to a single GGUF\n");
printf(" --split-max-tensors max tensors in each split (default: %d)\n", default_params.n_split_tensors); printf(" --split-max-tensors max tensors in each split (default: %d)\n", default_params.n_split_tensors);
printf(" --split-max-size N(M|G) max size per split\n"); printf(" --split-max-size N(M|G) max size per split\n");
printf(" --no-tensor-first-split do not add tensors to the first split (disabled by default)\n");
printf(" --dry-run only print out a split plan and exit, without writing any new files\n"); printf(" --dry-run only print out a split plan and exit, without writing any new files\n");
printf("\n"); printf("\n");
} }
@ -100,6 +102,10 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p
arg_found = true; arg_found = true;
params.dry_run = true; params.dry_run = true;
} }
if (arg == "--no-tensor-first-split") {
arg_found = true;
params.no_tensor_first_split = true;
}
if (is_op_set) { if (is_op_set) {
throw std::invalid_argument("error: either --split or --merge can be specified, but not both"); throw std::invalid_argument("error: either --split or --merge can be specified, but not both");
@ -200,10 +206,10 @@ struct split_strategy {
// because we need to know list of tensors for each file in advance, we will build all the ctx_out for all output splits // because we need to know list of tensors for each file in advance, we will build all the ctx_out for all output splits
int i_split = -1; int i_split = -1;
struct gguf_context * ctx_out = NULL; struct gguf_context * ctx_out = NULL;
auto new_ctx_out = [&]() { auto new_ctx_out = [&](bool allow_no_tensors) {
i_split++; i_split++;
if (ctx_out != NULL) { if (ctx_out != NULL) {
if (gguf_get_n_tensors(ctx_out) == 0) { if (gguf_get_n_tensors(ctx_out) == 0 && !allow_no_tensors) {
fprintf(stderr, "error: one of splits have 0 tensors. Maybe size or tensors limit is too small\n"); fprintf(stderr, "error: one of splits have 0 tensors. Maybe size or tensors limit is too small\n");
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
@ -220,7 +226,12 @@ struct split_strategy {
}; };
// initialize ctx_out for the first split // initialize ctx_out for the first split
new_ctx_out(); new_ctx_out(false);
// skip first split if no_tensor_first_split is set
if (params.no_tensor_first_split) {
new_ctx_out(true);
}
// process tensors one by one // process tensors one by one
size_t curr_tensors_size = 0; // current size by counting only tensors size (without metadata) size_t curr_tensors_size = 0; // current size by counting only tensors size (without metadata)
@ -230,7 +241,7 @@ struct split_strategy {
size_t n_bytes = GGML_PAD(ggml_nbytes(t), GGUF_DEFAULT_ALIGNMENT); size_t n_bytes = GGML_PAD(ggml_nbytes(t), GGUF_DEFAULT_ALIGNMENT);
size_t next_tensors_size = curr_tensors_size + n_bytes; size_t next_tensors_size = curr_tensors_size + n_bytes;
if (should_split(i, next_tensors_size)) { if (should_split(i, next_tensors_size)) {
new_ctx_out(); new_ctx_out(false);
curr_tensors_size = n_bytes; curr_tensors_size = n_bytes;
} else { } else {
curr_tensors_size = next_tensors_size; curr_tensors_size = next_tensors_size;

View file

@ -55,15 +55,15 @@ $MAIN --model $WORK_PATH/ggml-model-merge.gguf --random-prompt --n-predict 32
echo PASS echo PASS
echo echo
# 4. Split with no tensor in metadata # 4. Split with no tensors in the first split
#$SPLIT --split-max-tensors 32 --no-tensor-in-metadata $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-32-tensors $SPLIT --split-max-tensors 32 --no-tensor-first-split $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-32-tensors
#echo PASS echo PASS
#echo echo
# 4b. Test the sharded model is loading properly # 4b. Test the sharded model is loading properly
#$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00006.gguf --random-prompt --n-predict 32 $MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --random-prompt --n-predict 32
#echo PASS echo PASS
#echo echo
# 5. Merge # 5. Merge
#$SPLIT --merge $WORK_PATH/ggml-model-split-32-tensors-00001-of-00006.gguf $WORK_PATH/ggml-model-merge-2.gguf #$SPLIT --merge $WORK_PATH/ggml-model-split-32-tensors-00001-of-00006.gguf $WORK_PATH/ggml-model-merge-2.gguf

View file

@ -19,6 +19,7 @@
struct Stats { struct Stats {
std::vector<float> values; std::vector<float> values;
std::vector<int> counts;
int ncall = 0; int ncall = 0;
}; };
@ -121,12 +122,10 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
auto & e = m_stats[wname]; auto & e = m_stats[wname];
++e.ncall; ++e.ncall;
// NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger
// using the following line, we can correct for that if needed by replacing the line above with:
//if (idx == t->src[0]->ne[0] - 1) ++e.ncall;
if (e.values.empty()) { if (e.values.empty()) {
e.values.resize(src1->ne[0]*n_as, 0); e.values.resize(src1->ne[0]*n_as, 0);
e.counts.resize(src1->ne[0]*n_as, 0);
} }
else if (e.values.size() != (size_t)src1->ne[0]*n_as) { else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as); fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
@ -153,6 +152,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
for (int j = 0; j < (int)src1->ne[0]; ++j) { for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[e_start + j] += x[j]*x[j]; e.values[e_start + j] += x[j]*x[j];
e.counts[e_start + j]++;
} }
} }
} }
@ -170,6 +170,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
auto& e = m_stats[wname]; auto& e = m_stats[wname];
if (e.values.empty()) { if (e.values.empty()) {
e.values.resize(src1->ne[0], 0); e.values.resize(src1->ne[0], 0);
e.counts.resize(src1->ne[0], 0);
} }
else if (e.values.size() != (size_t)src1->ne[0]) { else if (e.values.size() != (size_t)src1->ne[0]) {
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]); fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
@ -183,6 +184,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
const float * x = data + row * src1->ne[0]; const float * x = data + row * src1->ne[0];
for (int j = 0; j < (int)src1->ne[0]; ++j) { for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[j] += x[j]*x[j]; e.values[j] += x[j]*x[j];
e.counts[j]++;
} }
} }
if (e.ncall > m_last_call) { if (e.ncall > m_last_call) {
@ -222,7 +224,13 @@ void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) co
out.write((const char *) &p.second.ncall, sizeof(p.second.ncall)); out.write((const char *) &p.second.ncall, sizeof(p.second.ncall));
int nval = p.second.values.size(); int nval = p.second.values.size();
out.write((const char *) &nval, sizeof(nval)); out.write((const char *) &nval, sizeof(nval));
if (nval > 0) out.write((const char *) p.second.values.data(), nval * sizeof(float)); if (nval > 0) {
std::vector<float> tmp(nval);
for (int i = 0; i < nval; i++) {
tmp[i] = (p.second.values[i] / static_cast<float>(p.second.counts[i])) * static_cast<float>(p.second.ncall);
}
out.write((const char*)tmp.data(), nval*sizeof(float));
}
} }
// Write the number of call the matrix was computed with // Write the number of call the matrix was computed with
@ -270,14 +278,28 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
imatrix_data = {}; imatrix_data = {};
return false; return false;
} }
e.values.resize(nval);
in.read((char*)e.values.data(), nval*sizeof(float)); // When re-called from load_imatrix() with add set, this will already be created.
if (e.values.empty()) {
e.values.resize(nval, 0);
e.counts.resize(nval, 0);
}
std::vector<float> tmp(nval);
in.read((char*)tmp.data(), nval*sizeof(float));
if (in.fail()) { if (in.fail()) {
printf("%s: failed reading data for entry %d\n",__func__,i); printf("%s: failed reading data for entry %d\n",__func__,i);
imatrix_data = {}; imatrix_data = {};
return false; return false;
} }
e.ncall = ncall;
// Recreate the state as expected by save_imatrix(), and corerct for weighted sum.
for (int i = 0; i < nval; i++) {
e.values[i] += tmp[i];
e.counts[i] += ncall;
}
e.ncall += ncall;
} }
return true; return true;
} }
@ -576,7 +598,7 @@ int main(int argc, char ** argv) {
std::mt19937 rng(params.seed); std::mt19937 rng(params.seed);
if (params.random_prompt) { if (params.random_prompt) {
params.prompt = gpt_random_prompt(rng); params.prompt = string_random_prompt(rng);
} }
sparams.dataset = params.prompt_file; sparams.dataset = params.prompt_file;
@ -645,7 +667,7 @@ int main(int argc, char ** argv) {
// print system information // print system information
{ {
fprintf(stderr, "\n"); fprintf(stderr, "\n");
fprintf(stderr, "%s\n", get_system_info(params).c_str()); fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
} }
bool OK = compute_imatrix(ctx, params, compute_ppl, from_chunk); bool OK = compute_imatrix(ctx, params, compute_ppl, from_chunk);

View file

@ -50,9 +50,9 @@ static void write_logfile(
return; return;
} }
const std::string timestamp = get_sortable_timestamp(); const std::string timestamp = string_get_sortable_timestamp();
const bool success = create_directory_with_parents(params.logdir); const bool success = fs_create_directory_with_parents(params.logdir);
if (!success) { if (!success) {
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
__func__, params.logdir.c_str()); __func__, params.logdir.c_str());
@ -70,7 +70,7 @@ static void write_logfile(
fprintf(logfile, "binary: infill\n"); fprintf(logfile, "binary: infill\n");
char model_desc[128]; char model_desc[128];
llama_model_desc(model, model_desc, sizeof(model_desc)); llama_model_desc(model, model_desc, sizeof(model_desc));
dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc); yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
fprintf(logfile, "\n"); fprintf(logfile, "\n");
fprintf(logfile, "######################\n"); fprintf(logfile, "######################\n");
@ -78,8 +78,8 @@ static void write_logfile(
fprintf(logfile, "######################\n"); fprintf(logfile, "######################\n");
fprintf(logfile, "\n"); fprintf(logfile, "\n");
dump_string_yaml_multiline(logfile, "output", output.c_str()); yaml_dump_string_multiline(logfile, "output", output.c_str());
dump_vector_int_yaml(logfile, "output_tokens", output_tokens); yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
llama_dump_timing_info_yaml(logfile, ctx); llama_dump_timing_info_yaml(logfile, ctx);
fclose(logfile); fclose(logfile);
@ -236,7 +236,7 @@ int main(int argc, char ** argv) {
// print system information // print system information
{ {
LOG_TEE("\n"); LOG_TEE("\n");
LOG_TEE("%s\n", get_system_info(params).c_str()); LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
} }
const bool add_bos = llama_should_add_bos_token(model); const bool add_bos = llama_should_add_bos_token(model);
GGML_ASSERT(llama_add_eos_token(model) != 1); GGML_ASSERT(llama_add_eos_token(model) != 1);
@ -621,8 +621,8 @@ int main(int argc, char ** argv) {
if (params.escape) { if (params.escape) {
//process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here //process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here
process_escapes(params.input_prefix); string_process_escapes(params.input_prefix);
process_escapes(params.input_suffix); string_process_escapes(params.input_suffix);
} }
suff_rm_leading_spc = params.escape; suff_rm_leading_spc = params.escape;
if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) { if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {

View file

@ -26,16 +26,21 @@ options:
-m, --model <filename> (default: models/7B/ggml-model-q4_0.gguf) -m, --model <filename> (default: models/7B/ggml-model-q4_0.gguf)
-p, --n-prompt <n> (default: 512) -p, --n-prompt <n> (default: 512)
-n, --n-gen <n> (default: 128) -n, --n-gen <n> (default: 128)
-b, --batch-size <n> (default: 512) -pg <pp,tg> (default: 512,128)
-ctk <t>, --cache-type-k <t> (default: f16) -b, --batch-size <n> (default: 2048)
-ctv <t>, --cache-type-v <t> (default: f16) -ub, --ubatch-size <n> (default: 512)
-t, --threads <n> (default: 112) -ctk, --cache-type-k <t> (default: f16)
-ctv, --cache-type-v <t> (default: f16)
-t, --threads <n> (default: 16)
-ngl, --n-gpu-layers <n> (default: 99) -ngl, --n-gpu-layers <n> (default: 99)
-sm, --split-mode <none|layer|row> (default: layer) -sm, --split-mode <none|layer|row> (default: layer)
-mg, --main-gpu <i> (default: 0) -mg, --main-gpu <i> (default: 0)
-nkvo, --no-kv-offload <0|1> (default: 0) -nkvo, --no-kv-offload <0|1> (default: 0)
-fa, --flash-attn <0|1> (default: 0)
-mmp, --mmap <0|1> (default: 1) -mmp, --mmap <0|1> (default: 1)
-ts, --tensor_split <ts0/ts1/..> (default: 0) --numa <distribute|isolate|numactl> (default: disabled)
-embd, --embeddings <0|1> (default: 0)
-ts, --tensor-split <ts0/ts1/..> (default: 0)
-r, --repetitions <n> (default: 5) -r, --repetitions <n> (default: 5)
-o, --output <csv|json|md|sql> (default: md) -o, --output <csv|json|md|sql> (default: md)
-v, --verbose (default: 0) -v, --verbose (default: 0)
@ -43,10 +48,11 @@ options:
Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times. Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.
``` ```
llama-bench can perform two types of tests: llama-bench can perform three types of tests:
- Prompt processing (pp): processing a prompt in batches (`-p`) - Prompt processing (pp): processing a prompt in batches (`-p`)
- Text generation (tg): generating a sequence of tokens (`-n`) - Text generation (tg): generating a sequence of tokens (`-n`)
- Prompt processing + text generation (pg): processing a prompt followed by generating a sequence of tokens (`-pg`)
With the exception of `-r`, `-o` and `-v`, all options can be specified multiple times to run multiple tests. Each pp and tg test is run with all combinations of the specified options. To specify multiple values for an option, the values can be separated by commas (e.g. `-n 16,32`), or the option can be specified multiple times (e.g. `-n 16 -n 32`). With the exception of `-r`, `-o` and `-v`, all options can be specified multiple times to run multiple tests. Each pp and tg test is run with all combinations of the specified options. To specify multiple values for an option, the values can be separated by commas (e.g. `-n 16,32`), or the option can be specified multiple times (e.g. `-n 16 -n 32`).

View file

@ -161,10 +161,17 @@ static const char * split_mode_str(llama_split_mode mode) {
} }
} }
static std::string pair_str(const std::pair<int, int> & p) {
static char buf[32];
snprintf(buf, sizeof(buf), "%d,%d", p.first, p.second);
return buf;
}
struct cmd_params { struct cmd_params {
std::vector<std::string> model; std::vector<std::string> model;
std::vector<int> n_prompt; std::vector<int> n_prompt;
std::vector<int> n_gen; std::vector<int> n_gen;
std::vector<std::pair<int, int>> n_pg;
std::vector<int> n_batch; std::vector<int> n_batch;
std::vector<int> n_ubatch; std::vector<int> n_ubatch;
std::vector<ggml_type> type_k; std::vector<ggml_type> type_k;
@ -174,9 +181,11 @@ struct cmd_params {
std::vector<llama_split_mode> split_mode; std::vector<llama_split_mode> split_mode;
std::vector<int> main_gpu; std::vector<int> main_gpu;
std::vector<bool> no_kv_offload; std::vector<bool> no_kv_offload;
std::vector<bool> flash_attn;
std::vector<std::vector<float>> tensor_split; std::vector<std::vector<float>> tensor_split;
std::vector<bool> use_mmap; std::vector<bool> use_mmap;
std::vector<bool> embeddings; std::vector<bool> embeddings;
ggml_numa_strategy numa;
int reps; int reps;
bool verbose; bool verbose;
output_formats output_format; output_formats output_format;
@ -186,18 +195,21 @@ static const cmd_params cmd_params_defaults = {
/* model */ {"models/7B/ggml-model-q4_0.gguf"}, /* model */ {"models/7B/ggml-model-q4_0.gguf"},
/* n_prompt */ {512}, /* n_prompt */ {512},
/* n_gen */ {128}, /* n_gen */ {128},
/* n_pg */ {},
/* n_batch */ {2048}, /* n_batch */ {2048},
/* n_ubatch */ {512}, /* n_ubatch */ {512},
/* type_k */ {GGML_TYPE_F16}, /* type_k */ {GGML_TYPE_F16},
/* type_v */ {GGML_TYPE_F16}, /* type_v */ {GGML_TYPE_F16},
/* n_threads */ {get_math_cpu_count()}, /* n_threads */ {cpu_get_num_math()},
/* n_gpu_layers */ {99}, /* n_gpu_layers */ {99},
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER}, /* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
/* main_gpu */ {0}, /* main_gpu */ {0},
/* no_kv_offload */ {false}, /* no_kv_offload */ {false},
/* flash_attn */ {false},
/* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)}, /* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
/* use_mmap */ {true}, /* use_mmap */ {true},
/* embeddings */ {false}, /* embeddings */ {false},
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
/* reps */ 5, /* reps */ 5,
/* verbose */ false, /* verbose */ false,
/* output_format */ MARKDOWN /* output_format */ MARKDOWN
@ -211,16 +223,19 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str()); printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str()); printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str()); printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
printf(" -pg <pp,tg> (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str()); printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
printf(" -ub N, --ubatch-size <n> (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str()); printf(" -ub, --ubatch-size <n> (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
printf(" -ctk <t>, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str()); printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
printf(" -ctv <t>, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str()); printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str()); printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str()); printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str()); printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str()); printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n"); printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps); printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
@ -298,6 +313,17 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
} }
auto p = split<int>(argv[i], split_delim); auto p = split<int>(argv[i], split_delim);
params.n_gen.insert(params.n_gen.end(), p.begin(), p.end()); params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
} else if (arg == "-pg") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<std::string>(argv[i], ',');
if (p.size() != 2) {
invalid_param = true;
break;
}
params.n_pg.push_back({std::stoi(p[0]), std::stoi(p[1])});
} else if (arg == "-b" || arg == "--batch-size") { } else if (arg == "-b" || arg == "--batch-size") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -393,6 +419,24 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
} }
auto p = split<bool>(argv[i], split_delim); auto p = split<bool>(argv[i], split_delim);
params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end()); params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
} else if (arg == "--numa") {
if (++i >= argc) {
invalid_param = true;
break;
} else {
std::string value(argv[i]);
/**/ if (value == "distribute" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
else { invalid_param = true; break; }
}
} else if (arg == "-fa" || arg == "--flash-attn") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<bool>(argv[i], split_delim);
params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end());
} else if (arg == "-mmp" || arg == "--mmap") { } else if (arg == "-mmp" || arg == "--mmap") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -469,6 +513,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
if (params.model.empty()) { params.model = cmd_params_defaults.model; } if (params.model.empty()) { params.model = cmd_params_defaults.model; }
if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; } if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; }
if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; } if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; }
if (params.n_pg.empty()) { params.n_pg = cmd_params_defaults.n_pg; }
if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; } if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; }
if (params.n_ubatch.empty()) { params.n_ubatch = cmd_params_defaults.n_ubatch; } if (params.n_ubatch.empty()) { params.n_ubatch = cmd_params_defaults.n_ubatch; }
if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; } if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; }
@ -477,6 +522,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
if (params.split_mode.empty()) { params.split_mode = cmd_params_defaults.split_mode; } if (params.split_mode.empty()) { params.split_mode = cmd_params_defaults.split_mode; }
if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; } if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; } if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
if (params.flash_attn.empty()) { params.flash_attn = cmd_params_defaults.flash_attn; }
if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; } if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; } if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; } if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; }
@ -498,6 +544,7 @@ struct cmd_params_instance {
llama_split_mode split_mode; llama_split_mode split_mode;
int main_gpu; int main_gpu;
bool no_kv_offload; bool no_kv_offload;
bool flash_attn;
std::vector<float> tensor_split; std::vector<float> tensor_split;
bool use_mmap; bool use_mmap;
bool embeddings; bool embeddings;
@ -532,6 +579,7 @@ struct cmd_params_instance {
cparams.type_k = type_k; cparams.type_k = type_k;
cparams.type_v = type_v; cparams.type_v = type_v;
cparams.offload_kqv = !no_kv_offload; cparams.offload_kqv = !no_kv_offload;
cparams.flash_attn = flash_attn;
cparams.embeddings = embeddings; cparams.embeddings = embeddings;
return cparams; return cparams;
@ -554,6 +602,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
for (const auto & tk : params.type_k) for (const auto & tk : params.type_k)
for (const auto & tv : params.type_v) for (const auto & tv : params.type_v)
for (const auto & nkvo : params.no_kv_offload) for (const auto & nkvo : params.no_kv_offload)
for (const auto & fa : params.flash_attn)
for (const auto & nt : params.n_threads) { for (const auto & nt : params.n_threads) {
for (const auto & n_prompt : params.n_prompt) { for (const auto & n_prompt : params.n_prompt) {
if (n_prompt == 0) { if (n_prompt == 0) {
@ -572,6 +621,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .split_mode = */ sm, /* .split_mode = */ sm,
/* .main_gpu = */ mg, /* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo, /* .no_kv_offload= */ nkvo,
/* .flash_attn = */ fa,
/* .tensor_split = */ ts, /* .tensor_split = */ ts,
/* .use_mmap = */ mmp, /* .use_mmap = */ mmp,
/* .embeddings = */ embd, /* .embeddings = */ embd,
@ -596,6 +646,32 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .split_mode = */ sm, /* .split_mode = */ sm,
/* .main_gpu = */ mg, /* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo, /* .no_kv_offload= */ nkvo,
/* .flash_attn = */ fa,
/* .tensor_split = */ ts,
/* .use_mmap = */ mmp,
/* .embeddings = */ embd,
};
instances.push_back(instance);
}
for (const auto & n_pg : params.n_pg) {
if (n_pg.first == 0 && n_pg.second == 0) {
continue;
}
cmd_params_instance instance = {
/* .model = */ m,
/* .n_prompt = */ n_pg.first,
/* .n_gen = */ n_pg.second,
/* .n_batch = */ nb,
/* .n_ubatch = */ nub,
/* .type_k = */ tk,
/* .type_v = */ tv,
/* .n_threads = */ nt,
/* .n_gpu_layers = */ nl,
/* .split_mode = */ sm,
/* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo,
/* .flash_attn = */ fa,
/* .tensor_split = */ ts, /* .tensor_split = */ ts,
/* .use_mmap = */ mmp, /* .use_mmap = */ mmp,
/* .embeddings = */ embd, /* .embeddings = */ embd,
@ -633,6 +709,7 @@ struct test {
llama_split_mode split_mode; llama_split_mode split_mode;
int main_gpu; int main_gpu;
bool no_kv_offload; bool no_kv_offload;
bool flash_attn;
std::vector<float> tensor_split; std::vector<float> tensor_split;
bool use_mmap; bool use_mmap;
bool embeddings; bool embeddings;
@ -657,6 +734,7 @@ struct test {
split_mode = inst.split_mode; split_mode = inst.split_mode;
main_gpu = inst.main_gpu; main_gpu = inst.main_gpu;
no_kv_offload = inst.no_kv_offload; no_kv_offload = inst.no_kv_offload;
flash_attn = inst.flash_attn;
tensor_split = inst.tensor_split; tensor_split = inst.tensor_split;
use_mmap = inst.use_mmap; use_mmap = inst.use_mmap;
embeddings = inst.embeddings; embeddings = inst.embeddings;
@ -731,7 +809,7 @@ struct test {
"n_batch", "n_ubatch", "n_batch", "n_ubatch",
"n_threads", "type_k", "type_v", "n_threads", "type_k", "type_v",
"n_gpu_layers", "split_mode", "n_gpu_layers", "split_mode",
"main_gpu", "no_kv_offload", "main_gpu", "no_kv_offload", "flash_attn",
"tensor_split", "use_mmap", "embeddings", "tensor_split", "use_mmap", "embeddings",
"n_prompt", "n_gen", "test_time", "n_prompt", "n_gen", "test_time",
"avg_ns", "stddev_ns", "avg_ns", "stddev_ns",
@ -753,7 +831,7 @@ struct test {
} }
if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" || if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" ||
field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" || field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
field == "use_mmap" || field == "embeddings") { field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
return BOOL; return BOOL;
} }
if (field == "avg_ts" || field == "stddev_ts") { if (field == "avg_ts" || field == "stddev_ts") {
@ -787,7 +865,7 @@ struct test {
std::to_string(n_batch), std::to_string(n_ubatch), std::to_string(n_batch), std::to_string(n_ubatch),
std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
std::to_string(n_gpu_layers), split_mode_str(split_mode), std::to_string(n_gpu_layers), split_mode_str(split_mode),
std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings), tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
std::to_string(n_prompt), std::to_string(n_gen), test_time, std::to_string(n_prompt), std::to_string(n_gen), test_time,
std::to_string(avg_ns()), std::to_string(stdev_ns()), std::to_string(avg_ns()), std::to_string(stdev_ns()),
@ -933,6 +1011,9 @@ struct markdown_printer : public printer {
if (field == "n_gpu_layers") { if (field == "n_gpu_layers") {
return 3; return 3;
} }
if (field == "test") {
return 13;
}
int width = std::max((int)field.length(), 10); int width = std::max((int)field.length(), 10);
@ -955,6 +1036,9 @@ struct markdown_printer : public printer {
if (field == "no_kv_offload") { if (field == "no_kv_offload") {
return "nkvo"; return "nkvo";
} }
if (field == "flash_attn") {
return "fa";
}
if (field == "use_mmap") { if (field == "use_mmap") {
return "mmap"; return "mmap";
} }
@ -1001,6 +1085,9 @@ struct markdown_printer : public printer {
if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) { if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) {
fields.emplace_back("no_kv_offload"); fields.emplace_back("no_kv_offload");
} }
if (params.flash_attn.size() > 1 || params.flash_attn != cmd_params_defaults.flash_attn) {
fields.emplace_back("flash_attn");
}
if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) { if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
fields.emplace_back("tensor_split"); fields.emplace_back("tensor_split");
} }
@ -1057,8 +1144,7 @@ struct markdown_printer : public printer {
} else if (t.n_gen > 0 && t.n_prompt == 0) { } else if (t.n_gen > 0 && t.n_prompt == 0) {
snprintf(buf, sizeof(buf), "tg%d", t.n_gen); snprintf(buf, sizeof(buf), "tg%d", t.n_gen);
} else { } else {
assert(false); snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen);
exit(1);
} }
value = buf; value = buf;
} else if (field == "t/s") { } else if (field == "t/s") {
@ -1191,6 +1277,7 @@ int main(int argc, char ** argv) {
llama_log_set(llama_null_log_callback, NULL); llama_log_set(llama_null_log_callback, NULL);
} }
llama_backend_init(); llama_backend_init();
llama_numa_init(params.numa);
// initialize printer // initialize printer
std::unique_ptr<printer> p; std::unique_ptr<printer> p;
@ -1258,6 +1345,7 @@ int main(int argc, char ** argv) {
llama_kv_cache_clear(ctx); llama_kv_cache_clear(ctx);
uint64_t t_start = get_time_ns(); uint64_t t_start = get_time_ns();
if (t.n_prompt > 0) { if (t.n_prompt > 0) {
test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads); test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
} }

View file

@ -12,15 +12,20 @@ cmake_minimum_required(VERSION 3.22.1)
# build script scope). # build script scope).
project("llama-android") project("llama-android")
include(FetchContent) ## Fetch latest llama.cpp from GitHub
FetchContent_Declare( #include(FetchContent)
llama #FetchContent_Declare(
GIT_REPOSITORY https://github.com/ggerganov/llama.cpp # llama
GIT_TAG master # GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
) # GIT_TAG master
#)
#
## Also provides "common"
#FetchContent_MakeAvailable(llama)
# Also provides "common" # llama.cpp CI uses the code from the current branch
FetchContent_MakeAvailable(llama) # ref: https://github.com/ggerganov/llama.cpp/pull/7341#issuecomment-2117617700
add_subdirectory(../../../../../../ build-llama)
# Creates and names a library, sets it as either STATIC # Creates and names a library, sets it as either STATIC
# or SHARED, and provides the relative paths to its source code. # or SHARED, and provides the relative paths to its source code.

View file

@ -56,7 +56,7 @@ python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-pa
python ./convert.py ../llava-v1.5-7b --skip-unknown python ./convert.py ../llava-v1.5-7b --skip-unknown
``` ```
Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` directory. Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory.
## LLaVA 1.6 gguf conversion ## LLaVA 1.6 gguf conversion
1) First clone a LLaVA 1.6 model: 1) First clone a LLaVA 1.6 model:

View file

@ -573,13 +573,13 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
struct ggml_tensor * embeddings = inp; struct ggml_tensor * embeddings = inp;
if (ctx->has_class_embedding) { if (ctx->has_class_embedding) {
embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
ggml_set_name(embeddings, "embeddings");
ggml_set_input(embeddings);
embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
embeddings = ggml_acc(ctx0, embeddings, inp, embeddings = ggml_acc(ctx0, embeddings, inp,
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]); embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
} }
ggml_set_name(embeddings, "embeddings");
ggml_set_input(embeddings);
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions); struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
@ -1846,7 +1846,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
const int image_size = hparams.image_size; const int image_size = hparams.image_size;
const int patch_size = hparams.patch_size; const int patch_size = hparams.patch_size;
const int num_patches = ((image_size / patch_size) * (image_size / patch_size)); const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
const int num_positions = num_patches + 1; const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
{ {
struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw"); struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
@ -1874,6 +1874,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
} }
{ {
if (ctx->has_class_embedding) {
struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings"); struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
void* zero_mem = malloc(ggml_nbytes(embeddings)); void* zero_mem = malloc(ggml_nbytes(embeddings));
@ -1881,6 +1882,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings)); ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
free(zero_mem); free(zero_mem);
} }
}
{ {
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");

View file

@ -113,11 +113,11 @@ struct llava_context {
}; };
static void show_additional_info(int /*argc*/, char ** argv) { static void show_additional_info(int /*argc*/, char ** argv) {
LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n"); LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
} }
static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params) { static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) {
// load and preprocess the image // load and preprocess the image
llava_image_embed * embed = NULL; llava_image_embed * embed = NULL;
@ -133,9 +133,9 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
} }
params->prompt = remove_image_from_prompt(prompt); params->prompt = remove_image_from_prompt(prompt);
} else { } else {
embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, params->image.c_str()); embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, fname.c_str());
if (!embed) { if (!embed) {
LOG_TEE("%s: is %s really an image file?\n", __func__, params->image.c_str()); fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
return NULL; return NULL;
} }
} }
@ -189,6 +189,11 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
LOG_TEE("\n"); LOG_TEE("\n");
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams); struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
if (!ctx_sampling) {
fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
exit(1);
}
std::string response = ""; std::string response = "";
for (int i = 0; i < max_tgt_len; i++) { for (int i = 0; i < max_tgt_len; i++) {
const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past); const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
@ -207,17 +212,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
printf("\n"); printf("\n");
} }
static struct llama_model * llava_init(gpt_params * params) {
static struct llava_context * llava_init(gpt_params * params) {
const char * clip_path = params->mmproj.c_str();
auto prompt = params->prompt;
if (prompt.empty()) {
prompt = "describe the image in detail.";
}
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
llama_backend_init(); llama_backend_init();
llama_numa_init(params->numa); llama_numa_init(params->numa);
@ -228,6 +223,19 @@ static struct llava_context * llava_init(gpt_params * params) {
LOG_TEE("%s: error: unable to load model\n" , __func__); LOG_TEE("%s: error: unable to load model\n" , __func__);
return NULL; return NULL;
} }
return model;
}
static struct llava_context * llava_init_context(gpt_params * params, llama_model * model) {
const char * clip_path = params->mmproj.c_str();
auto prompt = params->prompt;
if (prompt.empty()) {
prompt = "describe the image in detail.";
}
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
llama_context_params ctx_params = llama_context_params_from_gpt_params(*params); llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
@ -282,19 +290,35 @@ int main(int argc, char ** argv) {
#endif // LOG_DISABLE_LOGS #endif // LOG_DISABLE_LOGS
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) { if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
gpt_print_usage(argc, argv, params); gpt_params_print_usage(argc, argv, params);
show_additional_info(argc, argv); show_additional_info(argc, argv);
return 1; return 1;
} }
auto model = llava_init(&params);
auto ctx_llava = llava_init(&params); if (model == NULL) {
if (ctx_llava == NULL) { fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
LOG_TEE("%s: error: failed to init llava\n", __func__);
return 1; return 1;
} }
auto image_embed = load_image(ctx_llava, &params); if (prompt_contains_image(params.prompt)) {
auto ctx_llava = llava_init_context(&params, model);
auto image_embed = load_image(ctx_llava, &params, "");
// process the prompt
process_prompt(ctx_llava, image_embed, &params, params.prompt);
llama_print_timings(ctx_llava->ctx_llama);
llava_image_embed_free(image_embed);
ctx_llava->model = NULL;
llava_free(ctx_llava);
} else {
for (auto & image : params.image) {
auto ctx_llava = llava_init_context(&params, model);
auto image_embed = load_image(ctx_llava, &params, image);
if (!image_embed) { if (!image_embed) {
std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
return 1; return 1;
} }
@ -302,8 +326,13 @@ int main(int argc, char ** argv) {
process_prompt(ctx_llava, image_embed, &params, params.prompt); process_prompt(ctx_llava, image_embed, &params, params.prompt);
llama_print_timings(ctx_llava->ctx_llama); llama_print_timings(ctx_llava->ctx_llama);
llava_image_embed_free(image_embed); llava_image_embed_free(image_embed);
ctx_llava->model = NULL;
llava_free(ctx_llava); llava_free(ctx_llava);
}
}
llama_free_model(model);
return 0; return 0;
} }

View file

@ -88,7 +88,6 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<
// Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out) // Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) { static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
struct { struct {
struct ggml_tensor * newline;
struct ggml_context * ctx; struct ggml_context * ctx;
} model; } model;
@ -150,20 +149,6 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
model.ctx = ggml_init(params); model.ctx = ggml_init(params);
ggml_tensor * newline_tmp = clip_get_newline_tensor(ctx_clip);
model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]);
if (newline_tmp->backend != GGML_BACKEND_TYPE_CPU) {
if (newline_tmp->buffer == NULL) {
LOG_TEE("newline_tmp tensor buffer is NULL\n");
}
ggml_backend_tensor_get(newline_tmp, model.newline->data, 0, ggml_nbytes(newline_tmp));
} else {
model.newline->data = newline_tmp->data;
if (model.newline->data == NULL) {
LOG_TEE("newline_tmp tensor data is NULL\n");
}
}
struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4 struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
// ggml_tensor_printf(image_features,"image_features",__LINE__,false,false); // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
// fill it with the image embeddings, ignoring the base // fill it with the image embeddings, ignoring the base

View file

@ -174,7 +174,7 @@ int main(int argc, char ** argv) {
// debug // debug
if (dump_kv_cache) { if (dump_kv_cache) {
llama_kv_cache_view_update(ctx, &kvc_view); llama_kv_cache_view_update(ctx, &kvc_view);
dump_kv_cache_view_seqs(kvc_view, 40); llama_kv_cache_dump_view_seqs(kvc_view, 40);
} }
// build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/ // build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/

View file

@ -121,7 +121,7 @@ int main(int argc, char ** argv){
// debug // debug
if (dump_kv_cache) { if (dump_kv_cache) {
llama_kv_cache_view_update(ctx, &kvc_view); llama_kv_cache_view_update(ctx, &kvc_view);
dump_kv_cache_view_seqs(kvc_view, 40); llama_kv_cache_dump_view_seqs(kvc_view, 40);
} }
// print current draft sequence // print current draft sequence

View file

@ -17,11 +17,9 @@ In this case, CLBlast was already installed so the CMake package is referenced i
```cmd ```cmd
git clone https://github.com/ggerganov/llama.cpp git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp cd llama.cpp
mkdir build cmake -B build -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=C:/CLBlast/lib/cmake/CLBlast -G "Visual Studio 17 2022" -A x64
cd build cmake --build build --config Release
cmake .. -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=C:/CLBlast/lib/cmake/CLBlast -G "Visual Studio 17 2022" -A x64 cmake --install build --prefix C:/LlamaCPP
cmake --build . --config Release
cmake --install . --prefix C:/LlamaCPP
``` ```
### Build main-cmake-pkg ### Build main-cmake-pkg
@ -29,9 +27,7 @@ cmake --install . --prefix C:/LlamaCPP
```cmd ```cmd
cd ..\examples\main-cmake-pkg cd ..\examples\main-cmake-pkg
mkdir build cmake -B build -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/CLBlast/lib/cmake/CLBlast;C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64
cd build cmake --build build --config Release
cmake .. -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/CLBlast/lib/cmake/CLBlast;C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64 cmake --install build --prefix C:/MyLlamaApp
cmake --build . --config Release
cmake --install . --prefix C:/MyLlamaApp
``` ```

View file

@ -66,7 +66,7 @@ main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 --random-prompt
In this section, we cover the most commonly used options for running the `main` program with the LLaMA models: In this section, we cover the most commonly used options for running the `main` program with the LLaMA models:
- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`). - `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`; inferred from `--model-url` if set).
- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf). - `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf).
- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses. - `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
- `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models. - `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models.
@ -143,7 +143,7 @@ The `--ctx-size` option allows you to set the size of the prompt context used by
### Extended Context Size ### Extended Context Size
Some fine-tuned models have extended the context length by scaling RoPE. For example, if the original pre-trained model have a context length (max sequence length) of 4096 (4k) and the fine-tuned model have 32k. That is a scaling factor of 8, and should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8. Some fine-tuned models have extended the context length by scaling RoPE. For example, if the original pre-trained model has a context length (max sequence length) of 4096 (4k) and the fine-tuned model has 32k. That is a scaling factor of 8, and should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8.
- `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model. - `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model.
@ -286,7 +286,7 @@ These options help improve the performance and memory usage of the LLaMA models.
- `--numa distribute`: Pin an equal proportion of the threads to the cores on each NUMA node. This will spread the load amongst all cores on the system, utilitizing all memory channels at the expense of potentially requiring memory to travel over the slow links between nodes. - `--numa distribute`: Pin an equal proportion of the threads to the cores on each NUMA node. This will spread the load amongst all cores on the system, utilitizing all memory channels at the expense of potentially requiring memory to travel over the slow links between nodes.
- `--numa isolate`: Pin all threads to the NUMA node that the program starts on. This limits the number of cores and amount of memory that can be used, but guarantees all memory access remains local to the NUMA node. - `--numa isolate`: Pin all threads to the NUMA node that the program starts on. This limits the number of cores and amount of memory that can be used, but guarantees all memory access remains local to the NUMA node.
- `--numa numactl`: Pin threads to the CPUMAP that is passed to the program by starting it with the numactl utility. This is the most flexible mode, and allow arbitraty core usage patterns, for example a map that uses all the cores on one NUMA nodes, and just enough cores on a second node to saturate the inter-node memory bus. - `--numa numactl`: Pin threads to the CPUMAP that is passed to the program by starting it with the numactl utility. This is the most flexible mode, and allow arbitrary core usage patterns, for example a map that uses all the cores on one NUMA nodes, and just enough cores on a second node to saturate the inter-node memory bus.
These flags attempt optimizations that help on some systems with non-uniform memory access. This currently consists of one of the above strategies, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop_caches' as root. These flags attempt optimizations that help on some systems with non-uniform memory access. This currently consists of one of the above strategies, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop_caches' as root.
@ -325,3 +325,5 @@ These options provide extra functionality and customization when running the LLa
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance.
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains. - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation. - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
- `-hfr URL --hf-repo URL`: The url to the Hugging Face model repository. Used in conjunction with `--hf-file` or `-hff`. The model is downloaded and stored in the file provided by `-m` or `--model`. If `-m` is not provided, the model is auto-stored in the path specified by the `LLAMA_CACHE` environment variable or in an OS-specific local cache.

View file

@ -60,9 +60,9 @@ static void write_logfile(
return; return;
} }
const std::string timestamp = get_sortable_timestamp(); const std::string timestamp = string_get_sortable_timestamp();
const bool success = create_directory_with_parents(params.logdir); const bool success = fs_create_directory_with_parents(params.logdir);
if (!success) { if (!success) {
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
__func__, params.logdir.c_str()); __func__, params.logdir.c_str());
@ -80,7 +80,7 @@ static void write_logfile(
fprintf(logfile, "binary: main\n"); fprintf(logfile, "binary: main\n");
char model_desc[128]; char model_desc[128];
llama_model_desc(model, model_desc, sizeof(model_desc)); llama_model_desc(model, model_desc, sizeof(model_desc));
dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc); yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
fprintf(logfile, "\n"); fprintf(logfile, "\n");
fprintf(logfile, "######################\n"); fprintf(logfile, "######################\n");
@ -88,8 +88,8 @@ static void write_logfile(
fprintf(logfile, "######################\n"); fprintf(logfile, "######################\n");
fprintf(logfile, "\n"); fprintf(logfile, "\n");
dump_string_yaml_multiline(logfile, "output", output.c_str()); yaml_dump_string_multiline(logfile, "output", output.c_str());
dump_vector_int_yaml(logfile, "output_tokens", output_tokens); yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
llama_dump_timing_info_yaml(logfile, ctx); llama_dump_timing_info_yaml(logfile, ctx);
fclose(logfile); fclose(logfile);
@ -181,7 +181,7 @@ int main(int argc, char ** argv) {
std::mt19937 rng(params.seed); std::mt19937 rng(params.seed);
if (params.random_prompt) { if (params.random_prompt) {
params.prompt = gpt_random_prompt(rng); params.prompt = string_random_prompt(rng);
} }
LOG("%s: llama backend init\n", __func__); LOG("%s: llama backend init\n", __func__);
@ -219,7 +219,7 @@ int main(int argc, char ** argv) {
// print system information // print system information
{ {
LOG_TEE("\n"); LOG_TEE("\n");
LOG_TEE("%s\n", get_system_info(params).c_str()); LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
} }
std::string path_session = params.path_prompt_cache; std::string path_session = params.path_prompt_cache;
@ -324,7 +324,7 @@ int main(int argc, char ** argv) {
log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size(), embd_inp.size()); log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size(), embd_inp.size());
// if we will use the cache for the full prompt without reaching the end of the cache, force // if we will use the cache for the full prompt without reaching the end of the cache, force
// reevaluation of the last token token to recalculate the cached logits // reevaluation of the last token to recalculate the cached logits
if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) { if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) {
LOGLN("recalculate the cached logits (do): session_tokens.resize( %zu )", embd_inp.size() - 1); LOGLN("recalculate the cached logits (do): session_tokens.resize( %zu )", embd_inp.size() - 1);
@ -362,6 +362,9 @@ int main(int argc, char ** argv) {
params.interactive_first = true; params.interactive_first = true;
params.antiprompt.emplace_back("<|im_start|>user\n"); params.antiprompt.emplace_back("<|im_start|>user\n");
} }
else if (params.conversation) {
params.interactive_first = true;
}
// enable interactive mode if interactive start is specified // enable interactive mode if interactive start is specified
if (params.interactive_first) { if (params.interactive_first) {
@ -473,10 +476,10 @@ int main(int argc, char ** argv) {
if (params.interactive) { if (params.interactive) {
const char * control_message; const char * control_message;
if (params.multiline_input) { if (params.multiline_input) {
control_message = " - To return control to LLaMa, end your input with '\\'.\n" control_message = " - To return control to the AI, end your input with '\\'.\n"
" - To return control without starting a new line, end your input with '/'.\n"; " - To return control without starting a new line, end your input with '/'.\n";
} else { } else {
control_message = " - Press Return to return control to LLaMa.\n" control_message = " - Press Return to return control to the AI.\n"
" - To return control without starting a new line, end your input with '/'.\n" " - To return control without starting a new line, end your input with '/'.\n"
" - If you want to submit another line, end your input with '\\'.\n"; " - If you want to submit another line, end your input with '\\'.\n";
} }
@ -520,6 +523,10 @@ int main(int argc, char ** argv) {
} }
struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams); struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
if (!ctx_sampling) {
fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
exit(1);
}
while ((n_remain != 0 && !is_antiprompt) || params.interactive) { while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
// predict // predict
@ -544,7 +551,7 @@ int main(int argc, char ** argv) {
// if we run out of context: // if we run out of context:
// - take the n_keep first tokens from the original prompt (via n_past) // - take the n_keep first tokens from the original prompt (via n_past)
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) { if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) >= n_ctx) {
if (params.n_predict == -2) { if (params.n_predict == -2) {
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
break; break;
@ -700,7 +707,7 @@ int main(int argc, char ** argv) {
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance); const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
llama_sampling_accept(ctx_sampling, ctx, id, true); llama_sampling_accept(ctx_sampling, ctx, id, /* apply_grammar= */ true);
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str()); LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
@ -721,7 +728,7 @@ int main(int argc, char ** argv) {
// push the prompt in the sampling context in order to apply repetition penalties later // push the prompt in the sampling context in order to apply repetition penalties later
// for the prompt, we don't apply grammar rules // for the prompt, we don't apply grammar rules
llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false); llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], /* apply_grammar= */ false);
++n_consumed; ++n_consumed;
if ((int) embd.size() >= params.n_batch) { if ((int) embd.size() >= params.n_batch) {
@ -733,7 +740,7 @@ int main(int argc, char ** argv) {
// display text // display text
if (input_echo && display) { if (input_echo && display) {
for (auto id : embd) { for (auto id : embd) {
const std::string token_str = llama_token_to_piece(ctx, id); const std::string token_str = llama_token_to_piece(ctx, id, !params.conversation);
printf("%s", token_str.c_str()); printf("%s", token_str.c_str());
if (embd.size() > 1) { if (embd.size() > 1) {
@ -796,7 +803,7 @@ int main(int argc, char ** argv) {
// deal with end of generation tokens in interactive mode // deal with end of generation tokens in interactive mode
if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) { if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
LOG("found EOS token\n"); LOG("found an EOG token\n");
if (params.interactive) { if (params.interactive) {
if (!params.antiprompt.empty()) { if (!params.antiprompt.empty()) {
@ -816,7 +823,7 @@ int main(int argc, char ** argv) {
if (n_past > 0 && is_interacting) { if (n_past > 0 && is_interacting) {
LOG("waiting for user input\n"); LOG("waiting for user input\n");
if (params.instruct || params.chatml) { if (params.conversation || params.instruct || params.chatml) {
printf("\n> "); printf("\n> ");
} }
@ -826,7 +833,7 @@ int main(int argc, char ** argv) {
} }
std::string buffer; std::string buffer;
if (!params.input_prefix.empty()) { if (!params.input_prefix.empty() && !params.conversation) {
LOG("appending input prefix: '%s'\n", params.input_prefix.c_str()); LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
printf("%s", params.input_prefix.c_str()); printf("%s", params.input_prefix.c_str());
} }
@ -850,7 +857,7 @@ int main(int argc, char ** argv) {
// Entering a empty line lets the user pass control back // Entering a empty line lets the user pass control back
if (buffer.length() > 1) { if (buffer.length() > 1) {
// append input suffix if any // append input suffix if any
if (!params.input_suffix.empty()) { if (!params.input_suffix.empty() && !params.conversation) {
LOG("appending input suffix: '%s'\n", params.input_suffix.c_str()); LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
printf("%s", params.input_suffix.c_str()); printf("%s", params.input_suffix.c_str());
} }
@ -872,11 +879,11 @@ int main(int argc, char ** argv) {
embd_inp.insert(embd_inp.end(), cml_pfx.begin(), cml_pfx.end()); embd_inp.insert(embd_inp.end(), cml_pfx.begin(), cml_pfx.end());
} }
if (params.escape) { if (params.escape) {
process_escapes(buffer); string_process_escapes(buffer);
} }
const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true); const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
const auto line_inp = ::llama_tokenize(ctx, buffer, false, false); const auto line_inp = ::llama_tokenize(ctx, buffer, false, params.interactive_specials);
const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true); const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str()); LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());

View file

@ -210,7 +210,7 @@ int main(int argc, char ** argv) {
while (true) { while (true) {
if (dump_kv_cache) { if (dump_kv_cache) {
llama_kv_cache_view_update(ctx, &kvc_view); llama_kv_cache_view_update(ctx, &kvc_view);
dump_kv_cache_view_seqs(kvc_view, 40); llama_kv_cache_dump_view_seqs(kvc_view, 40);
} }
llama_batch_clear(batch); llama_batch_clear(batch);

View file

@ -1,8 +1,179 @@
# perplexity # Perplexity
TODO The `perplexity` example can be used to calculate the so-called perplexity value of a language model over a given text corpus.
Perplexity measures how well the model can predict the next token with lower values being better.
Note that perplexity is **not** directly comparable between models, especially if they use different tokenizers.
Also note that finetunes typically result in a higher perplexity value even though the human-rated quality of outputs increases.
Within llama.cpp the perplexity of base models is used primarily to judge the quality loss from e.g. quantized models vs. FP16.
The convention among contributors is to use the Wikitext-2 test set for testing unless noted otherwise (can be obtained with `scripts/get-wikitext-2.sh`).
When numbers are listed all command line arguments and compilation options are left at their defaults unless noted otherwise.
llama.cpp numbers are **not** directly comparable to those of other projects because the exact values depend strongly on the implementation details.
By default only the mean perplexity value and the corresponding uncertainty is calculated.
The uncertainty is determined empirically by assuming a Gaussian distribution of the "correct" logits per and then applying error propagation.
More statistics can be obtained by recording the logits from the FP16 version of a model.
To do this, supply `perplexity` with `--kl-divergence-base path/to/logit/binary/file.kld`.
The program will then record all logits and save them to the provided path in binary format.
**The logit file will be very large, 11 GiB for LLaMA 2 or 37 GiB for LLaMA 3 when using the Wikitext-2 test set.**
Once you have the file, supply `perplexity` with the quantized model, the logits file via `--kl-divergence-base`,
and finally the `--kl-divergence` argument to indicate that the program should calculate the so-called Kullback-Leibler divergence.
This is a measure of how similar the FP16 and the quantized logit distributions are with a value of 0 indicating that the distribution are the same.
The uncertainty on the mean KL divergence is calculated by assuming the KL divergence per token follows a Gaussian distribution.
In addition to the KL divergence the following statistics are calculated with `--kl-divergence`:
* Ratio of mean FP16 PPL and quantized PPL. Uncertainty is estimated on logits, then propagated. The logarithm of this metric is also calculated and printed, it is 0 if the logit distributions are the same.
* Difference of mean FP16 PPL and quantized PPL. Uncertainty is estimated on logits, then propagated.
* Mean change in "correct" token probability. Positive values mean the model gets better at prediction, negative values mean it gets worse.
* Pearson correlation coefficient of the "correct" token probabilites between models.
* Percentiles of change in "correct" token probability. Positive values mean the model gets better at prediction, negative values mean it gets worse. Can be used to judge noise vs. quality loss from quantization. If the percentiles are symmetric then the quantization is essentially just adding noise. If the negative values are significantly larger than the positive values then this indicates that the model is actually becoming worse from the quantization.
* The root mean square of the change in token probabilities. If you were to assume that the quantization simply causes Gaussian noise on the token probabilities then this would be the standard deviation of said noise. The uncertainty on the value is calculated that the change in token probabilities follows a Gaussian distribution. Related discussion: https://github.com/ggerganov/llama.cpp/discussions/2875 .
* Same top p: Percentage of how often the token was assigned the highest probabilites by both models. The uncertainty is calculated from the Gaussian approximation of the binomial distribution.
## LLaMA 3 8b Scoreboard
| Revision | f364eb6f |
|:---------|:-------------------|
| Backend | CUDA |
| CPU | AMD Epyc 7742 |
| GPU | 1x NVIDIA RTX 4090 |
Results were generated using the CUDA backend and are sorted by Kullback-Leibler divergence relative to FP16.
The "WT" importance matrices were created using varying numbers of Wikitext tokens and can be found [here](https://huggingface.co/JohannesGaessler/llama.cpp_importance_matrices/blob/main/imatrix-llama_3-8b-f16-2.7m_tokens.dat).
Note: the FP16 logits used for the calculation of all metrics other than perplexity are stored in a binary file between runs.
In order to save space this file does **not** contain the exact same FP32 logits but instead casts them to 16 bit unsigned integers (with some scaling).
So the "f16" results are to be understood as the difference resulting only from this downcast.
| Quantization | imatrix | Model size [GiB] | PPL | ΔPPL | KLD | Mean Δp | RMS Δp |
|--------------|---------|------------------|------------------------|------------------------|-----------------------|-------------------|------------------|
| f16 | None | 14.97 | 6.233160 ± 0.037828 | 0.001524 ± 0.000755 | 0.000551 ± 0.000002 | 0.001 ± 0.002 % | 0.787 ± 0.004 % |
| q8_0 | None | 7.96 | 6.234284 ± 0.037878 | 0.002650 ± 0.001006 | 0.001355 ± 0.000006 | -0.019 ± 0.003 % | 1.198 ± 0.007 % |
| q6_K | None | 6.14 | 6.253382 ± 0.038078 | 0.021748 ± 0.001852 | 0.005452 ± 0.000035 | -0.007 ± 0.006 % | 2.295 ± 0.019 % |
| q5_K_M | None | 5.33 | 6.288607 ± 0.038338 | 0.056974 ± 0.002598 | 0.010762 ± 0.000079 | -0.114 ± 0.008 % | 3.160 ± 0.031 % |
| q5_K_S | None | 5.21 | 6.336598 ± 0.038755 | 0.104964 ± 0.003331 | 0.016595 ± 0.000122 | -0.223 ± 0.010 % | 3.918 ± 0.036 % |
| q5_1 | None | 5.65 | 6.337857 ± 0.038677 | 0.106223 ± 0.003476 | 0.018045 ± 0.000139 | -0.287 ± 0.011 % | 4.123 ± 0.039 % |
| q5_0 | None | 5.21 | 6.363224 ± 0.038861 | 0.131591 ± 0.003894 | 0.022239 ± 0.000166 | -0.416 ± 0.012 % | 4.634 ± 0.043 % |
| q4_K_M | WT 10m | 4.58 | 6.382937 ± 0.039055 | 0.151303 ± 0.004429 | 0.028152 ± 0.000240 | -0.389 ± 0.014 % | 5.251 ± 0.049 % |
| q4_K_M | None | 4.58 | 6.407115 ± 0.039119 | 0.175482 ± 0.004620 | 0.031273 ± 0.000238 | -0.596 ± 0.014 % | 5.519 ± 0.050 % |
| q4_K_S | WT 10m | 4.37 | 6.409697 ± 0.039189 | 0.178064 ± 0.004744 | 0.031951 ± 0.000259 | -0.531 ± 0.015 % | 5.645 ± 0.051 % |
| iq4_NL | WT 10m | 4.35 | 6.455593 ± 0.039630 | 0.223959 ± 0.005201 | 0.035742 ± 0.000288 | -0.590 ± 0.016 % | 5.998 ± 0.054 % |
| iq4_XS | WT 10m | 4.14 | 6.459705 ± 0.039595 | 0.228071 ± 0.005207 | 0.036334 ± 0.000284 | -0.668 ± 0.016 % | 6.044 ± 0.054 % |
| q4_K_S | None | 4.37 | 6.500529 ± 0.039778 | 0.268895 ± 0.005638 | 0.043136 ± 0.000314 | -0.927 ± 0.017 % | 6.562 ± 0.055 % |
| q4_1 | None | 4.78 | 6.682737 ± 0.041285 | 0.451103 ± 0.008030 | 0.071683 ± 0.000505 | -0.927 ± 0.017 % | 8.512 ± 0.063 % |
| q4_0 | None | 4.34 | 6.700147 ± 0.041226 | 0.468514 ± 0.007951 | 0.071940 ± 0.000491 | -1.588 ± 0.022 % | 8.434 ± 0.061 % |
| q3_K_L | WT 10m | 4.03 | 6.671223 ± 0.041427 | 0.439590 ± 0.008154 | 0.073077 ± 0.000529 | -0.940 ± 0.023 % | 8.662 ± 0.064 % |
| q3_K_M | WT 10m | 3.74 | 6.734255 ± 0.041838 | 0.502622 ± 0.008901 | 0.084358 ± 0.000588 | -1.198 ± 0.024 % | 9.292 ± 0.065 % |
| q3_K_L | None | 4.03 | 6.787876 ± 0.042104 | 0.556242 ± 0.009171 | 0.087176 ± 0.000614 | -1.532 ± 0.025 % | 9.432 ± 0.067 % |
| q3_K_M | None | 3.74 | 6.888498 ± 0.042669 | 0.656864 ± 0.010071 | 0.101913 ± 0.000677 | -1.990 ± 0.026 % | 10.203 ± 0.068 % |
| iq3_M | WT 10m | 3.53 | 6.898327 ± 0.041643 | 0.666694 ± 0.009449 | 0.102534 ± 0.000663 | -3.178 ± 0.026 % | 10.513 ± 0.066 % |
| iq3_S | WT 10m | 3.42 | 6.965501 ± 0.042406 | 0.733867 ± 0.010245 | 0.111278 ± 0.000710 | -3.066 ± 0.027 % | 10.845 ± 0.068 % |
| iq3_XS | WT 10m | 3.28 | 7.163043 ± 0.043772 | 0.931409 ± 0.012084 | 0.138693 ± 0.000857 | -3.667 ± 0.031 % | 12.148 ± 0.070 % |
| iq3_XXS | WT 10m | 3.05 | 7.458436 ± 0.046404 | 1.226803 ± 0.015234 | 0.183625 ± 0.001042 | -3.918 ± 0.035 % | 13.836 ± 0.074 % |
| q3_K_S | WT 10m | 3.41 | 7.602878 ± 0.046848 | 1.371244 ± 0.015688 | 0.199821 ± 0.001008 | -5.046 ± 0.037 % | 14.980 ± 0.070 % |
| q3_K_S | None | 3.41 | 7.863786 ± 0.048885 | 1.632152 ± 0.017733 | 0.228217 ± 0.001079 | -5.604 ± 0.038 % | 15.541 ± 0.070 % |
| iq2_M | WT 10m | 2.74 | 8.600799 ± 0.055124 | 2.369166 ± 0.025244 | 0.325989 ± 0.00160 | -6.463 ± 0.046 % | 18.519 ± 0.080 % |
| q2_K | WT 10k | 2.96 | 8.652290 ± 0.055572 | 2.420657 ± 0.025587 | 0.331393 ± 0.001562 | -6.606 ± 0.046 % | 18.790 ± 0.078 % |
| q2_K | WT 100k | 2.96 | 8.641993 ± 0.055406 | 2.410359 ± 0.025495 | 0.331672 ± 0.001569 | -6.628 ± 0.047 % | 18.856 ± 0.078 % |
| q2_K | WT 10m | 2.96 | 8.647825 ± 0.055610 | 2.416191 ± 0.025683 | 0.332223 ± 0.001572 | -6.500 ± 0.047 % | 18.881 ± 0.078 % |
| q2_K | WT 1m | 2.96 | 8.674365 ± 0.055743 | 2.442732 ± 0.025843 | 0.335308 ± 0.001576 | -6.634 ± 0.047 % | 19.009 ± 0.079 % |
| q2_K | WT 1k | 2.96 | 8.682605 ± 0.055916 | 2.450972 ± 0.026069 | 0.337093 ± 0.001596 | -6.596 ± 0.047 % | 18.977 ± 0.079 % |
| q2_K_S | WT 10m | 2.96 | 9.323778 ± 0.061551 | 3.092145 ± 0.031914 | 0.403360 ± 0.001787 | -7.131 ± 0.049 % | 20.050 ± 0.081 % |
| q2_K_S | WT 1m | 2.96 | 9.329321 ± 0.061378 | 3.097688 ± 0.031816 | 0.403590 ± 0.001797 | -7.289 ± 0.049 % | 20.123 ± 0.081 % |
| q2_K_S | WT 100k | 2.96 | 9.362973 ± 0.061740 | 3.131339 ± 0.032169 | 0.408367 ± 0.001802 | -7.198 ± 0.050 % | 20.132 ± 0.081 % |
| q2_K_S | WT 10k | 2.96 | 9.376479 ± 0.062045 | 3.144846 ± 0.032464 | 0.408662 ± 0.001819 | -7.141 ± 0.050 % | 20.120 ± 0.081 % |
| q2_K_S | WT 1k | 2.96 | 9.415200 ± 0.062475 | 3.183567 ± 0.032993 | 0.415865 ± 0.001846 | -7.153 ± 0.050 % | 20.311 ± 0.082 % |
| iq2_S | WT 10m | 2.56 | 9.650781 ± 0.063209 | 3.419148 ± 0.034017 | 0.439197 ± 0.001976 | -8.319 ± 0.052 % | 21.491 ± 0.083 % |
| q2_K | None | 2.96 | 9.751568 ± 0.063312 | 3.519934 ± 0.033863 | 0.445132 ± 0.001835 | -9.123 ± 0.051 % | 21.421 ± 0.079 % |
| iq2_XS | WT 10m | 2.43 | 10.761424 ± 0.071056 | 4.529791 ± 0.042229 | 0.546290 ± 0.002133 | -10.576 ± 0.056 % | 23.872 ± 0.082 % |
| iq2_XXS | WT 10m | 2.24 | 14.091782 ± 0.098396 | 7.860148 ± 0.070752 | 0.812022 ± 0.002741 | -14.363 ± 0.065 % | 28.576 ± 0.084 % |
| iq1_M | WT 10m | 2.01 | 25.493722 ± 0.177903 | 19.262089 ± 0.152396 | 1.393084 ± 0.003529 | -24.672 ± 0.077 % | 38.287 ± 0.084 % |
| iq1_S | WT 1m | 1.88 | 58.097760 ± 0.438604 | 51.866126 ± 0.416604 | 2.211278 ± 0.004688 | -32.471 ± 0.087 % | 46.418 ± 0.085 % |
| iq1_S | WT 1k | 1.88 | 58.267851 ± 0.446208 | 52.036218 ± 0.424373 | 2.214858 ± 0.004778 | -31.880 ± 0.089 % | 46.330 ± 0.086 % |
| iq1_S | WT 100k | 1.88 | 58.581498 ± 0.453145 | 52.349864 ± 0.431360 | 2.220834 ± 0.004818 | -32.261 ± 0.089 % | 46.002 ± 0.086 % |
| iq1_S | WT 10m | 1.88 | 60.694593 ± 0.471290 | 54.462959 ± 0.449644 | 2.254554 ± 0.004868 | -31.973 ± 0.088 % | 46.271 ± 0.086 % |
| iq1_S | WT 10k | 1.88 | 63.221324 ± 0.493077 | 56.989691 ± 0.471423 | 2.293527 ± 0.004885 | -32.261 ± 0.089 % | 46.562 ± 0.086 % |
There seems to be no consistent improvement from using more Wikitext tokens for the importance matrix.
K-quants score better on mean Δp than the legacy quants than e.g. KL divergence would suggest.
## LLaMA 2 vs. LLaMA 3 Quantization comparison
| Revision | f364eb6f |
|:---------|:-------------------|
| Backend | CUDA |
| CPU | AMD Epyc 7742 |
| GPU | 1x NVIDIA RTX 4090 |
| Metric | L2 7b q2_K | L3 8b q2_K | L2 7b q4_K_M | L3 8b q4_K_M | L2 7b q6_K | L3 8b q6_K | L2 7b q8_0 | L3 8b q8_0 |
|-----------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|
| Mean PPL | 5.794552 ± 0.032298 | 9.751568 ± 0.063312 | 5.877078 ± 0.032781 | 6.407115 ± 0.039119 | 5.808494 ± 0.032425 | 6.253382 ± 0.038078 | 5.798542 ± 0.032366 | 6.234284 ± 0.037878 |
| Mean PPL ratio | 1.107955 ± 0.001427 | 1.564849 ± 0.004525 | 1.014242 ± 0.000432 | 1.028160 ± 0.000723 | 1.002406 ± 0.000191 | 1.003490 ± 0.000296 | 1.000689 ± 0.000107 | 1.000425 ± 0.000161 |
| Mean ΔPPL | 0.625552 ± 0.008725 | 3.519934 ± 0.033863 | 0.082526 ± 0.002530 | 0.175482 ± 0.004620 | 0.013941 ± 0.001110 | 0.021748 ± 0.001852 | 0.003990 ± 0.000624 | 0.002650 ± 0.001006 |
| PPL correlation | 97.36% | 89.62% | 99.71% | 99.34% | 99.94% | 99.88% | 99.98% | 99.96% |
| Mean KLD | 0.108903 ± 0.000645 | 0.445132 ± 0.001835 | 0.012686 ± 0.000079 | 0.031273 ± 0.000238 | 0.002098 ± 0.000014 | 0.005452 ± 0.000035 | 0.000369 ± 0.000007 | 0.001355 ± 0.000006 |
| Mean Δp | -2.710 ± 0.023 % | -9.123 ± 0.051 % | -0.416 ± 0.008 % | -0.596 ± 0.014 % | -0.035 ± 0.003 % | -0.007 ± 0.006 % | -0.005 ± 0.002 % | -0.019 ± 0.003 % |
| Maximum Δp | 85.136% | 94.268% | 45.209% | 95.054% | 23.593% | 53.601% | 43.925% | 28.734% |
| 99.9% Δp | 37.184% | 50.003% | 17.461% | 27.084% | 7.798% | 13.613% | 3.387% | 6.402% |
| 99.0% Δp | 18.131% | 25.875% | 7.798% | 12.084% | 3.838% | 6.407% | 1.867% | 3.544% |
| Median Δp | -0.391% | -2.476% | -0.026% | -0.024% | -0.001% | 0.000% | -0.000% | -0.000% |
| 1.0% Δp | -39.762% | -87.173% | -11.433% | -19.567% | -4.222% | -6.767% | -1.862% | -3.698% |
| 0.1% Δp | -79.002% | -98.897% | -26.433% | -56.054% | -9.091% | -16.584% | -3.252% | -6.579% |
| Minimum Δp | -99.915% | -99.965% | -83.383% | -98.699% | -43.142% | -68.487% | -9.343% | -24.301% |
| RMS Δp | 9.762 ± 0.053 % | 21.421 ± 0.079 % | 3.252 ± 0.024 % | 5.519 ± 0.050 % | 1.339 ± 0.010 % | 2.295 ± 0.019 % | 0.618 ± 0.011 % | 1.198 ± 0.007 % |
| Same top p | 85.584 ± 0.086 % | 71.138 ± 0.119 % | 94.665 ± 0.055 % | 91.901 ± 0.072 % | 97.520 ± 0.038 % | 96.031 ± 0.051 % | 98.846 ± 0.026 % | 97.674 ± 0.040 % |
## LLaMA 3 BF16 vs. FP16 comparison
| Revision | 83330d8c |
|:---------|:--------------|
| Backend | CPU |
| CPU | AMD Epyc 7742 |
| GPU | N/A |
Results were calculated with LLaMA 3 8b BF16 as `--kl-divergence-base` and LLaMA 3 8b FP16 as the `--model` for comparison.
| Metric | Value |
|--------------------------------|--------------------------|
| Mean PPL(Q) | 6.227711 ± 0.037833 |
| Mean PPL(base) | 6.225194 ± 0.037771 |
| Cor(ln(PPL(Q)), ln(PPL(base))) | 99.990% |
| Mean ln(PPL(Q)/PPL(base)) | 0.000404 ± 0.000086 |
| Mean PPL(Q)/PPL(base) | 1.000404 ± 0.000086 |
| Mean PPL(Q)-PPL(base) | 0.002517 ± 0.000536 |
| Mean KLD | 0.00002515 ± 0.00000020 |
| Maximum KLD | 0.012206 |
| 99.9% KLD | 0.000799 |
| 99.0% KLD | 0.000222 |
| 99.0% KLD | 0.000222 |
| Median KLD | 0.000013 |
| 10.0% KLD | -0.000002 |
| 5.0% KLD | -0.000008 |
| 1.0% KLD | -0.000023 |
| Minimum KLD | -0.000059 |
| Mean Δp | -0.0000745 ± 0.0003952 % |
| Maximum Δp | 4.186% |
| 99.9% Δp | 1.049% |
| 99.0% Δp | 0.439% |
| 95.0% Δp | 0.207% |
| 90.0% Δp | 0.125% |
| 75.0% Δp | 0.029% |
| Median Δp | 0.000% |
| 25.0% Δp | -0.030% |
| 10.0% Δp | -0.126% |
| 5.0% Δp | -0.207% |
| 1.0% Δp | -0.434% |
| 0.1% Δp | -1.016% |
| Minimum Δp | -4.672% |
| RMS Δp | 0.150 ± 0.001 % |
| Same top p | 99.739 ± 0.013 % |
## Old Numbers
<details>
<summary>Llama 2 70B Scoreboard</summary>
## Llama 2 70B Scorechart
| Quantization | Model size (GiB) | Perplexity | Delta to fp16 | | Quantization | Model size (GiB) | Perplexity | Delta to fp16 |
|--------------|------------------|------------|---------------| |--------------|------------------|------------|---------------|
| Q4_0 | 36.20 | 3.5550 | 3.61% | | Q4_0 | 36.20 | 3.5550 | 3.61% |
@ -18,3 +189,5 @@ TODO
| Q5_K_M | 45.41 | 3.4451 | 0.40% | | Q5_K_M | 45.41 | 3.4451 | 0.40% |
| Q6_K | 52.70 | 3.4367 | 0.16% | | Q6_K | 52.70 | 3.4367 | 0.16% |
| fp16 | 128.5 | 3.4313 | - | | fp16 | 128.5 | 3.4313 | - |
</details>

View file

@ -44,9 +44,9 @@ static void write_logfile(
return; return;
} }
const std::string timestamp = get_sortable_timestamp(); const std::string timestamp = string_get_sortable_timestamp();
const bool success = create_directory_with_parents(params.logdir); const bool success = fs_create_directory_with_parents(params.logdir);
if (!success) { if (!success) {
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
__func__, params.logdir.c_str()); __func__, params.logdir.c_str());
@ -64,7 +64,7 @@ static void write_logfile(
fprintf(logfile, "binary: main\n"); fprintf(logfile, "binary: main\n");
char model_desc[128]; char model_desc[128];
llama_model_desc(model, model_desc, sizeof(model_desc)); llama_model_desc(model, model_desc, sizeof(model_desc));
dump_non_result_info_yaml(logfile, params, ctx, timestamp, results.tokens, model_desc); yaml_dump_non_result_info(logfile, params, ctx, timestamp, results.tokens, model_desc);
fprintf(logfile, "\n"); fprintf(logfile, "\n");
fprintf(logfile, "######################\n"); fprintf(logfile, "######################\n");
@ -72,9 +72,9 @@ static void write_logfile(
fprintf(logfile, "######################\n"); fprintf(logfile, "######################\n");
fprintf(logfile, "\n"); fprintf(logfile, "\n");
dump_vector_float_yaml(logfile, "logits", results.logits); yaml_dump_vector_float(logfile, "logits", results.logits);
fprintf(logfile, "ppl_value: %f\n", results.ppl_value); fprintf(logfile, "ppl_value: %f\n", results.ppl_value);
dump_vector_float_yaml(logfile, "probs", results.probs); yaml_dump_vector_float(logfile, "probs", results.probs);
llama_dump_timing_info_yaml(logfile, ctx); llama_dump_timing_info_yaml(logfile, ctx);
fclose(logfile); fclose(logfile);
@ -216,17 +216,22 @@ static void process_logits(std::ostream& out, int n_vocab, const float * logits,
} }
struct kl_divergence_result { struct kl_divergence_result {
double sum_nll = 0; double sum_nll = 0.0;
double sum_nll2 = 0; double sum_nll2 = 0.0;
double sum_kld = 0; double sum_nll_base = 0.0;
double sum_kld2 = 0; double sum_nll_base2 = 0.0;
double sum_nll_diff = 0; double sum_nll_nll_base = 0.0;
double sum_nll_diff2 = 0; double sum_kld = 0.0;
size_t n_same_top = 0; double sum_kld2 = 0.0;
size_t count = 0; double sum_p_diff = 0.0;
double sum_p_diff2 = 0.0;
double sum_p_diff4 = 0.0;
float max_p_diff = 0.0f;
size_t n_same_top = 0.0;
size_t count = 0.0;
}; };
static double log_softmax(int n_vocab, const float * logits, const uint16_t * base_log_prob, int tok, kl_divergence_result & kld) { static std::pair<double, float> log_softmax(int n_vocab, const float * logits, const uint16_t * base_log_prob, int tok, kl_divergence_result & kld) {
float max_logit = logits[0]; float max_logit = logits[0];
int imax = 0; int imax = 0;
for (int i = 1; i < n_vocab; ++i) { for (int i = 1; i < n_vocab; ++i) {
@ -244,12 +249,17 @@ static double log_softmax(int n_vocab, const float * logits, const uint16_t * ba
const float scale = d[0]; const float scale = d[0];
const float min_log_prob = d[1]; const float min_log_prob = d[1];
base_log_prob += 4; base_log_prob += 4;
float nll = max_logit + log_sum_exp - logits[tok];
const float nll = max_logit + log_sum_exp - logits[tok];
kld.sum_nll += nll; kld.sum_nll += nll;
kld.sum_nll2 += nll*nll; kld.sum_nll2 += nll*nll;
nll += (scale*base_log_prob[tok] + min_log_prob);
kld.sum_nll_diff += nll; const float nll_base = -(scale*base_log_prob[tok] + min_log_prob);
kld.sum_nll_diff2 += nll*nll; kld.sum_nll_base += nll_base;
kld.sum_nll_base2 += nll_base*nll_base;
kld.sum_nll_nll_base += nll*nll_base;
max_logit += log_sum_exp; max_logit += log_sum_exp;
double sum = 0; double sum = 0;
int imax_base = -1; int imax_base = -1;
@ -269,16 +279,26 @@ static double log_softmax(int n_vocab, const float * logits, const uint16_t * ba
kld.sum_kld2 += sum*sum; kld.sum_kld2 += sum*sum;
++kld.count; ++kld.count;
if (imax == imax_base) ++kld.n_same_top; if (imax == imax_base) ++kld.n_same_top;
return sum;
const float p_base = expf(-nll_base);
const float p = expf(-nll);
const float p_diff = p - p_base;
kld.sum_p_diff += p_diff;
const double p_diff2 = p_diff*p_diff;
kld.sum_p_diff2 += p_diff2;
kld.sum_p_diff4 += p_diff2*p_diff2;
kld.max_p_diff = std::max(kld.max_p_diff, std::fabs(p_diff));
return std::make_pair(sum, p_diff);
} }
static void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token, static void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token,
std::vector<std::thread> & workers, const std::vector<uint16_t> & base_log_probs, kl_divergence_result & kld, std::vector<std::thread> & workers, const std::vector<uint16_t> & base_log_probs, kl_divergence_result & kld,
float * kld_values) { float * kld_values, float * p_diff_values) {
std::mutex mutex; std::mutex mutex;
const int nv = 2*((n_vocab + 1)/2) + 4; const int nv = 2*((n_vocab + 1)/2) + 4;
int counter = 0; int counter = 0;
auto compute = [&mutex, &counter, &base_log_probs, &kld, n_vocab, logits, tokens, n_token, nv, kld_values] () { auto compute = [&mutex, &counter, &base_log_probs, &kld, n_vocab, logits, tokens, n_token, nv, kld_values, p_diff_values] () {
kl_divergence_result local_kld; kl_divergence_result local_kld;
while (true) { while (true) {
std::unique_lock<std::mutex> lock(mutex); std::unique_lock<std::mutex> lock(mutex);
@ -286,17 +306,23 @@ static void process_logits(int n_vocab, const float * logits, const int * tokens
if (i >= n_token) { if (i >= n_token) {
kld.sum_nll += local_kld.sum_nll; kld.sum_nll += local_kld.sum_nll;
kld.sum_nll2 += local_kld.sum_nll2; kld.sum_nll2 += local_kld.sum_nll2;
kld.sum_nll_base += local_kld.sum_nll_base;
kld.sum_nll_base2 += local_kld.sum_nll_base2;
kld.sum_nll_nll_base += local_kld.sum_nll_nll_base;
kld.sum_kld += local_kld.sum_kld; kld.sum_kld += local_kld.sum_kld;
kld.sum_kld2 += local_kld.sum_kld2; kld.sum_kld2 += local_kld.sum_kld2;
kld.sum_nll_diff += local_kld.sum_nll_diff; kld.sum_p_diff += local_kld.sum_p_diff;
kld.sum_nll_diff2 += local_kld.sum_nll_diff2; kld.sum_p_diff2 += local_kld.sum_p_diff2;
kld.sum_p_diff4 += local_kld.sum_p_diff4;
kld.n_same_top += local_kld.n_same_top; kld.n_same_top += local_kld.n_same_top;
kld.max_p_diff = std::max(kld.max_p_diff, local_kld.max_p_diff);
kld.count += local_kld.count; kld.count += local_kld.count;
break; break;
} }
lock.unlock(); lock.unlock();
double v = log_softmax(n_vocab, logits + i*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld); std::pair<double, float> v = log_softmax(n_vocab, logits + i*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld);
kld_values[i] = (float)v; kld_values[i] = (float)v.first;
p_diff_values[i] = v.second;
} }
}; };
for (auto & w : workers) { for (auto & w : workers) {
@ -1399,7 +1425,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
// Use all tasks // Use all tasks
tasks.resize(n_task); tasks.resize(n_task);
printf("%s: reading tasks", __func__); printf("%s: reading tasks", __func__);
int n_dot = n_task/100; int n_dot = std::max((int) n_task/100, 1);
int i = 0; int i = 0;
for (auto& task : tasks) { for (auto& task : tasks) {
++i; ++i;
@ -1649,7 +1675,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
llama_batch_free(batch); llama_batch_free(batch);
if (n_done < 100) return; if (n_done < 100 && (params.multiple_choice_tasks != 0 && params.multiple_choice_tasks < (size_t)n_task)) return;
float p = 1.f*n_correct/n_done; float p = 1.f*n_correct/n_done;
float sigma = sqrt(p*(1-p)/(n_done-1)); float sigma = sqrt(p*(1-p)/(n_done-1));
@ -1712,6 +1738,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv); std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk); std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
std::vector<float> p_diff_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
std::vector<float> logits; std::vector<float> logits;
if (num_batches > 1) { if (num_batches > 1) {
logits.reserve(n_ctx * n_vocab); logits.reserve(n_ctx * n_vocab);
@ -1728,9 +1755,18 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
df = df > 0 && count > 10 ? sqrt(df/(count-1)) : 0.; df = df > 0 && count > 10 ? sqrt(df/(count-1)) : 0.;
return std::make_pair(f, df); return std::make_pair(f, df);
}; };
auto covariance = [] (double suma, double sumb, double sumab, size_t count) {
if (count < 10) {
return 0.0;
}
double var = sumab/count - (suma/count)*(sumb/count);
var /= count - 1;
return var;
};
kl_divergence_result kld; kl_divergence_result kld;
auto kld_ptr = kld_values.data(); auto kld_ptr = kld_values.data();
auto p_diff_ptr = p_diff_values.data();
for (int i = 0; i < n_chunk; ++i) { for (int i = 0; i < n_chunk; ++i) {
const int start = i * n_ctx; const int start = i * n_ctx;
@ -1785,24 +1821,42 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
} }
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
printf("\nchunk PPL ln(PPL(Q)/PPL(base)) KL-Divergence Same top\n"); printf("\nchunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n");
} }
const int first = n_ctx/2; const int first = n_ctx/2;
const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx); const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
workers, log_probs_uint16, kld, kld_ptr); workers, log_probs_uint16, kld, kld_ptr, p_diff_ptr);
p_diff_ptr += n_ctx - 1 - first;
kld_ptr += n_ctx - 1 - first; kld_ptr += n_ctx - 1 - first;
auto ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count); printf("%4d", i+1);
auto log_ppl_ratio = mean_and_uncertainty(kld.sum_nll_diff, kld.sum_nll_diff2, kld.count);
auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
auto p_top = 1.*kld.n_same_top/kld.count;
auto d_p_top = sqrt(p_top*(1 - p_top)/(kld.count - 1));
printf("%4d %10.4lf %10.5lf ± %10.5f %10.5f ± %10.5lf %.5f ± %.5f\n", i+1, exp(ppl.first), auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
log_ppl_ratio.first, log_ppl_ratio.second, kl_div.first, kl_div.second, const double ppl_val = exp(log_ppl.first);
p_top, d_p_top); const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
printf(" %9.4lf ± %9.4lf", ppl_val, ppl_unc);
auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
printf(" %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc);
auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
printf(" %10.5lf ± %10.5lf", kl_div.first, kl_div.second);
auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
const double p_diff_rms_val = sqrt(p_diff_mse.first);
const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
printf(" %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
double p_top_val = 1.*kld.n_same_top/kld.count;
double p_top_unc = sqrt(p_top_val*(1 - p_top_val)/(kld.count - 1));
printf(" %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc);
printf("\n");
fflush(stdout); fflush(stdout);
@ -1813,31 +1867,97 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
if (kld.count < 100) return; // we do not wish to do statistics on so few values if (kld.count < 100) return; // we do not wish to do statistics on so few values
std::sort(kld_values.begin(), kld_values.end()); std::sort(kld_values.begin(), kld_values.end());
std::sort(p_diff_values.begin(), p_diff_values.end());
printf("===== KL-divergence statistics\n"); printf("====== Perplexity statistics ======\n");
auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
const double ppl_val = exp(log_ppl.first);
const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
printf("Mean PPL(Q) : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc);
auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
const double ppl_base_val = exp(log_ppl_base.first);
const double ppl_base_unc = ppl_base_val * log_ppl_base.second; // ppl_base_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_base.second ** 2 )
printf("Mean PPL(base) : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc);
const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
// printf("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov);
const double log_ppl_cor = log_ppl_cov / (log_ppl.second*log_ppl_base.second);
printf("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor);
const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
printf("Mean ln(PPL(Q)/PPL(base)) : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc);
const double ppl_ratio_val = exp(log_ppl_ratio_val);
const double ppl_ratio_unc = ppl_ratio_val * log_ppl_ratio_unc; // ppl_ratio_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_ratio.second ** 2 )
printf("Mean PPL(Q)/PPL(base) : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc);
const double ppl_cov = ppl_val * ppl_base_val * log_ppl_cov;
const double ppl_diff_val = ppl_val - ppl_base_val;
const double ppl_diff_unc = sqrt(ppl_unc*ppl_unc + ppl_base_unc*ppl_base_unc - 2.0*ppl_cov);
printf("Mean PPL(Q)-PPL(base) : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc);
printf("\n");
printf("====== KL divergence statistics ======\n");
auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count); auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
printf("Average: %10.6f ±%10.6lf\n", kl_div.first, kl_div.second); printf("Mean KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second);
auto kld_median = kld_values.size()%2 == 0 ? 0.5f*(kld_values[kld_values.size()/2] + kld_values[kld_values.size()/2-1]) auto kld_median = kld_values.size()%2 == 0 ? 0.5f*(kld_values[kld_values.size()/2] + kld_values[kld_values.size()/2-1])
: kld_values[kld_values.size()/2]; : kld_values[kld_values.size()/2];
printf("Median : %10.6f\n", kld_median);
auto percentile = [&kld_values] (float fraction) { auto percentile = [] (std::vector<float> values, float fraction) {
if (fraction <= 0) return kld_values.front(); if (fraction <= 0) return values.front();
if (fraction >= 1) return kld_values.back(); if (fraction >= 1) return values.back();
float p = fraction*(kld_values.size() - 1); float p = fraction*(values.size() - 1);
size_t ip = size_t(p); p -= ip; size_t ip = size_t(p); p -= ip;
return (1 - p)*kld_values[ip] + p*kld_values[std::min(ip+1, kld_values.size()-1)]; return (1 - p)*values[ip] + p*values[std::min(ip+1, values.size()-1)];
}; };
printf("Maximum: %10.6f\n", kld_values.back()); printf("Maximum KLD: %10.6f\n", kld_values.back());
printf("KLD_99 : %10.6f\n", percentile(0.99f)); printf("99.9%% KLD: %10.6f\n", percentile(kld_values, 0.999f));
printf("KLD_95 : %10.6f\n", percentile(0.95f)); printf("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
printf("KLD_90 : %10.6f\n", percentile(0.90f)); printf("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
printf("Median KLD: %10.6f\n", kld_median);
printf("10.0%% KLD: %10.6f\n", percentile(kld_values, 0.100f));
printf(" 5.0%% KLD: %10.6f\n", percentile(kld_values, 0.050f));
printf(" 1.0%% KLD: %10.6f\n", percentile(kld_values, 0.010f));
printf("Minimum KLD: %10.6f\n", kld_values.front());
printf("Minimum: %10.6f\n", kld_values.front()); printf("\n");
printf("KLD_01 : %10.6f\n", percentile(0.01f));
printf("KLD_05 : %10.6f\n", percentile(0.05f)); printf("====== Token probability statistics ======\n");
printf("KLD_10 : %10.6f\n", percentile(0.10f));
auto p_diff = mean_and_uncertainty(kld.sum_p_diff, kld.sum_p_diff2, kld.count);
printf("Mean Δp: %6.3lf ± %5.3lf %%\n", 100.0*p_diff.first, 100.0*p_diff.second);
auto p_diff_median = p_diff_values.size()%2 == 0 ? 0.5f*(p_diff_values[p_diff_values.size()/2] + p_diff_values[p_diff_values.size()/2-1])
: p_diff_values[p_diff_values.size()/2];
printf("Maximum Δp: %6.3lf%%\n", 100.0*p_diff_values.back());
printf("99.9%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f));
printf("99.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f));
printf("95.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f));
printf("90.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f));
printf("75.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f));
printf("Median Δp: %6.3lf%%\n", 100.0*p_diff_median);
printf("25.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f));
printf("10.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f));
printf(" 5.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f));
printf(" 1.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f));
printf(" 0.1%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f));
printf("Minimum Δp: %6.3lf%%\n", 100.0*p_diff_values.front());
auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
// printf("MSE Δp : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second);
const double p_diff_rms_val = sqrt(p_diff_mse.first);
const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
printf("RMS Δp : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
const double same_top_p = 1.0*kld.n_same_top/kld.count;
printf("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1)));
} }
@ -1887,7 +2007,7 @@ int main(int argc, char ** argv) {
std::mt19937 rng(params.seed); std::mt19937 rng(params.seed);
if (params.random_prompt) { if (params.random_prompt) {
params.prompt = gpt_random_prompt(rng); params.prompt = string_random_prompt(rng);
} }
llama_backend_init(); llama_backend_init();
@ -1915,7 +2035,7 @@ int main(int argc, char ** argv) {
// print system information // print system information
{ {
fprintf(stderr, "\n"); fprintf(stderr, "\n");
fprintf(stderr, "%s\n", get_system_info(params).c_str()); fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
} }
struct results_perplexity results; struct results_perplexity results;

View file

@ -23,7 +23,7 @@
#endif #endif
struct quantize_stats_params { struct quantize_stats_params {
std::string model = "models/7B/ggml-model-f16.gguf"; std::string model = DEFAULT_MODEL_PATH;
bool verbose = false; bool verbose = false;
bool per_layer_stats = false; bool per_layer_stats = false;
bool print_histogram = false; bool print_histogram = false;

View file

@ -1,6 +1,8 @@
# quantize # quantize
TODO You can also use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to build your own quants without any setup.
Note: It is synced from llama.cpp `main` every 6 hours.
## Llama 2 7B ## Llama 2 7B

View file

@ -46,7 +46,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", }, { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", }, { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", },
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", }, { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", }, { "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, -0.0020 ppl @ Mistral-7B", },
{ "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", }, { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
// Note: Ensure COPY comes after F32 to avoid ftype 0 from matching. // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
{ "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", }, { "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },
@ -258,7 +259,7 @@ int main(int argc, char ** argv) {
usage(argv[0]); usage(argv[0]);
} }
} else if (strcmp(argv[arg_idx], "--override-kv") == 0) { } else if (strcmp(argv[arg_idx], "--override-kv") == 0) {
if (arg_idx == argc-1 || !parse_kv_override(argv[++arg_idx], kv_overrides)) { if (arg_idx == argc-1 || !string_parse_kv_override(argv[++arg_idx], kv_overrides)) {
usage(argv[0]); usage(argv[0]);
} }
} else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) { } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
@ -283,7 +284,7 @@ int main(int argc, char ** argv) {
} else { } else {
usage(argv[0]); usage(argv[0]);
} }
} else if (strcmp(argv[arg_idx], "--keep-split")) { } else if (strcmp(argv[arg_idx], "--keep-split") == 0) {
params.keep_split = true; params.keep_split = true;
} else { } else {
usage(argv[0]); usage(argv[0]);

View file

@ -41,8 +41,8 @@ $SPLIT --split-max-tensors 28 $WORK_PATH/gemma-1.1-2b-it.Q8_0.gguf $WORK_PATH/g
echo PASS echo PASS
echo echo
# 3. Requant model with '--keep_split' # 3. Requant model with '--keep-split'
$QUANTIZE --allow-requantize --keep_split $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant.gguf Q4_K $QUANTIZE --allow-requantize --keep-split $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant.gguf Q4_K
echo PASS echo PASS
echo echo
@ -51,7 +51,7 @@ $MAIN --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --random-prompt
echo PASS echo PASS
echo echo
# 4. Requant mode without '--keep_split' # 4. Requant mode without '--keep-split'
$QUANTIZE --allow-requantize $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant-merge.gguf Q4_K $QUANTIZE --allow-requantize $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant-merge.gguf Q4_K
echo PASS echo PASS
echo echo

View file

@ -11,7 +11,7 @@ struct retrieval_params {
}; };
static void retrieval_params_print_usage(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & params) { static void retrieval_params_print_usage(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & params) {
gpt_print_usage(argc, argv, gpt_params); gpt_params_print_usage(argc, argv, gpt_params);
printf("retrieval options:\n"); printf("retrieval options:\n");
printf(" --context-file FNAME file containing context to embed.\n"); printf(" --context-file FNAME file containing context to embed.\n");
printf(" specify multiple files by providing --context-file option multiple times.\n"); printf(" specify multiple files by providing --context-file option multiple times.\n");
@ -226,7 +226,7 @@ int main(int argc, char ** argv) {
// print system information // print system information
{ {
fprintf(stderr, "\n"); fprintf(stderr, "\n");
fprintf(stderr, "%s\n", get_system_info(params).c_str()); fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
} }
// max batch size // max batch size

View file

@ -0,0 +1,2 @@
add_executable(rpc-server rpc-server.cpp)
target_link_libraries(rpc-server PRIVATE ggml llama)

74
examples/rpc/README.md Normal file
View file

@ -0,0 +1,74 @@
## Overview
The `rpc-server` allows running `ggml` backend on a remote host.
The RPC backend communicates with one or several instances of `rpc-server` and offloads computations to them.
This can be used for distributed LLM inference with `llama.cpp` in the following way:
```mermaid
flowchart TD
rpcb---|TCP|srva
rpcb---|TCP|srvb
rpcb-.-|TCP|srvn
subgraph hostn[Host N]
srvn[rpc-server]-.-backend3["Backend (CUDA,Metal,etc.)"]
end
subgraph hostb[Host B]
srvb[rpc-server]---backend2["Backend (CUDA,Metal,etc.)"]
end
subgraph hosta[Host A]
srva[rpc-server]---backend["Backend (CUDA,Metal,etc.)"]
end
subgraph host[Main Host]
ggml[llama.cpp]---rpcb[RPC backend]
end
style hostn stroke:#66,stroke-width:2px,stroke-dasharray: 5 5
```
Each host can run a different backend, e.g. one with CUDA and another with Metal.
You can also run multiple `rpc-server` instances on the same host, each with a different backend.
## Usage
On each host, build the corresponding backend with `cmake` and add `-DLLAMA_RPC=ON` to the build options.
For example, to build the CUDA backend with RPC support:
```bash
mkdir build-rpc-cuda
cd build-rpc-cuda
cmake .. -DLLAMA_CUDA=ON -DLLAMA_RPC=ON
cmake --build . --config Release
```
Then, start the `rpc-server` with the backend:
```bash
$ bin/rpc-server -p 50052
create_backend: using CUDA backend
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
ggml_cuda_init: CUDA_USE_TENSOR_CORES: yes
ggml_cuda_init: found 1 CUDA devices:
Device 0: NVIDIA T1200 Laptop GPU, compute capability 7.5, VMM: yes
Starting RPC server on 0.0.0.0:50052
```
When using the CUDA backend, you can specify the device with the `CUDA_VISIBLE_DEVICES` environment variable, e.g.:
```bash
$ CUDA_VISIBLE_DEVICES=0 bin/rpc-server -p 50052
```
This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device.
On the main host build `llama.cpp` only with `-DLLAMA_RPC=ON`:
```bash
mkdir build-rpc
cd build-rpc
cmake .. -DLLAMA_RPC=ON
cmake --build . --config Release
```
Finally, use the `--rpc` option to specify the host and port of each `rpc-server`:
```bash
$ bin/main -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99
```

134
examples/rpc/rpc-server.cpp Normal file
View file

@ -0,0 +1,134 @@
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif
#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif
#include "ggml-rpc.h"
#ifdef _WIN32
# include <windows.h>
#else
# include <unistd.h>
#endif
#include <string>
#include <stdio.h>
struct rpc_server_params {
std::string host = "0.0.0.0";
int port = 50052;
size_t backend_mem = 0;
};
static void print_usage(int /*argc*/, char ** argv, rpc_server_params params) {
fprintf(stderr, "Usage: %s [options]\n\n", argv[0]);
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " -H HOST, --host HOST host to bind to (default: %s)\n", params.host.c_str());
fprintf(stderr, " -p PORT, --port PORT port to bind to (default: %d)\n", params.port);
fprintf(stderr, " -m MEM, --mem MEM backend memory size (in MB)\n");
fprintf(stderr, "\n");
}
static bool rpc_server_params_parse(int argc, char ** argv, rpc_server_params & params) {
std::string arg;
for (int i = 1; i < argc; i++) {
arg = argv[i];
if (arg == "-H" || arg == "--host") {
if (++i >= argc) {
return false;
}
params.host = argv[i];
} else if (arg == "-p" || arg == "--port") {
if (++i >= argc) {
return false;
}
params.port = std::stoi(argv[i]);
if (params.port <= 0 || params.port > 65535) {
return false;
}
} else if (arg == "-m" || arg == "--mem") {
if (++i >= argc) {
return false;
}
params.backend_mem = std::stoul(argv[i]) * 1024 * 1024;
} else if (arg == "-h" || arg == "--help") {
print_usage(argc, argv, params);
exit(0);
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
print_usage(argc, argv, params);
exit(0);
}
}
return true;
}
static ggml_backend_t create_backend() {
ggml_backend_t backend = NULL;
#ifdef GGML_USE_CUDA
fprintf(stderr, "%s: using CUDA backend\n", __func__);
backend = ggml_backend_cuda_init(0); // init device 0
if (!backend) {
fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
}
#elif GGML_USE_METAL
fprintf(stderr, "%s: using Metal backend\n", __func__);
backend = ggml_backend_metal_init();
if (!backend) {
fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
}
#endif
// if there aren't GPU Backends fallback to CPU backend
if (!backend) {
fprintf(stderr, "%s: using CPU backend\n", __func__);
backend = ggml_backend_cpu_init();
}
return backend;
}
static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
#ifdef GGML_USE_CUDA
ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
#else
#ifdef _WIN32
MEMORYSTATUSEX status;
status.dwLength = sizeof(status);
GlobalMemoryStatusEx(&status);
*total_mem = status.ullTotalPhys;
*free_mem = status.ullAvailPhys;
#else
long pages = sysconf(_SC_PHYS_PAGES);
long page_size = sysconf(_SC_PAGE_SIZE);
*total_mem = pages * page_size;
*free_mem = *total_mem;
#endif
#endif
}
int main(int argc, char * argv[]) {
rpc_server_params params;
if (!rpc_server_params_parse(argc, argv, params)) {
fprintf(stderr, "Invalid parameters\n");
return 1;
}
ggml_backend_t backend = create_backend();
if (!backend) {
fprintf(stderr, "Failed to create backend\n");
return 1;
}
std::string endpoint = params.host + ":" + std::to_string(params.port);
size_t free_mem, total_mem;
if (params.backend_mem > 0) {
free_mem = params.backend_mem;
total_mem = params.backend_mem;
} else {
get_backend_memory(&free_mem, &total_mem);
}
printf("Starting RPC server on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024));
start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem);
ggml_backend_free(backend);
return 0;
}

View file

@ -17,8 +17,9 @@ The project is under active development, and we are [looking for feedback and co
**Command line options:** **Command line options:**
- `--threads N`, `-t N`: Set the number of threads to use during generation. Not used if model layers are offloaded to GPU. The server is using batching. This parameter is used only if one token is to be processed on CPU backend. - `-v`, `--verbose`: Enable verbose server output. When using the `/completion` endpoint, this includes the tokenized prompt, the full request and the full response.
- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation. Not used if model layers are offloaded to GPU. - `-t N`, `--threads N`: Set the number of threads to use by CPU layers during generation. Not used by model layers that are offloaded to GPU. This option has no effect when using the maximum number of GPU layers. Default: `std::thread::hardware_concurrency()` (number of CPU cores).
- `-tb N, --threads-batch N`: Set the number of threads to use by CPU layers during batch and prompt processing (>= 32 tokens). This option has no effect if a GPU is available. Default: `--threads`.
- `--threads-http N`: Number of threads in the http server pool to process requests. Default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)` - `--threads-http N`: Number of threads in the http server pool to process requests. Default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`). - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file. Default: unused - `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file. Default: unused
@ -36,9 +37,7 @@ The project is under active development, and we are [looking for feedback and co
- `--numa STRATEGY`: Attempt one of the below optimization strategies that may help on some NUMA systems - `--numa STRATEGY`: Attempt one of the below optimization strategies that may help on some NUMA systems
- `--numa distribute`: Spread execution evenly over all nodes - `--numa distribute`: Spread execution evenly over all nodes
- `--numa isolate`: Only spawn threads on CPUs on the node that execution started on - `--numa isolate`: Only spawn threads on CPUs on the node that execution started on
- `--numa numactl`: Use the CPU map provided by numactl. If run without this previously, it is recommended to drop the system - `--numa numactl`: Use the CPU map provided by numactl. If run without this previously, it is recommended to drop the system page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/1437
page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/1437
- `--numa`: Attempt optimizations that may help on some NUMA systems. - `--numa`: Attempt optimizations that may help on some NUMA systems.
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains. - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation. - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
@ -48,8 +47,8 @@ page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/
- `--path`: Path from which to serve static files. Default: disabled - `--path`: Path from which to serve static files. Default: disabled
- `--api-key`: Set an api key for request authorization. By default, the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys. - `--api-key`: Set an api key for request authorization. By default, the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys.
- `--api-key-file`: Path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`s. - `--api-key-file`: Path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`s.
- `--embedding`: Enable embedding extraction. Default: disabled - `--embeddings`: Enable embedding vector output and the OAI compatible endpoint /v1/embeddings. Physical batch size (`--ubatch-size`) must be carefully defined. Default: disabled
- `-np N`, `--parallel N`: Set the number of slots for process requests. Default: `1` - `-np N`, `--parallel N`: Set the number of slots for process requests. Default: `1`. Values > 1 will allow for higher throughput with multiple parallel requests but the results will **not** be deterministic due to differences in rounding error.
- `-cb`, `--cont-batching`: Enable continuous batching (a.k.a dynamic batching). Default: disabled - `-cb`, `--cont-batching`: Enable continuous batching (a.k.a dynamic batching). Default: disabled
- `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load a system prompt (initial prompt of all slots). This is useful for chat applications. [See more](#change-system-prompt-on-runtime) - `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load a system prompt (initial prompt of all slots). This is useful for chat applications. [See more](#change-system-prompt-on-runtime)
- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA. - `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
@ -62,6 +61,18 @@ page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/
- `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name. Default: template taken from model's metadata. We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) - `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name. Default: template taken from model's metadata. We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
- `--log-disable`: Output logs to stdout only, not to `llama.log`. Default: enabled - `--log-disable`: Output logs to stdout only, not to `llama.log`. Default: enabled
- `--log-format FORMAT`: Define the log output to FORMAT: json or text Default: `json` - `--log-format FORMAT`: Define the log output to FORMAT: json or text Default: `json`
- `--rope-scaling` : RoPE scaling method. Defaults to linear unless otherwise specified by the model. Options are `none`, `linear`, `yarn`
- `--rope-freq-base N` : RoPE frequency base (default: loaded from model)
- `--rope-freq-scale N`: RoPE frequency scaling factor, expands context by a factor of 1/N (e.g. 0.25)
- `--yarn-ext-factor N` : YaRN: extrapolation mix factor (Default: 1.0, 0.0 = full interpolation)
- `--yarn-attn-factor N` : YaRN: scale sqrt(t) or attention magnitude (default: 1.0)
- `--yarn-beta-slow N`: YaRN: High correction dim or alpha (default: 1.0)
- `--yarn-beta-fast N`: YaRN: low correction dim or beta (default: 32.0)
- `--pooling` : Pooling type for embeddings, use model default if unspecified. Options are `none`, `mean`, `cls`
- `-dt N`, `--defrag-thold N`: KV cache defragmentation threshold (default: -1.0, < 0 = disabled)
- `-fa`, `--flash-attn` : enable flash attention (default: disabled).
- `-ctk TYPE`, `--cache-type-k TYPE` : KV cache data type for K (default: `f16`, options `f32`, `f16`, `q8_0`, `q4_0`, `q4_1`, `iq4_nl`, `q5_0`, or `q5_1`)
- `-ctv TYPE`, `--cache-type-v TYPE` : KV cache type for V (default `f16`, see `-ctk` for options)
**If compiled with `LLAMA_SERVER_SSL=ON`** **If compiled with `LLAMA_SERVER_SSL=ON`**
- `--ssl-key-file FNAME`: path to file a PEM-encoded SSL private key - `--ssl-key-file FNAME`: path to file a PEM-encoded SSL private key
@ -74,15 +85,18 @@ page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/
- Using `make`: - Using `make`:
```bash ```bash
make make server
``` ```
- Using `CMake`: - Using `CMake`:
```bash ```bash
cmake --build . --config Release cmake -B build
cmake --build build --config Release -t server
``` ```
Binary is at `./build/bin/server`
## Build with SSL ## Build with SSL
`server` can also be built with SSL support using OpenSSL 3 `server` can also be built with SSL support using OpenSSL 3
@ -99,10 +113,8 @@ page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/
- Using `CMake`: - Using `CMake`:
```bash ```bash
mkdir build cmake -B build -DLLAMA_SERVER_SSL=ON
cd build cmake --build build --config Release -t server
cmake .. -DLLAMA_SERVER_SSL=ON
make server
``` ```
## Quick Start ## Quick Start
@ -259,7 +271,7 @@ node index.js
`logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings, e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does. Default: `[]` `logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings, e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does. Default: `[]`
`n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token. Default: `0` `n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token given the sampling settings. Note that for temperature < 0 the tokens are sampled greedily but token probabilities are still being calculated via a simple softmax of the logits without considering any other sampler settings. Default: `0`
`min_keep`: If greater than 0, force samplers to return N possible tokens at minimum. Default: `0` `min_keep`: If greater than 0, force samplers to return N possible tokens at minimum. Default: `0`
@ -318,7 +330,7 @@ Notice that each `probs` is an array of length `n_probs`.
`content`: Set the text to tokenize. `content`: Set the text to tokenize.
Note that a special `BOS` token is never inserted. `add_special`: Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false`
- **POST** `/detokenize`: Convert tokens to text. - **POST** `/detokenize`: Convert tokens to text.

View file

@ -268,6 +268,7 @@ def start_server_background(args):
server_args.extend(['--defrag-thold', "0.1"]) server_args.extend(['--defrag-thold', "0.1"])
server_args.append('--cont-batching') server_args.append('--cont-batching')
server_args.append('--metrics') server_args.append('--metrics')
server_args.append('--flash-attn')
server_args.extend(['--log-format', "text"]) server_args.extend(['--log-format', "text"])
args = [str(arg) for arg in [server_path, *server_args]] args = [str(arg) for arg in [server_path, *server_args]]
print(f"bench: starting server with: {' '.join(args)}") print(f"bench: starting server with: {' '.join(args)}")

Binary file not shown.

After

Width:  |  Height:  |  Size: 4 KiB

View file

@ -0,0 +1,52 @@
<!DOCTYPE html>
<html lang="en">
<head>
<title>SimpleChat (LlamaCPP, ...) </title>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="message" content="Save Nature Save Earth" />
<meta name="description" content="SimpleChat: trigger LLM web service endpoints /chat/completions and /completions, single/multi chat sessions" />
<meta name="author" content="by Humans for All" />
<meta http-equiv="Cache-Control" content="no-cache, no-store, must-revalidate" />
<script src="simplechat.js" defer></script>
<link rel="stylesheet" href="simplechat.css" />
</head>
<body>
<div class="samecolumn" id="fullbody">
<div class="sameline">
<p class="heading flex-grow" > <b> SimpleChat </b> </p>
<div class="sameline">
<label for="api-ep">Mode:</label>
<select name="api-ep" id="api-ep">
<option value="chat" selected>Chat</option>
<option value="completion">Completion</option>
</select>
</div>
</div>
<div id="sessions-div" class="sameline"></div>
<hr>
<div class="sameline">
<label for="system-in">System</label>
<input type="text" name="system" id="system-in" class="flex-grow"/>
</div>
<hr>
<div id="chat-div">
<p> Enter the system prompt above, before entering/submitting any user query.</p>
<p> Enter your text to the ai assistant below.</p>
<p> Use shift+enter for inserting enter.</p>
<p> Refresh the page to start over fresh.</p>
</div>
<hr>
<div class="sameline">
<textarea id="user-in" class="flex-grow" rows="3"></textarea>
<button id="user-btn">submit</button>
</div>
</div>
</body>
</html>

View file

@ -0,0 +1,81 @@
# SimpleChat
by Humans for All.
## overview
This simple web frontend, allows triggering/testing the server's /completions or /chat/completions endpoints
in a simple way with minimal code from a common code base. Inturn additionally it tries to allow single or
multiple independent back and forth chatting to an extent, with the ai llm model at a basic level, with their
own system prompts.
The UI follows a responsive web design so that the layout can adapt to available display space in a usable
enough manner, in general.
NOTE: Given that the idea is for basic minimal testing, it doesnt bother with any model context length and
culling of old messages from the chat.
NOTE: It doesnt set any parameters other than temperature for now. However if someone wants they can update
the js file as needed.
## usage
One could run this web frontend directly using server itself or if anyone is thinking of adding a built in web
frontend to configure the server over http(s) or so, then run this web frontend using something like python's
http module.
### running using examples/server
bin/server -m path/model.gguf --path ../examples/server/public_simplechat [--port PORT]
### running using python3's server module
first run examples/server
* bin/server -m path/model.gguf
next run this web front end in examples/server/public_simplechat
* cd ../examples/server/public_simplechat
* python3 -m http.server PORT
### using the front end
Open this simple web front end from your local browser
* http://127.0.0.1:PORT/index.html
Once inside
* Select between chat and completion mode. By default it is set to chat mode.
* If you want to provide a system prompt, then ideally enter it first, before entering any user query.
* if chat.add_system_begin is used
* you cant change the system prompt, after it is has been submitted once along with user query.
* you cant set a system prompt, after you have submitted any user query
* if chat.add_system_anytime is used
* one can change the system prompt any time during chat, by changing the contents of system prompt.
* inturn the updated/changed system prompt will be inserted into the chat session.
* this allows for the subsequent user chatting to be driven by the new system prompt set above.
* Enter your query and either press enter or click on the submit button.
If you want to insert enter (\n) as part of your chat/query to ai model, use shift+enter.
* Wait for the logic to communicate with the server and get the response.
* the user is not allowed to enter any fresh query during this time.
* the user input box will be disabled and a working message will be shown in it.
* just refresh the page, to reset wrt the chat history and or system prompt and start afresh.
* Using NewChat one can start independent chat sessions.
* two independent chat sessions are setup by default.
## Devel note
Sometimes the browser may be stuborn with caching of the file, so your updates to html/css/js
may not be visible. Also remember that just refreshing/reloading page in browser or for that
matter clearing site data, dont directly override site caching in all cases. Worst case you may
have to change port. Or in dev tools of browser, you may be able to disable caching fully.
Concept of multiple chat sessions with different servers, as well as saving and restoring of
those across browser usage sessions, can be woven around the SimpleChat/MultiChatUI class and
its instances relatively easily, however given the current goal of keeping this simple, it has
not been added, for now.
By switching between chat.add_system_begin/anytime, one can control whether one can change
the system prompt, anytime during the conversation or only at the beginning.

View file

@ -0,0 +1,61 @@
/**
* the styling of the simplechat web frontend
* by Humans for All
*/
#fullbody {
height: 98vh;
}
.heading {
background-color: lightgray;
}
.session-selected {
background-color: lightblue;
}
.role-system {
background-color: lightblue;
}
.role-user {
background-color: lightgray;
}
.flex-grow {
flex-grow: 1;
}
.float-right {
float: right;
}
#chat-div {
overflow: scroll;
flex-grow: 1;
flex-shrink: 1;
min-height: 40vh;
}
button {
min-width: 8vw;
}
.sameline {
display: flex;
flex-direction: row;
}
.samecolumn {
display: flex;
flex-direction: column;
}
* {
margin: 0.6vmin;
}
@media print {
#fullbody {
height: auto;
}
}

View file

@ -0,0 +1,478 @@
// @ts-check
// A simple completions and chat/completions test related web front end logic
// by Humans for All
class Roles {
static System = "system";
static User = "user";
static Assistant = "assistant";
}
class ApiEP {
static Chat = "chat";
static Completion = "completion";
}
let gUsageMsg = `
<p> Enter the system prompt above, before entering/submitting any user query.</p>
<p> Enter your text to the ai assistant below.</p>
<p> Use shift+enter for inserting enter.</p>
<p> Refresh the page to start over fresh.</p>
`;
class SimpleChat {
constructor() {
/**
* Maintain in a form suitable for common LLM web service chat/completions' messages entry
* @type {{role: string, content: string}[]}
*/
this.xchat = [];
this.iLastSys = -1;
}
/**
* Add an entry into xchat
* @param {string} role
* @param {string|undefined|null} content
*/
add(role, content) {
if ((content == undefined) || (content == null) || (content == "")) {
return false;
}
this.xchat.push( {role: role, content: content} );
if (role == Roles.System) {
this.iLastSys = this.xchat.length - 1;
}
return true;
}
/**
* Show the contents in the specified div
* @param {HTMLDivElement} div
* @param {boolean} bClear
*/
show(div, bClear=true) {
if (bClear) {
div.replaceChildren();
}
let last = undefined;
for(const x of this.xchat) {
let entry = document.createElement("p");
entry.className = `role-${x.role}`;
entry.innerText = `${x.role}: ${x.content}`;
div.appendChild(entry);
last = entry;
}
if (last !== undefined) {
last.scrollIntoView(false);
} else {
if (bClear) {
div.innerHTML = gUsageMsg;
}
}
}
/**
* Add needed fields wrt json object to be sent wrt LLM web services completions endpoint
* Convert the json into string.
* @param {Object} obj
*/
request_jsonstr(obj) {
obj["temperature"] = 0.7;
return JSON.stringify(obj);
}
/**
* Return a string form of json object suitable for chat/completions
*/
request_messages_jsonstr() {
let req = {
messages: this.xchat,
}
return this.request_jsonstr(req);
}
/**
* Return a string form of json object suitable for /completions
*/
request_prompt_jsonstr() {
let prompt = "";
for(const chat of this.xchat) {
prompt += `${chat.role}: ${chat.content}\n`;
}
let req = {
prompt: prompt,
}
return this.request_jsonstr(req);
}
/**
* Allow setting of system prompt, but only at begining.
* @param {string} sysPrompt
* @param {string} msgTag
*/
add_system_begin(sysPrompt, msgTag) {
if (this.xchat.length == 0) {
if (sysPrompt.length > 0) {
return this.add(Roles.System, sysPrompt);
}
} else {
if (sysPrompt.length > 0) {
if (this.xchat[0].role !== Roles.System) {
console.error(`ERRR:SimpleChat:SC:${msgTag}:You need to specify system prompt before any user query, ignoring...`);
} else {
if (this.xchat[0].content !== sysPrompt) {
console.error(`ERRR:SimpleChat:SC:${msgTag}:You cant change system prompt, mid way through, ignoring...`);
}
}
}
}
return false;
}
/**
* Allow setting of system prompt, at any time.
* @param {string} sysPrompt
* @param {string} msgTag
*/
add_system_anytime(sysPrompt, msgTag) {
if (sysPrompt.length <= 0) {
return false;
}
if (this.iLastSys < 0) {
return this.add(Roles.System, sysPrompt);
}
let lastSys = this.xchat[this.iLastSys].content;
if (lastSys !== sysPrompt) {
return this.add(Roles.System, sysPrompt);
}
return false;
}
/**
* Retrieve the latest system prompt.
*/
get_system_latest() {
if (this.iLastSys == -1) {
return "";
}
let sysPrompt = this.xchat[this.iLastSys].content;
return sysPrompt;
}
}
let gBaseURL = "http://127.0.0.1:8080";
let gChatURL = {
'chat': `${gBaseURL}/chat/completions`,
'completion': `${gBaseURL}/completions`,
}
const gbCompletionFreshChatAlways = true;
/**
* Set the class of the children, based on whether it is the idSelected or not.
* @param {HTMLDivElement} elBase
* @param {string} idSelected
* @param {string} classSelected
* @param {string} classUnSelected
*/
function el_children_config_class(elBase, idSelected, classSelected, classUnSelected="") {
for(let child of elBase.children) {
if (child.id == idSelected) {
child.className = classSelected;
} else {
child.className = classUnSelected;
}
}
}
/**
* Create button and set it up.
* @param {string} id
* @param {(this: HTMLButtonElement, ev: MouseEvent) => any} callback
* @param {string | undefined} name
* @param {string | undefined} innerText
*/
function el_create_button(id, callback, name=undefined, innerText=undefined) {
if (!name) {
name = id;
}
if (!innerText) {
innerText = id;
}
let btn = document.createElement("button");
btn.id = id;
btn.name = name;
btn.innerText = innerText;
btn.addEventListener("click", callback);
return btn;
}
class MultiChatUI {
constructor() {
/** @type {Object<string, SimpleChat>} */
this.simpleChats = {};
/** @type {string} */
this.curChatId = "";
// the ui elements
this.elInSystem = /** @type{HTMLInputElement} */(document.getElementById("system-in"));
this.elDivChat = /** @type{HTMLDivElement} */(document.getElementById("chat-div"));
this.elBtnUser = /** @type{HTMLButtonElement} */(document.getElementById("user-btn"));
this.elInUser = /** @type{HTMLInputElement} */(document.getElementById("user-in"));
this.elSelectApiEP = /** @type{HTMLSelectElement} */(document.getElementById("api-ep"));
this.elDivSessions = /** @type{HTMLDivElement} */(document.getElementById("sessions-div"));
this.validate_element(this.elInSystem, "system-in");
this.validate_element(this.elDivChat, "chat-div");
this.validate_element(this.elInUser, "user-in");
this.validate_element(this.elSelectApiEP, "api-ep");
this.validate_element(this.elDivChat, "sessions-div");
}
/**
* Check if the element got
* @param {HTMLElement | null} el
* @param {string} msgTag
*/
validate_element(el, msgTag) {
if (el == null) {
throw Error(`ERRR:SimpleChat:MCUI:${msgTag} element missing in html...`);
} else {
console.debug(`INFO:SimpleChat:MCUI:${msgTag} Id[${el.id}] Name[${el["name"]}]`);
}
}
/**
* Reset user input ui.
* * clear user input
* * enable user input
* * set focus to user input
*/
ui_reset_userinput() {
this.elInUser.value = "";
this.elInUser.disabled = false;
this.elInUser.focus();
}
/**
* Setup the needed callbacks wrt UI, curChatId to defaultChatId and
* optionally switch to specified defaultChatId.
* @param {string} defaultChatId
* @param {boolean} bSwitchSession
*/
setup_ui(defaultChatId, bSwitchSession=false) {
this.curChatId = defaultChatId;
if (bSwitchSession) {
this.handle_session_switch(this.curChatId);
}
this.elBtnUser.addEventListener("click", (ev)=>{
if (this.elInUser.disabled) {
return;
}
this.handle_user_submit(this.curChatId, this.elSelectApiEP.value).catch((/** @type{Error} */reason)=>{
let msg = `ERRR:SimpleChat\nMCUI:HandleUserSubmit:${this.curChatId}\n${reason.name}:${reason.message}`;
console.debug(msg.replace("\n", ":"));
alert(msg);
this.ui_reset_userinput();
});
});
this.elInUser.addEventListener("keyup", (ev)=> {
// allow user to insert enter into their message using shift+enter.
// while just pressing enter key will lead to submitting.
if ((ev.key === "Enter") && (!ev.shiftKey)) {
this.elBtnUser.click();
ev.preventDefault();
}
});
this.elInSystem.addEventListener("keyup", (ev)=> {
// allow user to insert enter into the system prompt using shift+enter.
// while just pressing enter key will lead to setting the system prompt.
if ((ev.key === "Enter") && (!ev.shiftKey)) {
let chat = this.simpleChats[this.curChatId];
chat.add_system_anytime(this.elInSystem.value, this.curChatId);
chat.show(this.elDivChat);
ev.preventDefault();
}
});
}
/**
* Setup a new chat session and optionally switch to it.
* @param {string} chatId
* @param {boolean} bSwitchSession
*/
new_chat_session(chatId, bSwitchSession=false) {
this.simpleChats[chatId] = new SimpleChat();
if (bSwitchSession) {
this.handle_session_switch(chatId);
}
}
/**
* Handle user query submit request, wrt specified chat session.
* @param {string} chatId
* @param {string} apiEP
*/
async handle_user_submit(chatId, apiEP) {
let chat = this.simpleChats[chatId];
chat.add_system_anytime(this.elInSystem.value, chatId);
let content = this.elInUser.value;
if (!chat.add(Roles.User, content)) {
console.debug(`WARN:SimpleChat:MCUI:${chatId}:HandleUserSubmit:Ignoring empty user input...`);
return;
}
chat.show(this.elDivChat);
let theBody;
let theUrl = gChatURL[apiEP]
if (apiEP == ApiEP.Chat) {
theBody = chat.request_messages_jsonstr();
} else {
theBody = chat.request_prompt_jsonstr();
}
this.elInUser.value = "working...";
this.elInUser.disabled = true;
console.debug(`DBUG:SimpleChat:MCUI:${chatId}:HandleUserSubmit:${theUrl}:ReqBody:${theBody}`);
let resp = await fetch(theUrl, {
method: "POST",
headers: {
"Content-Type": "application/json",
},
body: theBody,
});
let respBody = await resp.json();
console.debug(`DBUG:SimpleChat:MCUI:${chatId}:HandleUserSubmit:RespBody:${JSON.stringify(respBody)}`);
let assistantMsg;
if (apiEP == ApiEP.Chat) {
assistantMsg = respBody["choices"][0]["message"]["content"];
} else {
try {
assistantMsg = respBody["choices"][0]["text"];
} catch {
assistantMsg = respBody["content"];
}
}
chat.add(Roles.Assistant, assistantMsg);
if (chatId == this.curChatId) {
chat.show(this.elDivChat);
} else {
console.debug(`DBUG:SimpleChat:MCUI:HandleUserSubmit:ChatId has changed:[${chatId}] [${this.curChatId}]`);
}
// Purposefully clear at end rather than begin of this function
// so that one can switch from chat to completion mode and sequece
// in a completion mode with multiple user-assistant chat data
// from before to be sent/occur once.
if ((apiEP == ApiEP.Completion) && (gbCompletionFreshChatAlways)) {
chat.xchat.length = 0;
}
this.ui_reset_userinput();
}
/**
* Show buttons for NewChat and available chat sessions, in the passed elDiv.
* If elDiv is undefined/null, then use this.elDivSessions.
* Take care of highlighting the selected chat-session's btn.
* @param {HTMLDivElement | undefined} elDiv
*/
show_sessions(elDiv=undefined) {
if (!elDiv) {
elDiv = this.elDivSessions;
}
elDiv.replaceChildren();
// Btn for creating new chat session
let btnNew = el_create_button("New CHAT", (ev)=> {
if (this.elInUser.disabled) {
console.error(`ERRR:SimpleChat:MCUI:NewChat:Current session [${this.curChatId}] awaiting response, ignoring request...`);
alert("ERRR:SimpleChat\nMCUI:NewChat\nWait for response to pending query, before starting new chat session");
return;
}
let chatId = `Chat${Object.keys(this.simpleChats).length}`;
let chatIdGot = prompt("INFO:SimpleChat\nMCUI:NewChat\nEnter id for new chat session", chatId);
if (!chatIdGot) {
console.error("ERRR:SimpleChat:MCUI:NewChat:Skipping based on user request...");
return;
}
this.new_chat_session(chatIdGot, true);
this.create_session_btn(elDiv, chatIdGot);
el_children_config_class(elDiv, chatIdGot, "session-selected", "");
});
elDiv.appendChild(btnNew);
// Btns for existing chat sessions
let chatIds = Object.keys(this.simpleChats);
for(let cid of chatIds) {
let btn = this.create_session_btn(elDiv, cid);
if (cid == this.curChatId) {
btn.className = "session-selected";
}
}
}
create_session_btn(elDiv, cid) {
let btn = el_create_button(cid, (ev)=>{
let target = /** @type{HTMLButtonElement} */(ev.target);
console.debug(`DBUG:SimpleChat:MCUI:SessionClick:${target.id}`);
if (this.elInUser.disabled) {
console.error(`ERRR:SimpleChat:MCUI:SessionClick:${target.id}:Current session [${this.curChatId}] awaiting response, ignoring switch...`);
alert("ERRR:SimpleChat\nMCUI:SessionClick\nWait for response to pending query, before switching");
return;
}
this.handle_session_switch(target.id);
el_children_config_class(elDiv, target.id, "session-selected", "");
});
elDiv.appendChild(btn);
return btn;
}
/**
* Switch ui to the specified chatId and set curChatId to same.
* @param {string} chatId
*/
async handle_session_switch(chatId) {
let chat = this.simpleChats[chatId];
if (chat == undefined) {
console.error(`ERRR:SimpleChat:MCUI:HandleSessionSwitch:${chatId} missing...`);
return;
}
this.elInSystem.value = chat.get_system_latest();
this.elInUser.value = "";
chat.show(this.elDivChat);
this.elInUser.focus();
this.curChatId = chatId;
console.log(`INFO:SimpleChat:MCUI:HandleSessionSwitch:${chatId} entered...`);
}
}
let gMuitChat;
const gChatIds = [ "Default", "Other" ];
function startme() {
console.log("INFO:SimpleChat:StartMe:Starting...");
gMuitChat = new MultiChatUI();
for (let cid of gChatIds) {
gMuitChat.new_chat_session(cid);
}
gMuitChat.setup_ui(gChatIds[0]);
gMuitChat.show_sessions();
}
document.addEventListener("DOMContentLoaded", startme);

View file

@ -12,6 +12,8 @@
// increase max payload length to allow use of larger context size // increase max payload length to allow use of larger context size
#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576 #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
#include "httplib.h" #include "httplib.h"
// Change JSON_ASSERT from assert() to GGML_ASSERT:
#define JSON_ASSERT GGML_ASSERT
#include "json.hpp" #include "json.hpp"
// auto generated files (update with ./deps.sh) // auto generated files (update with ./deps.sh)
@ -100,7 +102,6 @@ struct slot_params {
bool stream = true; bool stream = true;
bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
uint32_t seed = -1; // RNG seed
int32_t n_keep = 0; // number of tokens to keep from initial prompt int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
int32_t n_predict = -1; // new tokens to predict int32_t n_predict = -1; // new tokens to predict
@ -649,9 +650,6 @@ struct server_context {
std::string system_prompt; std::string system_prompt;
std::vector<llama_token> system_tokens; std::vector<llama_token> system_tokens;
std::string name_user; // this should be the antiprompt
std::string name_assistant;
// slots / clients // slots / clients
std::vector<server_slot> slots; std::vector<server_slot> slots;
json default_generation_settings_for_props; json default_generation_settings_for_props;
@ -671,6 +669,15 @@ struct server_context {
llama_free_model(model); llama_free_model(model);
model = nullptr; model = nullptr;
} }
// Clear any sampling context
for (server_slot & slot : slots) {
if (slot.ctx_sampling != nullptr) {
llama_sampling_free(slot.ctx_sampling);
}
}
llama_batch_free(batch);
} }
bool load_model(const gpt_params & params_) { bool load_model(const gpt_params & params_) {
@ -859,7 +866,7 @@ struct server_context {
slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep); slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
// process "json_schema" and "grammar" // process "json_schema" and "grammar"
if (data.contains("json_schema") && !data["json_schema"].is_null() && data.contains("grammar") && !data["grammar"].is_null()) { if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) {
send_error(task, "Either \"json_schema\" or \"grammar\" can be specified, but not both", ERROR_TYPE_INVALID_REQUEST); send_error(task, "Either \"json_schema\" or \"grammar\" can be specified, but not both", ERROR_TYPE_INVALID_REQUEST);
return false; return false;
} else if (data.contains("json_schema") && !data.contains("grammar")) { } else if (data.contains("json_schema") && !data.contains("grammar")) {
@ -1012,7 +1019,7 @@ struct server_context {
sampler_names.emplace_back(sampler_name); sampler_names.emplace_back(sampler_name);
} }
} }
slot.sparams.samplers_sequence = sampler_types_from_names(sampler_names, false); slot.sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, false);
} else { } else {
slot.sparams.samplers_sequence = default_sparams.samplers_sequence; slot.sparams.samplers_sequence = default_sparams.samplers_sequence;
} }
@ -1096,15 +1103,11 @@ struct server_context {
system_need_update = false; system_need_update = false;
} }
void system_prompt_set(const json & sys_props) { bool system_prompt_set(const std::string & sys_prompt) {
system_prompt = sys_props.value("prompt", ""); system_prompt = sys_prompt;
name_user = sys_props.value("anti_prompt", "");
name_assistant = sys_props.value("assistant_name", "");
LOG_VERBOSE("system prompt process", { LOG_VERBOSE("system prompt process", {
{"system_prompt", system_prompt}, {"system_prompt", system_prompt},
{"name_user", name_user},
{"name_assistant", name_assistant},
}); });
// release all slots // release all slots
@ -1113,6 +1116,7 @@ struct server_context {
} }
system_need_update = true; system_need_update = true;
return true;
} }
bool process_token(completion_token_output & result, server_slot & slot) { bool process_token(completion_token_output & result, server_slot & slot) {
@ -1252,14 +1256,14 @@ struct server_context {
std::vector<std::string> samplers_sequence; std::vector<std::string> samplers_sequence;
samplers_sequence.reserve(slot.sparams.samplers_sequence.size()); samplers_sequence.reserve(slot.sparams.samplers_sequence.size());
for (const auto & sampler_type : slot.sparams.samplers_sequence) { for (const auto & sampler_type : slot.sparams.samplers_sequence) {
samplers_sequence.emplace_back(sampler_type_to_name_string(sampler_type)); samplers_sequence.emplace_back(llama_sampling_type_to_str(sampler_type));
} }
return json { return json {
{"n_ctx", slot.n_ctx}, {"n_ctx", slot.n_ctx},
{"n_predict", slot.n_predict}, {"n_predict", slot.n_predict},
{"model", params.model_alias}, {"model", params.model_alias},
{"seed", slot.params.seed}, {"seed", slot.sparams.seed},
{"temperature", slot.sparams.temp}, {"temperature", slot.sparams.temp},
{"dynatemp_range", slot.sparams.dynatemp_range}, {"dynatemp_range", slot.sparams.dynatemp_range},
{"dynatemp_exponent", slot.sparams.dynatemp_exponent}, {"dynatemp_exponent", slot.sparams.dynatemp_exponent},
@ -1383,9 +1387,10 @@ struct server_context {
if (!slot.params.stream && slot.stopped_word) { if (!slot.params.stream && slot.stopped_word) {
const std::vector<llama_token> stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false); const std::vector<llama_token> stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false);
size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size());
probs = std::vector<completion_token_output>( probs = std::vector<completion_token_output>(
slot.generated_token_probs.begin(), slot.generated_token_probs.begin(),
slot.generated_token_probs.end() - stop_word_toks.size()); slot.generated_token_probs.end() - safe_offset);
} else { } else {
probs = std::vector<completion_token_output>( probs = std::vector<completion_token_output>(
slot.generated_token_probs.begin(), slot.generated_token_probs.begin(),
@ -1511,7 +1516,7 @@ struct server_context {
// add subtasks // add subtasks
for (int i = 0; i < prompt_count; i++) { for (int i = 0; i < prompt_count; i++) {
json subtask_data = multiprompt_task.data; json subtask_data = multiprompt_task.data;
subtask_data["prompt"] = subtask_data["prompt"][i]; subtask_data["prompt"] = subtask_data.at("prompt")[i];
// subtasks inherit everything else (infill mode, embedding mode, etc.) // subtasks inherit everything else (infill mode, embedding mode, etc.)
request_completion(subtask_ids[i], id_multi, subtask_data, multiprompt_task.infill, multiprompt_task.embedding); request_completion(subtask_ids[i], id_multi, subtask_data, multiprompt_task.infill, multiprompt_task.embedding);
@ -1531,7 +1536,8 @@ struct server_context {
} }
if (task.data.contains("system_prompt")) { if (task.data.contains("system_prompt")) {
system_prompt_set(task.data["system_prompt"]); std::string sys_prompt = json_value(task.data, "system_prompt", std::string());
system_prompt_set(sys_prompt);
for (server_slot & slot : slots) { for (server_slot & slot : slots) {
slot.n_past = 0; slot.n_past = 0;
@ -1643,7 +1649,7 @@ struct server_context {
} break; } break;
case SERVER_TASK_TYPE_SLOT_SAVE: case SERVER_TASK_TYPE_SLOT_SAVE:
{ {
int id_slot = task.data["id_slot"]; int id_slot = task.data.at("id_slot");
server_slot * slot = get_slot(id_slot); server_slot * slot = get_slot(id_slot);
if (slot == nullptr) { if (slot == nullptr) {
send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
@ -1653,8 +1659,8 @@ struct server_context {
const size_t token_count = slot->cache_tokens.size(); const size_t token_count = slot->cache_tokens.size();
const int64_t t_start = ggml_time_us(); const int64_t t_start = ggml_time_us();
std::string filename = task.data["filename"]; std::string filename = task.data.at("filename");
std::string filepath = task.data["filepath"]; std::string filepath = task.data.at("filepath");
const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), token_count); const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), token_count);
@ -1678,7 +1684,7 @@ struct server_context {
} break; } break;
case SERVER_TASK_TYPE_SLOT_RESTORE: case SERVER_TASK_TYPE_SLOT_RESTORE:
{ {
int id_slot = task.data["id_slot"]; int id_slot = task.data.at("id_slot");
server_slot * slot = get_slot(id_slot); server_slot * slot = get_slot(id_slot);
if (slot == nullptr) { if (slot == nullptr) {
send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
@ -1687,8 +1693,8 @@ struct server_context {
const int64_t t_start = ggml_time_us(); const int64_t t_start = ggml_time_us();
std::string filename = task.data["filename"]; std::string filename = task.data.at("filename");
std::string filepath = task.data["filepath"]; std::string filepath = task.data.at("filepath");
slot->cache_tokens.resize(slot->n_ctx); slot->cache_tokens.resize(slot->n_ctx);
size_t token_count = 0; size_t token_count = 0;
@ -1720,7 +1726,7 @@ struct server_context {
} break; } break;
case SERVER_TASK_TYPE_SLOT_ERASE: case SERVER_TASK_TYPE_SLOT_ERASE:
{ {
int id_slot = task.data["id_slot"]; int id_slot = task.data.at("id_slot");
server_slot * slot = get_slot(id_slot); server_slot * slot = get_slot(id_slot);
if (slot == nullptr) { if (slot == nullptr) {
send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
@ -1975,8 +1981,7 @@ struct server_context {
slot.state = SLOT_STATE_PROCESSING; slot.state = SLOT_STATE_PROCESSING;
slot.command = SLOT_COMMAND_NONE; slot.command = SLOT_COMMAND_NONE;
slot.release(); slot.release();
slot.print_timings(); send_error(slot, "input is too large to process. increase the physical batch size", ERROR_TYPE_SERVER);
send_final_response(slot);
continue; continue;
} }
} else { } else {
@ -2265,18 +2270,32 @@ struct server_context {
llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false }; llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
result.tok = id; result.tok = id;
const int32_t n_probs = slot.sparams.n_probs; const size_t n_probs = std::min(cur_p.size, (size_t) slot.sparams.n_probs);
if (slot.sparams.temp <= 0 && n_probs > 0) { if (n_probs > 0) {
// for llama_sample_token_greedy we need to sort candidates const size_t n_valid = slot.ctx_sampling->n_valid;
llama_sample_softmax(ctx, &cur_p);
// Make sure at least n_probs top tokens are at the front of the vector:
if (slot.sparams.temp == 0.0f && n_probs > n_valid) {
llama_sample_top_k(ctx, &cur_p, n_probs, 0);
} }
for (size_t i = 0; i < std::min(cur_p.size, (size_t) n_probs); ++i) { if (slot.sparams.temp == 0.0f) {
// With greedy sampling the probabilities have possibly not been calculated.
for (size_t i = 0; i < n_probs; ++i) {
result.probs.push_back({ result.probs.push_back({
cur_p.data[i].id, cur_p.data[i].id,
cur_p.data[i].p i == 0 ? 1.0f : 0.0f
}); });
} }
} else {
for (size_t i = 0; i < n_probs; ++i) {
result.probs.push_back({
cur_p.data[i].id,
i >= n_valid ? 0.0f : cur_p.data[i].p // Tokens filtered out due to e.g. top_k have 0 probability.
});
}
}
}
if (!process_token(result, slot)) { if (!process_token(result, slot)) {
slot.release(); slot.release();
@ -2353,7 +2372,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
printf(" disable KV offload\n"); printf(" disable KV offload\n");
} }
printf(" -m FNAME, --model FNAME\n"); printf(" -m FNAME, --model FNAME\n");
printf(" model path (default: %s)\n", params.model.c_str()); printf(" model path (default: models/$filename with filename from --hf-file or --model-url if set, otherwise %s)\n", DEFAULT_MODEL_PATH);
printf(" -mu MODEL_URL, --model-url MODEL_URL\n"); printf(" -mu MODEL_URL, --model-url MODEL_URL\n");
printf(" model download url (default: unused)\n"); printf(" model download url (default: unused)\n");
printf(" -hfr REPO, --hf-repo REPO\n"); printf(" -hfr REPO, --hf-repo REPO\n");
@ -2366,6 +2385,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str()); printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
printf(" --port PORT port to listen (default (default: %d)\n", sparams.port); printf(" --port PORT port to listen (default (default: %d)\n", sparams.port);
printf(" --rpc SERVERS comma separated list of RPC servers\n");
printf(" --path PUBLIC_PATH path from which to serve static files (default: disabled)\n"); printf(" --path PUBLIC_PATH path from which to serve static files (default: disabled)\n");
printf(" --api-key API_KEY optional api key to enhance server security. If set, requests must include this key for access.\n"); printf(" --api-key API_KEY optional api key to enhance server security. If set, requests must include this key for access.\n");
printf(" --api-key-file FNAME path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access.\n"); printf(" --api-key-file FNAME path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access.\n");
@ -2377,6 +2397,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
printf(" --embeddings enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled"); printf(" --embeddings enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel); printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel);
printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: enabled)\n"); printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: enabled)\n");
printf(" -fa, --flash-attn enable Flash Attention (default: %s)\n", params.flash_attn ? "enabled" : "disabled");
printf(" -spf FNAME, --system-prompt-file FNAME\n"); printf(" -spf FNAME, --system-prompt-file FNAME\n");
printf(" set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n"); printf(" set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
printf(" -ctk TYPE, --cache-type-k TYPE\n"); printf(" -ctk TYPE, --cache-type-k TYPE\n");
@ -2417,6 +2438,12 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
break; break;
} }
sparams.port = std::stoi(argv[i]); sparams.port = std::stoi(argv[i]);
} else if (arg == "--rpc") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.rpc_servers = argv[i];
} else if (arg == "--host") { } else if (arg == "--host") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -2742,6 +2769,8 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
params.embedding = true; params.embedding = true;
} else if (arg == "-cb" || arg == "--cont-batching") { } else if (arg == "-cb" || arg == "--cont-batching") {
params.cont_batching = true; params.cont_batching = true;
} else if (arg == "-fa" || arg == "--flash-attn") {
params.flash_attn = true;
} else if (arg == "-np" || arg == "--parallel") { } else if (arg == "-np" || arg == "--parallel") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -2823,7 +2852,7 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
invalid_param = true; invalid_param = true;
break; break;
} }
if (!parse_kv_override(argv[i], params.kv_overrides)) { if (!string_parse_kv_override(argv[i], params.kv_overrides)) {
fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]); fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
invalid_param = true; invalid_param = true;
break; break;
@ -2835,6 +2864,8 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
} }
} }
gpt_params_handle_model_default(params);
if (!params.kv_overrides.empty()) { if (!params.kv_overrides.empty()) {
params.kv_overrides.emplace_back(); params.kv_overrides.emplace_back();
params.kv_overrides.back().key[0] = 0; params.kv_overrides.back().key[0] = 0;
@ -2896,7 +2927,7 @@ int main(int argc, char ** argv) {
server_params_parse(argc, argv, sparams, params); server_params_parse(argc, argv, sparams, params);
if (!sparams.system_prompt.empty()) { if (!sparams.system_prompt.empty()) {
ctx_server.system_prompt_set(json::parse(sparams.system_prompt)); ctx_server.system_prompt_set(sparams.system_prompt);
} }
if (params.model_alias == "unknown") { if (params.model_alias == "unknown") {
@ -3116,8 +3147,8 @@ int main(int argc, char ** argv) {
server_task_result result = ctx_server.queue_results.recv(task.id); server_task_result result = ctx_server.queue_results.recv(task.id);
ctx_server.queue_results.remove_waiting_task_id(task.id); ctx_server.queue_results.remove_waiting_task_id(task.id);
const int n_idle_slots = result.data["idle"]; const int n_idle_slots = result.data.at("idle");
const int n_processing_slots = result.data["processing"]; const int n_processing_slots = result.data.at("processing");
json health = { json health = {
{"status", "ok"}, {"status", "ok"},
@ -3127,7 +3158,7 @@ int main(int argc, char ** argv) {
res.status = 200; // HTTP OK res.status = 200; // HTTP OK
if (sparams.slots_endpoint && req.has_param("include_slots")) { if (sparams.slots_endpoint && req.has_param("include_slots")) {
health["slots"] = result.data["slots"]; health["slots"] = result.data.at("slots");
} }
if (n_idle_slots == 0) { if (n_idle_slots == 0) {
@ -3171,7 +3202,7 @@ int main(int argc, char ** argv) {
server_task_result result = ctx_server.queue_results.recv(task.id); server_task_result result = ctx_server.queue_results.recv(task.id);
ctx_server.queue_results.remove_waiting_task_id(task.id); ctx_server.queue_results.remove_waiting_task_id(task.id);
res.set_content(result.data["slots"].dump(), "application/json"); res.set_content(result.data.at("slots").dump(), "application/json");
res.status = 200; // HTTP OK res.status = 200; // HTTP OK
}; };
@ -3198,32 +3229,32 @@ int main(int argc, char ** argv) {
json data = result.data; json data = result.data;
const uint64_t n_prompt_tokens_processed = data["n_prompt_tokens_processed"]; const uint64_t n_prompt_tokens_processed = data.at("n_prompt_tokens_processed");
const uint64_t t_prompt_processing = data["t_prompt_processing"]; const uint64_t t_prompt_processing = data.at("t_prompt_processing");
const uint64_t n_tokens_predicted = data["n_tokens_predicted"]; const uint64_t n_tokens_predicted = data.at("n_tokens_predicted");
const uint64_t t_tokens_generation = data["t_tokens_generation"]; const uint64_t t_tokens_generation = data.at("t_tokens_generation");
const int32_t kv_cache_used_cells = data["kv_cache_used_cells"]; const int32_t kv_cache_used_cells = data.at("kv_cache_used_cells");
// metrics definition: https://prometheus.io/docs/practices/naming/#metric-names // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names
json all_metrics_def = json { json all_metrics_def = json {
{"counter", {{ {"counter", {{
{"name", "prompt_tokens_total"}, {"name", "prompt_tokens_total"},
{"help", "Number of prompt tokens processed."}, {"help", "Number of prompt tokens processed."},
{"value", (uint64_t) data["n_prompt_tokens_processed_total"]} {"value", (uint64_t) data.at("n_prompt_tokens_processed_total")}
}, { }, {
{"name", "prompt_seconds_total"}, {"name", "prompt_seconds_total"},
{"help", "Prompt process time"}, {"help", "Prompt process time"},
{"value", (uint64_t) data["t_prompt_processing_total"] / 1.e3} {"value", (uint64_t) data.at("t_prompt_processing_total") / 1.e3}
}, { }, {
{"name", "tokens_predicted_total"}, {"name", "tokens_predicted_total"},
{"help", "Number of generation tokens processed."}, {"help", "Number of generation tokens processed."},
{"value", (uint64_t) data["n_tokens_predicted_total"]} {"value", (uint64_t) data.at("n_tokens_predicted_total")}
}, { }, {
{"name", "tokens_predicted_seconds_total"}, {"name", "tokens_predicted_seconds_total"},
{"help", "Predict process time"}, {"help", "Predict process time"},
{"value", (uint64_t) data["t_tokens_generation_total"] / 1.e3} {"value", (uint64_t) data.at("t_tokens_generation_total") / 1.e3}
}}}, }}},
{"gauge", {{ {"gauge", {{
{"name", "prompt_tokens_seconds"}, {"name", "prompt_tokens_seconds"},
@ -3240,15 +3271,15 @@ int main(int argc, char ** argv) {
},{ },{
{"name", "kv_cache_tokens"}, {"name", "kv_cache_tokens"},
{"help", "KV-cache tokens."}, {"help", "KV-cache tokens."},
{"value", (uint64_t) data["kv_cache_tokens_count"]} {"value", (uint64_t) data.at("kv_cache_tokens_count")}
},{ },{
{"name", "requests_processing"}, {"name", "requests_processing"},
{"help", "Number of request processing."}, {"help", "Number of request processing."},
{"value", (uint64_t) data["processing"]} {"value", (uint64_t) data.at("processing")}
},{ },{
{"name", "requests_deferred"}, {"name", "requests_deferred"},
{"help", "Number of request deferred."}, {"help", "Number of request deferred."},
{"value", (uint64_t) data["deferred"]} {"value", (uint64_t) data.at("deferred")}
}}} }}}
}; };
@ -3259,8 +3290,8 @@ int main(int argc, char ** argv) {
const auto & metrics_def = el.value(); const auto & metrics_def = el.value();
for (const auto & metric_def : metrics_def) { for (const auto & metric_def : metrics_def) {
const std::string name = metric_def["name"]; const std::string name = metric_def.at("name");
const std::string help = metric_def["help"]; const std::string help = metric_def.at("help");
auto value = json_value(metric_def, "value", 0.); auto value = json_value(metric_def, "value", 0.);
prometheus << "# HELP llamacpp:" << name << " " << help << "\n" prometheus << "# HELP llamacpp:" << name << " " << help << "\n"
@ -3269,7 +3300,7 @@ int main(int argc, char ** argv) {
} }
} }
const int64_t t_start = data["t_start"]; const int64_t t_start = data.at("t_start");
res.set_header("Process-Start-Time-Unix", std::to_string(t_start)); res.set_header("Process-Start-Time-Unix", std::to_string(t_start));
res.set_content(prometheus.str(), "text/plain; version=0.0.4"); res.set_content(prometheus.str(), "text/plain; version=0.0.4");
@ -3278,8 +3309,8 @@ int main(int argc, char ** argv) {
const auto handle_slots_save = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) { const auto handle_slots_save = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
json request_data = json::parse(req.body); json request_data = json::parse(req.body);
std::string filename = request_data["filename"]; std::string filename = request_data.at("filename");
if (!validate_file_name(filename)) { if (!fs_validate_filename(filename)) {
res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST)); res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
return; return;
} }
@ -3308,8 +3339,8 @@ int main(int argc, char ** argv) {
const auto handle_slots_restore = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) { const auto handle_slots_restore = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
json request_data = json::parse(req.body); json request_data = json::parse(req.body);
std::string filename = request_data["filename"]; std::string filename = request_data.at("filename");
if (!validate_file_name(filename)) { if (!fs_validate_filename(filename)) {
res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST)); res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
return; return;
} }
@ -3385,8 +3416,7 @@ int main(int argc, char ** argv) {
const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) { const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
json data = { json data = {
{ "user_name", ctx_server.name_user.c_str() }, { "system_prompt", ctx_server.system_prompt.c_str() },
{ "assistant_name", ctx_server.name_assistant.c_str() },
{ "default_generation_settings", ctx_server.default_generation_settings_for_props }, { "default_generation_settings", ctx_server.default_generation_settings_for_props },
{ "total_slots", ctx_server.params.n_parallel } { "total_slots", ctx_server.params.n_parallel }
}; };
@ -3627,7 +3657,8 @@ int main(int argc, char ** argv) {
std::vector<llama_token> tokens; std::vector<llama_token> tokens;
if (body.count("content") != 0) { if (body.count("content") != 0) {
tokens = ctx_server.tokenize(body["content"], false); const bool add_special = json_value(body, "add_special", false);
tokens = ctx_server.tokenize(body.at("content"), add_special);
} }
const json data = format_tokenizer_response(tokens); const json data = format_tokenizer_response(tokens);
return res.set_content(data.dump(), "application/json; charset=utf-8"); return res.set_content(data.dump(), "application/json; charset=utf-8");
@ -3639,7 +3670,7 @@ int main(int argc, char ** argv) {
std::string content; std::string content;
if (body.count("tokens") != 0) { if (body.count("tokens") != 0) {
const std::vector<llama_token> tokens = body["tokens"]; const std::vector<llama_token> tokens = body.at("tokens");
content = tokens_to_str(ctx_server.ctx, tokens.cbegin(), tokens.cend()); content = tokens_to_str(ctx_server.ctx, tokens.cbegin(), tokens.cend());
} }
@ -3662,10 +3693,10 @@ int main(int argc, char ** argv) {
json prompt; json prompt;
if (body.count("input") != 0) { if (body.count("input") != 0) {
is_openai = true; is_openai = true;
prompt = body["input"]; prompt = body.at("input");
} else if (body.count("content") != 0) { } else if (body.count("content") != 0) {
// with "content", we only support single prompt // with "content", we only support single prompt
prompt = std::vector<std::string>{body["content"]}; prompt = std::vector<std::string>{body.at("content")};
} else { } else {
res_error(res, format_error_response("\"input\" or \"content\" must be provided", ERROR_TYPE_INVALID_REQUEST)); res_error(res, format_error_response("\"input\" or \"content\" must be provided", ERROR_TYPE_INVALID_REQUEST));
return; return;
@ -3684,7 +3715,7 @@ int main(int argc, char ** argv) {
if (!result.error) { if (!result.error) {
if (result.data.count("results")) { if (result.data.count("results")) {
// result for multi-task // result for multi-task
responses = result.data["results"]; responses = result.data.at("results");
} else { } else {
// result for single task // result for single task
responses = std::vector<json>{result.data}; responses = std::vector<json>{result.data};

View file

@ -5,7 +5,7 @@ Feature: llama.cpp server
Background: Server startup Background: Server startup
Given a server listening on localhost:8080 Given a server listening on localhost:8080
And a model url https://huggingface.co/ggml-org/models/resolve/main/bert-bge-small/ggml-model-f16.gguf And a model url https://huggingface.co/ggml-org/models/resolve/main/bert-bge-small/ggml-model-f16.gguf
And a model file ggml-model-f16.gguf And a model file bert-bge-small.gguf
And a model alias bert-bge-small And a model alias bert-bge-small
And 42 as server seed And 42 as server seed
And 2 slots And 2 slots

View file

@ -7,51 +7,112 @@ Feature: Results
And a model file tinyllamas/split/stories15M-00001-of-00003.gguf from HF repo ggml-org/models And a model file tinyllamas/split/stories15M-00001-of-00003.gguf from HF repo ggml-org/models
And a model file test-model-00001-of-00003.gguf And a model file test-model-00001-of-00003.gguf
And 128 as batch size And 128 as batch size
And 256 KV cache size And 1024 KV cache size
And 128 max tokens to predict And 128 max tokens to predict
Scenario Outline: Multi users completion
Given <n_slots> slots
And continuous batching And continuous batching
Scenario Outline: consistent results with same seed
Given <n_slots> slots
And 1.0 temperature
Then the server is starting Then the server is starting
Then the server is healthy Then the server is healthy
Given 42 as seed Given 4 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 42
And a prompt:
"""
Write a very long story about AI.
"""
Given 42 as seed
And a prompt:
"""
Write a very long story about AI.
"""
Given 42 as seed
And a prompt:
"""
Write a very long story about AI.
"""
Given 42 as seed
And a prompt:
"""
Write a very long story about AI.
"""
Given 42 as seed
And a prompt:
"""
Write a very long story about AI.
"""
Given concurrent completion requests Given concurrent completion requests
Then the server is busy Then the server is busy
Then the server is idle Then the server is idle
And all slots are idle And all slots are idle
Then all predictions are equal Then all predictions are equal
Examples:
| n_slots |
| 1 |
# FIXME: unified KV cache nondeterminism
# | 2 |
Scenario Outline: different results with different seed
Given <n_slots> slots
And 1.0 temperature
Then the server is starting
Then the server is healthy
Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 42
Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 43
Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 44
Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 45
Given concurrent completion requests
Then the server is busy
Then the server is idle
And all slots are idle
Then all predictions are different
Examples: Examples:
| n_slots | | n_slots |
| 1 | | 1 |
| 2 | | 2 |
Scenario Outline: consistent results with same seed and varying batch size
Given 4 slots
And <temp> temperature
# And 0 as draft
Then the server is starting
Then the server is healthy
Given 1 prompts "Write a very long story about AI." with seed 42
And concurrent completion requests
# Then the server is busy # Not all slots will be utilized.
Then the server is idle
And all slots are idle
Given <n_parallel> prompts "Write a very long story about AI." with seed 42
And concurrent completion requests
# Then the server is busy # Not all slots will be utilized.
Then the server is idle
And all slots are idle
Then all predictions are equal
Examples:
| n_parallel | temp |
| 1 | 0.0 |
| 1 | 1.0 |
# FIXME: unified KV cache nondeterminism
# See https://github.com/ggerganov/whisper.cpp/issues/1941#issuecomment-1986923227
# and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574
# and https://github.com/ggerganov/llama.cpp/pull/7347 .
# | 2 | 0.0 |
# | 4 | 0.0 |
# | 2 | 1.0 |
# | 4 | 1.0 |
Scenario Outline: consistent token probs with same seed and prompt
Given <n_slots> slots
And <n_kv> KV cache size
And 1.0 temperature
And <n_predict> max tokens to predict
Then the server is starting
Then the server is healthy
Given 1 prompts "The meaning of life is" with seed 42
And concurrent completion requests
# Then the server is busy # Not all slots will be utilized.
Then the server is idle
And all slots are idle
Given <n_parallel> prompts "The meaning of life is" with seed 42
And concurrent completion requests
# Then the server is busy # Not all slots will be utilized.
Then the server is idle
And all slots are idle
Then all token probabilities are equal
Examples:
| n_slots | n_kv | n_predict | n_parallel |
| 4 | 1024 | 1 | 1 |
# FIXME: unified KV cache nondeterminism
# See https://github.com/ggerganov/whisper.cpp/issues/1941#issuecomment-1986923227
# and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574
# and https://github.com/ggerganov/llama.cpp/pull/7347 .
# | 4 | 1024 | 1 | 4 |
# | 4 | 1024 | 100 | 1 |
# This test still fails even the above patches; the first token probabilities are already different.
# | 4 | 1024 | 100 | 4 |

View file

@ -7,6 +7,7 @@ Feature: llama.cpp server
And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
And a model file test-model.gguf And a model file test-model.gguf
And a model alias tinyllama-2 And a model alias tinyllama-2
And BOS token is 1
And 42 as server seed And 42 as server seed
# KV Cache corresponds to the total amount of tokens # KV Cache corresponds to the total amount of tokens
# that can be stored across all independent sequences: #4130 # that can be stored across all independent sequences: #4130
@ -91,7 +92,18 @@ Feature: llama.cpp server
""" """
What is the capital of France ? What is the capital of France ?
""" """
Then tokens can be detokenize Then tokens can be detokenized
And tokens do not begin with BOS
Scenario: Tokenize w/ BOS
Given adding special tokens
When tokenizing:
"""
What is the capital of Germany?
"""
Then tokens begin with BOS
Given first token is removed
Then tokens can be detokenized
Scenario: Models available Scenario: Models available
Given available models Given available models

View file

@ -23,6 +23,7 @@ from prometheus_client import parser
def step_server_config(context, server_fqdn, server_port): def step_server_config(context, server_fqdn, server_port):
context.server_fqdn = server_fqdn context.server_fqdn = server_fqdn
context.server_port = int(server_port) context.server_port = int(server_port)
context.n_threads = None
context.n_gpu_layer = None context.n_gpu_layer = None
if 'PORT' in os.environ: if 'PORT' in os.environ:
context.server_port = int(os.environ['PORT']) context.server_port = int(os.environ['PORT'])
@ -65,6 +66,7 @@ def step_server_config(context, server_fqdn, server_port):
context.server_seed = None context.server_seed = None
context.user_api_key = None context.user_api_key = None
context.response_format = None context.response_format = None
context.temperature = None
context.tasks_result = [] context.tasks_result = []
context.concurrent_tasks = [] context.concurrent_tasks = []
@ -108,6 +110,11 @@ def step_n_gpu_layer(context, ngl):
context.n_gpu_layer = ngl context.n_gpu_layer = ngl
@step('{n_threads:d} threads')
def step_n_threads(context, n_threads):
context.n_thread = n_threads
@step('{draft:d} as draft') @step('{draft:d} as draft')
def step_draft(context, draft): def step_draft(context, draft):
context.draft = draft context.draft = draft
@ -192,7 +199,7 @@ async def step_wait_for_the_server_to_be_started(context, expecting_status):
case 'ready' | 'idle': case 'ready' | 'idle':
await wait_for_health_status(context, context.base_url, 200, 'ok', await wait_for_health_status(context, context.base_url, 200, 'ok',
timeout=10, timeout=30,
params={'fail_on_no_slot': 0, 'include_slots': 0}, params={'fail_on_no_slot': 0, 'include_slots': 0},
slots_idle=context.n_slots, slots_idle=context.n_slots,
slots_processing=0, slots_processing=0,
@ -232,15 +239,17 @@ async def step_all_slots_status(context, expected_slot_status_string):
@async_run_until_complete @async_run_until_complete
async def step_request_completion(context, api_error): async def step_request_completion(context, api_error):
expect_api_error = api_error == 'raised' expect_api_error = api_error == 'raised'
seeds = await completions_seed(context, num_seeds=1)
completion = await request_completion(context.prompts.pop(), completion = await request_completion(context.prompts.pop(),
seeds[0] if seeds is not None else seeds,
context.base_url, context.base_url,
debug=context.debug, debug=context.debug,
n_predict=context.n_predict, n_predict=context.n_predict,
cache_prompt=context.cache_prompt, cache_prompt=context.cache_prompt,
id_slot=context.id_slot, id_slot=context.id_slot,
seed=await completions_seed(context),
expect_api_error=expect_api_error, expect_api_error=expect_api_error,
user_api_key=context.user_api_key) user_api_key=context.user_api_key,
temperature=context.temperature)
context.tasks_result.append(completion) context.tasks_result.append(completion)
if context.debug: if context.debug:
print(f"Completion response: {completion}") print(f"Completion response: {completion}")
@ -269,6 +278,24 @@ async def step_predictions_equal(context):
context.tasks_result = [] context.tasks_result = []
@step('all predictions are different')
@async_run_until_complete
async def step_predictions_different(context):
n_completions = await gather_tasks_results(context)
assert n_completions >= 2, "need at least 2 completions"
assert_all_predictions_different(context.tasks_result)
context.tasks_result = []
@step('all token probabilities are equal')
@async_run_until_complete
async def step_token_probabilities_equal(context):
n_completions = await gather_tasks_results(context)
assert n_completions >= 2, "need at least 2 completions"
assert_all_token_probabilities_equal(context.tasks_result)
context.tasks_result = []
@step('the completion is truncated') @step('the completion is truncated')
def step_assert_completion_truncated(context): def step_assert_completion_truncated(context):
step_assert_completion_truncated(context, '') step_assert_completion_truncated(context, '')
@ -311,6 +338,11 @@ def step_response_format(context, response_format):
context.response_format = json.loads(response_format) context.response_format = json.loads(response_format)
@step('{temperature:f} temperature')
def step_temperature(context, temperature):
context.temperature = temperature
@step('streaming is {enable_streaming}') @step('streaming is {enable_streaming}')
def step_streaming(context, enable_streaming): def step_streaming(context, enable_streaming):
context.enable_streaming = enable_streaming == 'enabled' context.enable_streaming = enable_streaming == 'enabled'
@ -353,7 +385,15 @@ def step_n_ubatch(context, n_ubatch):
@step('{seed:d} as seed') @step('{seed:d} as seed')
def step_seed(context, seed): def step_seed(context, seed):
context.seed = seed if context.seed is None:
context.seed = [seed]
else:
context.seed.append(seed)
@step('BOS token is {bos:d}')
def step_bos_token(context, bos):
context.bos = bos
@step('a prefix prompt') @step('a prefix prompt')
@ -413,7 +453,9 @@ async def step_oai_chat_completions(context, api_error):
if context.debug: if context.debug:
print(f"Submitting OAI compatible completions request...") print(f"Submitting OAI compatible completions request...")
expect_api_error = api_error == 'raised' expect_api_error = api_error == 'raised'
seeds = await completions_seed(context, num_seeds=1),
completion = await oai_chat_completions(context.prompts.pop(), completion = await oai_chat_completions(context.prompts.pop(),
seeds[0] if seeds is not None else seeds,
context.system_prompt, context.system_prompt,
context.base_url, context.base_url,
'/v1/chat', '/v1/chat',
@ -429,8 +471,6 @@ async def step_oai_chat_completions(context, api_error):
response_format=context.response_format response_format=context.response_format
if hasattr(context, 'response_format') else None, if hasattr(context, 'response_format') else None,
seed=await completions_seed(context),
user_api_key=context.user_api_key user_api_key=context.user_api_key
if hasattr(context, 'user_api_key') else None, if hasattr(context, 'user_api_key') else None,
@ -457,10 +497,21 @@ def step_a_prompt_prompt(context, prompt):
context.n_prompts = len(context.prompts) context.n_prompts = len(context.prompts)
@step('{num_prompts:d} prompts {prompt} with seed {seed:d}')
def step_many_prompts(context, num_prompts, prompt, seed):
if context.seed is None:
context.seed = []
for _ in range(num_prompts):
context.seed.append(seed)
context.prompts.append(prompt)
context.n_prompts = len(context.prompts)
@step('concurrent completion requests') @step('concurrent completion requests')
@async_run_until_complete() @async_run_until_complete()
async def step_concurrent_completion_requests(context): async def step_concurrent_completion_requests(context):
await concurrent_requests(context, await concurrent_requests(
context,
request_completion, request_completion,
# prompt is inserted automatically # prompt is inserted automatically
context.base_url, context.base_url,
@ -468,9 +519,9 @@ async def step_concurrent_completion_requests(context):
prompt_prefix=context.prompt_prefix, prompt_prefix=context.prompt_prefix,
prompt_suffix=context.prompt_suffix, prompt_suffix=context.prompt_suffix,
n_predict=context.n_predict if hasattr(context, 'n_predict') else None, n_predict=context.n_predict if hasattr(context, 'n_predict') else None,
seed=await completions_seed(context), user_api_key=context.user_api_key if hasattr(context, 'user_api_key') else None,
user_api_key=context.user_api_key if hasattr(context, temperature=context.temperature,
'user_api_key') else None) )
@step('concurrent OAI completions requests') @step('concurrent OAI completions requests')
@ -490,7 +541,6 @@ async def step_oai_chat_completions(context):
if hasattr(context, 'enable_streaming') else None, if hasattr(context, 'enable_streaming') else None,
response_format=context.response_format response_format=context.response_format
if hasattr(context, 'response_format') else None, if hasattr(context, 'response_format') else None,
seed=await completions_seed(context),
user_api_key=context.user_api_key user_api_key=context.user_api_key
if hasattr(context, 'user_api_key') else None) if hasattr(context, 'user_api_key') else None)
@ -512,10 +562,6 @@ async def step_oai_chat_completions(context):
if hasattr(context, 'enable_streaming') else None, if hasattr(context, 'enable_streaming') else None,
response_format=context.response_format response_format=context.response_format
if hasattr(context, 'response_format') else None, if hasattr(context, 'response_format') else None,
seed=context.seed
if hasattr(context, 'seed') else
context.server_seed
if hasattr(context, 'server_seed') else None,
user_api_key=context.user_api_key user_api_key=context.user_api_key
if hasattr(context, 'user_api_key') else None) if hasattr(context, 'user_api_key') else None)
@ -544,7 +590,7 @@ async def all_prompts_are_predicted(context, expected_predicted_n=None):
@async_run_until_complete @async_run_until_complete
async def step_compute_embedding(context): async def step_compute_embedding(context):
context.n_prompts = 1 context.n_prompts = 1
context.embeddings = await request_embedding(context_text(context), base_url=context.base_url) context.embeddings = await request_embedding(context_text(context), None, base_url=context.base_url)
@step('all embeddings are the same') @step('all embeddings are the same')
@ -585,7 +631,7 @@ def step_assert_embeddings(context):
@async_run_until_complete @async_run_until_complete
async def step_oai_compute_embeddings(context): async def step_oai_compute_embeddings(context):
context.n_prompts = 1 context.n_prompts = 1
context.embeddings = await request_oai_embeddings(context_text(context), context.embeddings = await request_oai_embeddings(context_text(context), None,
base_url=context.base_url, base_url=context.base_url,
user_api_key=context.user_api_key, user_api_key=context.user_api_key,
model=context.model) model=context.model)
@ -594,7 +640,7 @@ async def step_oai_compute_embeddings(context):
@step('an OAI compatible embeddings computation request for multiple inputs') @step('an OAI compatible embeddings computation request for multiple inputs')
@async_run_until_complete @async_run_until_complete
async def step_oai_compute_embeddings_multiple_inputs(context): async def step_oai_compute_embeddings_multiple_inputs(context):
context.embeddings = await request_oai_embeddings(context.prompts, context.embeddings = await request_oai_embeddings(context.prompts, None,
base_url=context.base_url, base_url=context.base_url,
user_api_key=context.user_api_key, user_api_key=context.user_api_key,
model=context.model) model=context.model)
@ -630,21 +676,29 @@ async def all_embeddings_are_generated(context):
assert_embeddings(context.tasks_result.pop().pop()) assert_embeddings(context.tasks_result.pop().pop())
@step('adding special tokens')
def step_tokenize_set_add_special(context):
context.tokenize_add_special = True
@step('tokenizing') @step('tokenizing')
@async_run_until_complete @async_run_until_complete
async def step_tokenize(context): async def step_tokenize(context):
context.tokenized_text = context_text(context) context.tokenized_text = context_text(context)
async with aiohttp.ClientSession() as session: async with aiohttp.ClientSession() as session:
async with session.post(f'{context.base_url}/tokenize', tokenize_args = {
json={
"content": context.tokenized_text, "content": context.tokenized_text,
}) as response: }
if getattr(context, 'tokenize_add_special', None) is not None:
tokenize_args['add_special'] = context.tokenize_add_special
async with session.post(f'{context.base_url}/tokenize',
json=tokenize_args) as response:
assert response.status == 200 assert response.status == 200
tokenize_json = await response.json() tokenize_json = await response.json()
context.tokens = tokenize_json['tokens'] context.tokens = tokenize_json['tokens']
@step('tokens can be detokenize') @step('tokens can be detokenized')
@async_run_until_complete @async_run_until_complete
async def step_detokenize(context): async def step_detokenize(context):
assert len(context.tokens) > 0 assert len(context.tokens) > 0
@ -659,6 +713,21 @@ async def step_detokenize(context):
assert context.tokenized_text == detokenize_json['content'].strip() assert context.tokenized_text == detokenize_json['content'].strip()
@step('tokens begin with BOS')
def step_strings_for_tokenization(context):
assert context.tokens[0] == context.bos
@step('tokens do not begin with BOS')
def step_strings_for_tokenization(context):
assert context.tokens[0] != context.bos
@step('first token is removed')
def step_strings_for_tokenization(context):
context.tokens = context.tokens[1:]
@step('an OPTIONS request is sent from {origin}') @step('an OPTIONS request is sent from {origin}')
@async_run_until_complete @async_run_until_complete
async def step_options_request(context, origin): async def step_options_request(context, origin):
@ -740,8 +809,9 @@ async def concurrent_requests(context, f_completion, *args, **kwargs):
if context.debug: if context.debug:
print(f"starting {context.n_prompts} concurrent completion requests...") print(f"starting {context.n_prompts} concurrent completion requests...")
assert context.n_prompts > 0 assert context.n_prompts > 0
seeds = await completions_seed(context)
for prompt_no in range(context.n_prompts): for prompt_no in range(context.n_prompts):
shifted_args = [context.prompts.pop(), *args] shifted_args = [context.prompts.pop(), seeds[prompt_no], *args]
context.concurrent_tasks.append(asyncio.create_task(f_completion(*shifted_args, **kwargs))) context.concurrent_tasks.append(asyncio.create_task(f_completion(*shifted_args, **kwargs)))
await asyncio.sleep(0.1) await asyncio.sleep(0.1)
@ -781,6 +851,7 @@ def step_server_responds_with_status_code(context, status_code):
async def request_completion(prompt, async def request_completion(prompt,
seed,
base_url, base_url,
debug=False, debug=False,
prompt_prefix=None, prompt_prefix=None,
@ -788,9 +859,9 @@ async def request_completion(prompt,
n_predict=None, n_predict=None,
cache_prompt=False, cache_prompt=False,
id_slot=None, id_slot=None,
seed=None,
expect_api_error=None, expect_api_error=None,
user_api_key=None): user_api_key=None,
temperature=None):
if debug: if debug:
print(f"Sending completion request: {prompt}") print(f"Sending completion request: {prompt}")
origin = "my.super.domain" origin = "my.super.domain"
@ -811,7 +882,9 @@ async def request_completion(prompt,
"n_predict": n_predict if n_predict is not None else -1, "n_predict": n_predict if n_predict is not None else -1,
"cache_prompt": cache_prompt, "cache_prompt": cache_prompt,
"id_slot": id_slot, "id_slot": id_slot,
"seed": seed if seed is not None else 42 "seed": seed if seed is not None else 42,
"temperature": temperature if temperature is not None else 0.8,
"n_probs": 2,
}, },
headers=headers, headers=headers,
timeout=3600) as response: timeout=3600) as response:
@ -824,16 +897,17 @@ async def request_completion(prompt,
async def oai_chat_completions(user_prompt, async def oai_chat_completions(user_prompt,
seed,
system_prompt, system_prompt,
base_url, base_url,
base_path, base_path,
async_client, async_client,
debug=False, debug=False,
temperature=None,
model=None, model=None,
n_predict=None, n_predict=None,
enable_streaming=None, enable_streaming=None,
response_format=None, response_format=None,
seed=None,
user_api_key=None, user_api_key=None,
expect_api_error=None): expect_api_error=None):
if debug: if debug:
@ -856,7 +930,8 @@ async def oai_chat_completions(user_prompt,
"model": model, "model": model,
"max_tokens": n_predict, "max_tokens": n_predict,
"stream": enable_streaming, "stream": enable_streaming,
"seed": seed "temperature": temperature if temperature is not None else 0.0,
"seed": seed,
} }
if response_format is not None: if response_format is not None:
payload['response_format'] = response_format payload['response_format'] = response_format
@ -882,7 +957,7 @@ async def oai_chat_completions(user_prompt,
while event_received: while event_received:
event_received = False event_received = False
async for line_in_bytes in response.content: async for line_in_bytes in response.content:
line = line_in_bytes.decode('utf8') line = line_in_bytes.decode('utf-8')
line = line.rstrip('\n').rstrip('\r') line = line.rstrip('\n').rstrip('\r')
if line == '': if line == '':
continue continue
@ -921,7 +996,8 @@ async def oai_chat_completions(user_prompt,
max_tokens=n_predict, max_tokens=n_predict,
stream=enable_streaming, stream=enable_streaming,
response_format=payload.get('response_format'), response_format=payload.get('response_format'),
seed=seed seed=seed,
temperature=payload['temperature']
) )
except openai.error.AuthenticationError as e: except openai.error.AuthenticationError as e:
if expect_api_error is not None and expect_api_error: if expect_api_error is not None and expect_api_error:
@ -952,7 +1028,7 @@ async def oai_chat_completions(user_prompt,
return completion_response return completion_response
async def request_embedding(content, base_url=None): async def request_embedding(content, seed, base_url=None):
async with aiohttp.ClientSession() as session: async with aiohttp.ClientSession() as session:
async with session.post(f'{base_url}/embedding', async with session.post(f'{base_url}/embedding',
json={ json={
@ -963,7 +1039,7 @@ async def request_embedding(content, base_url=None):
return [response_json['embedding']] return [response_json['embedding']]
async def request_oai_embeddings(input, async def request_oai_embeddings(input, seed,
base_url=None, user_api_key=None, base_url=None, user_api_key=None,
model=None, async_client=False): model=None, async_client=False):
# openai client always expects an api_key # openai client always expects an api_key
@ -1036,21 +1112,48 @@ def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re
f' {n_predicted} <> {expected_predicted_n}') f' {n_predicted} <> {expected_predicted_n}')
def assert_all_predictions_equal(completion_responses): def assert_all_predictions_equal(completion_responses):
content_0 = completion_responses[0]['content']
if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON': if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
print(f"content 0: {content_0}") for i, response_i in enumerate(completion_responses):
content_i = response_i['content']
print(f"content {i}: {content_i}")
for i, response_i in enumerate(completion_responses):
content_i = response_i['content']
for j, response_j in enumerate(completion_responses):
if i == j:
continue
content_j = response_j['content']
assert content_i == content_j, "contents not equal"
i = 1
for response in completion_responses[1:]:
content = response['content']
def assert_all_predictions_different(completion_responses):
if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON': if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
print(f"content {i}: {content}") for i, response_i in enumerate(completion_responses):
content_i = response_i['content']
print(f"content {i}: {content_i}")
for i, response_i in enumerate(completion_responses):
content_i = response_i['content']
for j, response_j in enumerate(completion_responses):
if i == j:
continue
content_j = response_j['content']
assert content_i != content_j, "contents not different"
assert content == content_0, "contents not equal"
i += 1 def assert_all_token_probabilities_equal(completion_responses):
n_predict = len(completion_responses[0]['completion_probabilities'])
if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
for pos in range(n_predict):
for i, response_i in enumerate(completion_responses):
probs_i = response_i['completion_probabilities'][pos]['probs']
print(f"pos {pos}, probs {i}: {probs_i}")
for pos in range(n_predict):
for i, response_i in enumerate(completion_responses):
probs_i = response_i['completion_probabilities'][pos]['probs']
for j, response_j in enumerate(completion_responses):
if i == j:
continue
probs_j = response_j['completion_probabilities'][pos]['probs']
assert probs_i == probs_j, "contents not equal"
async def gather_tasks_results(context): async def gather_tasks_results(context):
@ -1145,9 +1248,22 @@ def assert_slots_status(slots, expected_slots):
f" = {expected[key]} != {slot[key]}") f" = {expected[key]} != {slot[key]}")
async def completions_seed(context): async def completions_seed(context, num_seeds=None):
return context.seed if hasattr(context, 'seed') and context.seed is not None \ if hasattr(context, "seed") and context.seed is not None:
else context.server_seed if hasattr(context, 'server_seed') else None assert len(context.seed) == context.n_prompts
if num_seeds is None:
num_seeds = context.n_prompts
assert num_seeds <= context.n_prompts
seeds = context.seed[:num_seeds]
context.seed = context.seed[num_seeds:] if num_seeds < context.n_prompts else None
return seeds
if hasattr(context, "server_seed") and context.server_seed is not None:
if num_seeds is None:
return [context.server_seed] * context.n_prompts
else:
return [context.server_seed] * num_seeds
return None
def context_text(context): def context_text(context):
@ -1178,6 +1294,8 @@ def start_server_background(context):
server_args.extend(['--batch-size', context.n_batch]) server_args.extend(['--batch-size', context.n_batch])
if context.n_ubatch: if context.n_ubatch:
server_args.extend(['--ubatch-size', context.n_ubatch]) server_args.extend(['--ubatch-size', context.n_ubatch])
if context.n_threads:
server_args.extend(['--threads', context.threads])
if context.n_gpu_layer: if context.n_gpu_layer:
server_args.extend(['--n-gpu-layers', context.n_gpu_layer]) server_args.extend(['--n-gpu-layers', context.n_gpu_layer])
if context.draft is not None: if context.draft is not None:

View file

@ -0,0 +1,5 @@
# LLaMA.cpp Server Wild Theme
Simple themes directory of sample "public" directories. To try any of these add --path to your run like `server --path=wild`.
![image](wild/wild.png)

View file

@ -0,0 +1,7 @@
# LLaMA.cpp Server Buttons Top Theme
Simple tweaks to the UI. Chat buttons at the top of the page instead of bottom so you can hit Stop instead of chasing it down the page.
To use simply run server with `--path=themes/buttons_top`
![image](buttons_top.png)

Binary file not shown.

After

Width:  |  Height:  |  Size: 117 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 4 KiB

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,5 @@
# LLaMA.cpp Server Wild Theme
Simple tweaks to the UI. To use simply run server with `--path=themes/wild`
![image](wild.png)

Binary file not shown.

After

Width:  |  Height:  |  Size: 4 KiB

File diff suppressed because it is too large Load diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 75 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 254 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 485 KiB

Some files were not shown because too many files have changed in this diff Show more