Merge branch 'ggerganov:master' into deepseek-v2
commit b24c9ed551
27 changed files with 929 additions and 473 deletions

@@ -227,20 +227,20 @@ effectiveStdenv.mkDerivation (
       )
     ]
     ++ optionals useRocm [
-      (cmakeFeature "CMAKE_C_COMPILER" "hipcc")
-      (cmakeFeature "CMAKE_CXX_COMPILER" "hipcc")
-
-      # Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM
-      # in https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt
-      # and select the line that matches the current nixpkgs version of rocBLAS.
-      # Should likely use `rocmPackages.clr.gpuTargets`.
-      "-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
+      (cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
+      (cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets))
     ]
     ++ optionals useMetalKit [
       (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
       (cmakeBool "LLAMA_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
     ];

+  # Environment variables needed for ROCm
+  env = optionals useRocm {
+    ROCM_PATH = "${rocmPackages.clr}";
+    HIP_DEVICE_LIB_PATH = "${rocmPackages.rocm-device-libs}/amdgcn/bitcode";
+  };
+
   # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
   # if they haven't been added yet.
   postInstall = ''
.github/workflows/build.yml (vendored): 119 changed lines

@@ -392,6 +392,33 @@ jobs:
        cmake -DLLAMA_VULKAN=ON ..
        cmake --build . --config Release -j $(nproc)

+  ubuntu-22-cmake-hip:
+    runs-on: ubuntu-22.04
+    container: rocm/dev-ubuntu-22.04:6.0.2
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v3
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev
+
+      - name: Build with native CMake HIP support
+        id: cmake_build
+        run: |
+          cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DLLAMA_HIPBLAS=ON
+          cmake --build build --config Release -j $(nproc)
+
+      - name: Build with legacy HIP support
+        id: cmake_build_legacy_hip
+        run: |
+          cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DLLAMA_HIPBLAS=ON
+          cmake --build build2 --config Release -j $(nproc)
+
   ubuntu-22-cmake-sycl:
     runs-on: ubuntu-22.04

@@ -693,26 +720,28 @@ jobs:
     strategy:
       matrix:
         include:
-          - build: 'rpc'
+          - build: 'rpc-x64'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_RPC=ON -DBUILD_SHARED_LIBS=ON'
-          - build: 'noavx'
+          - build: 'noavx-x64'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
-          - build: 'avx2'
+          - build: 'avx2-x64'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
-          - build: 'avx'
+          - build: 'avx-x64'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
-          - build: 'avx512'
+          - build: 'avx512-x64'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
-          - build: 'clblast'
+          - build: 'clblast-x64'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
-          - build: 'openblas'
+          - build: 'openblas-x64'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
-          - build: 'kompute'
+          - build: 'kompute-x64'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
-          - build: 'vulkan'
+          - build: 'vulkan-x64'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
-          - build: 'arm64'
-            defines: '-A ARM64 -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+          - build: 'llvm-arm64'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+          - build: 'msvc-arm64'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'

     steps:
       - name: Clone

@@ -723,13 +752,13 @@ jobs:

       - name: Clone Kompute submodule
         id: clone_kompute
-        if: ${{ matrix.build == 'kompute' }}
+        if: ${{ matrix.build == 'kompute-x64' }}
         run: |
           git submodule update --init kompute

       - name: Download OpenCL SDK
         id: get_opencl
-        if: ${{ matrix.build == 'clblast' }}
+        if: ${{ matrix.build == 'clblast-x64' }}
         run: |
           curl.exe -o $env:RUNNER_TEMP/opencl.zip -L "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/v${env:OPENCL_VERSION}/OpenCL-SDK-v${env:OPENCL_VERSION}-Win-x64.zip"
           mkdir $env:RUNNER_TEMP/opencl

@@ -737,7 +766,7 @@ jobs:

       - name: Download CLBlast
         id: get_clblast
-        if: ${{ matrix.build == 'clblast' }}
+        if: ${{ matrix.build == 'clblast-x64' }}
         run: |
           curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
           curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"

@@ -750,7 +779,7 @@ jobs:

      - name: Download OpenBLAS
        id: get_openblas
-        if: ${{ matrix.build == 'openblas' }}
+        if: ${{ matrix.build == 'openblas-x64' }}
        run: |
          curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
          curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"

@@ -763,38 +792,41 @@ jobs:

      - name: Install Vulkan SDK
        id: get_vulkan
-        if: ${{ matrix.build == 'kompute' || matrix.build == 'vulkan' }}
+        if: ${{ matrix.build == 'kompute-x64' || matrix.build == 'vulkan-x64' }}
        run: |
          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
          & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
          Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
          Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"

+      - name: Install Ninja
+        id: install_ninja
+        run: |
+          choco install ninja
+
      - name: Build
        id: cmake_build
        run: |
-          mkdir build
-          cd build
-          cmake .. ${{ matrix.defines }}
-          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
+          cmake -S . -B build ${{ matrix.defines }}
+          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}

      - name: Add clblast.dll
        id: add_clblast_dll
-        if: ${{ matrix.build == 'clblast' }}
+        if: ${{ matrix.build == 'clblast-x64' }}
        run: |
          cp $env:RUNNER_TEMP/clblast/lib/clblast.dll ./build/bin/Release
          cp $env:RUNNER_TEMP/CLBlast.LICENSE.txt ./build/bin/Release/CLBlast-${env:CLBLAST_VERSION}.txt

      - name: Add libopenblas.dll
        id: add_libopenblas_dll
-        if: ${{ matrix.build == 'openblas' }}
+        if: ${{ matrix.build == 'openblas-x64' }}
        run: |
          cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
          cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt

      - name: Check AVX512F support
        id: check_avx512f
-        if: ${{ matrix.build == 'avx512' }}
+        if: ${{ matrix.build == 'avx512-x64' }}
        continue-on-error: true
        run: |
          cd build

@@ -808,14 +840,14 @@ jobs:
      - name: Test
        id: cmake_test
        # not all machines have native AVX-512
-        if: ${{ matrix.build != 'arm64' && matrix.build != 'clblast' && matrix.build != 'kompute' && matrix.build != 'vulkan' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }}
+        if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'clblast-x64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
        run: |
          cd build
          ctest -L main -C Release --verbose --timeout 900

      - name: Test (Intel SDE)
        id: cmake_test_sde
-        if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
+        if: ${{ matrix.build == 'avx512-x64' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
        run: |
          curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
          # for some weird reason windows tar doesn't like sde tar.xz

@@ -843,14 +875,14 @@ jobs:
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        run: |
          Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
-          7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
+          7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*

      - name: Upload artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        uses: actions/upload-artifact@v4
        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip
-          name: llama-bin-win-${{ matrix.build }}-x64.zip
+          path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
+          name: llama-bin-win-${{ matrix.build }}.zip

  windows-latest-cmake-cuda:
    runs-on: windows-latest

@@ -984,6 +1016,37 @@ jobs:
          path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
          name: llama-bin-win-sycl-x64.zip

+  windows-latest-cmake-hip:
+    runs-on: windows-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v3
+
+      - name: Install
+        id: depends
+        run: |
+          $ErrorActionPreference = "Stop"
+          write-host "Downloading AMD HIP SDK Installer"
+          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-23.Q4-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
+          write-host "Installing AMD HIP SDK"
+          Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
+          write-host "Completed AMD HIP SDK installation"
+
+      - name: Verify ROCm
+        id: verify
+        run: |
+          & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
+
+      - name: Build
+        id: cmake_build
+        run: |
+          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
+          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
+          cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DLLAMA_HIPBLAS=ON
+          cmake --build build --config Release
+
  ios-xcode-build:
    runs-on: macos-latest

@@ -555,16 +555,37 @@ if (LLAMA_VULKAN)
 endif()

 if (LLAMA_HIPBLAS)
-    list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
+    if ($ENV{ROCM_PATH})
+        set(ROCM_PATH $ENV{ROCM_PATH})
+    else()
+        set(ROCM_PATH /opt/rocm)
+    endif()
+    list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH})

-    if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
-        message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang")
+    # CMake on Windows doesn't support the HIP language yet
+    if(WIN32)
+        set(CXX_IS_HIPCC TRUE)
+    else()
+        string(REGEX MATCH "hipcc(\.bat)?$" CXX_IS_HIPCC "${CMAKE_CXX_COMPILER}")
     endif()

+    if(CXX_IS_HIPCC)
+        if(LINUX)
             if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
                 message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
             endif()

+            message(WARNING "Setting hipcc as the C++ compiler is legacy behavior."
+                " Prefer setting the HIP compiler directly. See README for details.")
+        endif()
+    else()
+        # Forward AMDGPU_TARGETS to CMAKE_HIP_ARCHITECTURES.
+        if(AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
+            set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_ARGETS})
+        endif()
+        cmake_minimum_required(VERSION 3.21)
+        enable_language(HIP)
+    endif()
     find_package(hip REQUIRED)
     find_package(hipblas REQUIRED)
     find_package(rocblas REQUIRED)

@@ -598,13 +619,18 @@ if (LLAMA_HIPBLAS)
     add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
     add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})

+    if (CXX_IS_HIPCC)
         set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::device)
+    else()
+        set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP)
+    endif()

     if (LLAMA_STATIC)
         message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
     endif()

-    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
+    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} PUBLIC hip::host roc::rocblas roc::hipblas)
 endif()

 if (LLAMA_SYCL)

@@ -1007,6 +1033,11 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR
         if (GGML_COMPILER_SUPPORT_DOTPROD)
             add_compile_definitions(__ARM_FEATURE_DOTPROD)
         endif ()
+
+        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
+        if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
+            add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
+        endif ()

         check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
         if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
             add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
CMakePresets.json (new file): 45 lines

@@ -0,0 +1,45 @@
+{
+    "version": 4,
+    "configurePresets": [
+        {
+            "name": "base",
+            "hidden": true,
+            "generator": "Ninja",
+            "binaryDir": "${sourceDir}/build-${presetName}",
+            "cacheVariables": {
+                "CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
+                "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
+            }
+        },
+
+        { "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
+        { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
+        { "name": "static", "hidden": true, "cacheVariables": { "LLAMA_STATIC": "ON" } },
+
+        {
+            "name": "arm64-windows-msvc", "hidden": true,
+            "architecture": { "value": "arm64", "strategy": "external" },
+            "toolset": { "value": "host=x86_64", "strategy": "external" },
+            "cacheVariables": {
+                "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
+            }
+        },
+
+        {
+            "name": "arm64-windows-llvm", "hidden": true,
+            "architecture": { "value": "arm64", "strategy": "external" },
+            "toolset": { "value": "host=x86_64", "strategy": "external" },
+            "cacheVariables": {
+                "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake"
+            }
+        },
+
+        { "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
+        { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "release" ] },
+        { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "release", "static" ] },
+
+        { "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
+        { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "release" ] },
+        { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] }
+    ]
+}
Makefile: 6 changed lines

@@ -560,10 +560,10 @@ endif # LLAMA_VULKAN
 ifdef LLAMA_HIPBLAS
     ifeq ($(wildcard /opt/rocm),)
         ROCM_PATH ?= /usr
-        GPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
+        AMDGPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
     else
         ROCM_PATH ?= /opt/rocm
-        GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
+        AMDGPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
     endif
     HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc
     LLAMA_CUDA_DMMV_X ?= 32

@@ -575,7 +575,7 @@ ifdef LLAMA_HIP_UMA
 endif # LLAMA_HIP_UMA
     MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
     MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
-    HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS))
+    HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
     HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
     HIPFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
     HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
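
Because the renamed variable is assigned with `?=`, the value autodetected via `amdgpu-arch` is only a default. A brief sketch of how it would be used or overridden on the make command line (the `gfx1030` value is just the example target also used in the README section below):

```bash
# Let the Makefile autodetect the GPU architecture via amdgpu-arch
make -j16 LLAMA_HIPBLAS=1

# Or pin the architecture explicitly, overriding the ?= default
make -j16 LLAMA_HIPBLAS=1 AMDGPU_TARGETS=gfx1030
```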
README.md: 30 changed lines

@@ -528,13 +528,28 @@ Building the program with BLAS support may lead to some performance improvements
   ```
 - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
   ```bash
-  CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ \
-      cmake -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
+  HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
+      cmake -S . -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
       && cmake --build build --config Release -- -j 16
   ```
-  On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON"`.
+  On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON`.
   However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).

+  Note that if you get the following error:
+  ```
+  clang: error: cannot find ROCm device library; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library
+  ```
+  Try searching for a directory under `HIP_PATH` that contains the file
+  `oclc_abi_version_400.bc`. Then, add the following to the start of the
+  command: `HIP_DEVICE_LIB_PATH=<directory-you-just-found>`, so something
+  like:
+  ```bash
+  HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
+  HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
+      cmake -S . -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
+      && cmake --build build -- -j 16
+  ```
+
 - Using `make` (example for target gfx1030, build with 16 CPU threads):
   ```bash
   make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gfx1030

@@ -543,10 +558,8 @@ Building the program with BLAS support may lead to some performance improvements
 - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
   ```bash
   set PATH=%HIP_PATH%\bin;%PATH%
-  mkdir build
-  cd build
-  cmake -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release ..
-  cmake --build .
+  cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
+  cmake --build build
   ```
   Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
   Find your gpu version string by matching the most significant version information from `rocminfo | grep gfx | head -1 | awk '{print $2}'` with the list of processors, e.g. `gfx1035` maps to `gfx1030`.

@@ -712,6 +725,9 @@ Building the program with BLAS support may lead to some performance improvements

 ### Prepare and Quantize

+> [!NOTE]
+> You can use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to quantise your model weights without any setup too. It is synced from `llama.cpp` main every 6 hours.
+
 To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.

 Note: `convert.py` does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.
cmake/arm64-windows-llvm.cmake (new file): 16 lines

@@ -0,0 +1,16 @@
+set( CMAKE_SYSTEM_NAME Windows )
+set( CMAKE_SYSTEM_PROCESSOR arm64 )
+
+set( target arm64-pc-windows-msvc )
+
+set( CMAKE_C_COMPILER clang )
+set( CMAKE_CXX_COMPILER clang++ )
+
+set( CMAKE_C_COMPILER_TARGET ${target} )
+set( CMAKE_CXX_COMPILER_TARGET ${target} )
+
+set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast" )
+set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )
+
+set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
+set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
cmake/arm64-windows-msvc.cmake (new file): 6 lines

@@ -0,0 +1,6 @@
+set( CMAKE_SYSTEM_NAME Windows )
+set( CMAKE_SYSTEM_PROCESSOR arm64 )
+
+set( target arm64-pc-windows-msvc )
+set( CMAKE_C_COMPILER_TARGET ${target} )
+set( CMAKE_CXX_COMPILER_TARGET ${target} )
@@ -2553,7 +2553,7 @@ void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const cha
     size_t pos_start = 0;
     size_t pos_found = 0;

-    if (!data_str.empty() && (std::isspace(data_str[0]) || std::isspace(data_str.back()))) {
+    if (std::isspace(data_str[0]) || std::isspace(data_str.back())) {
         data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
         data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
         data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)");
@@ -26,7 +26,7 @@ namespace grammar_parser {

 static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
     uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
-    auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id));
+    auto result = state.symbol_ids.emplace(std::string(src, len), next_id);
     return result.first->second;
 }

@@ -272,7 +272,7 @@ private:
         if (literal.empty()) {
             return false;
         }
-        ret.push_back(std::make_pair(literal, true));
+        ret.emplace_back(literal, true);
         literal.clear();
         return true;
     };

@@ -298,7 +298,7 @@ private:
     while (i < length) {
         char c = sub_pattern[i];
         if (c == '.') {
-            seq.push_back(std::make_pair(get_dot(), false));
+            seq.emplace_back(get_dot(), false);
             i++;
         } else if (c == '(') {
             i++;

@@ -307,7 +307,7 @@ private:
                     _warnings.push_back("Unsupported pattern syntax");
                 }
             }
-            seq.push_back(std::make_pair("(" + to_rule(transform()) + ")", false));
+            seq.emplace_back("(" + to_rule(transform()) + ")", false);
         } else if (c == ')') {
             i++;
             if (start > 0 && sub_pattern[start - 1] != '(') {

@@ -331,9 +331,9 @@ private:
             }
             square_brackets += ']';
             i++;
-            seq.push_back(std::make_pair(square_brackets, false));
+            seq.emplace_back(square_brackets, false);
         } else if (c == '|') {
-            seq.push_back(std::make_pair("|", false));
+            seq.emplace_back("|", false);
             i++;
         } else if (c == '*' || c == '+' || c == '?') {
             seq.back() = std::make_pair(to_rule(seq.back()) + c, false);

@@ -417,7 +417,7 @@ private:
             }
         }
         if (!literal.empty()) {
-            seq.push_back(std::make_pair(literal, true));
+            seq.emplace_back(literal, true);
         }
     }
 }
common/log.h: 10 changed lines

@@ -211,7 +211,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
 #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
 #else
 #define LOG_FLF_FMT "[%24s:%5ld][%24s] "
-#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
+#define LOG_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
 #endif
 #else
 #define LOG_FLF_FMT "%s"

@@ -224,7 +224,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
 #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
 #else
 #define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] "
-#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
+#define LOG_TEE_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
 #endif
 #else
 #define LOG_TEE_FLF_FMT "%s"

@@ -294,7 +294,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
 // Main LOG macro.
 // behaves like printf, and supports arguments the exact same way.
 //
-#ifndef _MSC_VER
+#if !defined(_MSC_VER) || defined(__clang__)
 #define LOG(...) LOG_IMPL(__VA_ARGS__, "")
 #else
 #define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "")

@@ -308,14 +308,14 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
 // Secondary target can be changed just like LOG_TARGET
 // by defining LOG_TEE_TARGET
 //
-#ifndef _MSC_VER
+#if !defined(_MSC_VER) || defined(__clang__)
 #define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
 #else
 #define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "")
 #endif

 // LOG macro variants with auto endline.
-#ifndef _MSC_VER
+#if !defined(_MSC_VER) || defined(__clang__)
 #define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
 #define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
 #else
@@ -20,11 +20,13 @@
 # - Update llama.cpp with the new pre-tokenizer if necessary
 #
 # TODO: generate tokenizer tests for llama.cpp
-# TODO: automate the update of convert-hf-to-gguf.py
 #

 import logging
 import os
+import pathlib
+import re
+
 import requests
 import sys
 import json

@@ -35,6 +37,7 @@ from transformers import AutoTokenizer

 logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger("convert-hf-to-gguf-update")
+sess = requests.Session()


 class TOKENIZER_TYPE(IntEnum):

@@ -79,63 +82,44 @@ models = [
     {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
 ]

-# make directory "models/tokenizers" if it doesn't exist
-if not os.path.exists("models/tokenizers"):
-    os.makedirs("models/tokenizers")


 def download_file_with_auth(url, token, save_path):
     headers = {"Authorization": f"Bearer {token}"}
-    response = requests.get(url, headers=headers)
-    if response.status_code == 200:
+    response = sess.get(url, headers=headers)
+    response.raise_for_status()
+    os.makedirs(os.path.dirname(save_path), exist_ok=True)
     with open(save_path, 'wb') as f:
         f.write(response.content)
     logger.info(f"File {save_path} downloaded successfully")
-    else:
-        logger.info(f"Failed to download file. Status code: {response.status_code}")


-# download the tokenizer models
-for model in models:
+def download_model(model):
     name = model["name"]
     repo = model["repo"]
     tokt = model["tokt"]

-    if not os.path.exists(f"models/tokenizers/{name}"):
-        os.makedirs(f"models/tokenizers/{name}")
-    else:
-        logger.info(f"Directory models/tokenizers/{name} already exists - skipping")
-        continue
-
-    logger.info(f"Downloading {name} to models/tokenizers/{name}")
-
-    url = f"{repo}/raw/main/config.json"
-    save_path = f"models/tokenizers/{name}/config.json"
-    download_file_with_auth(url, token, save_path)
-
-    url = f"{repo}/raw/main/tokenizer.json"
-    save_path = f"models/tokenizers/{name}/tokenizer.json"
-    download_file_with_auth(url, token, save_path)
-
-    # if downloaded file is less than 1KB, we likely need to download an LFS instead
-    if os.path.getsize(save_path) < 1024:
-        # remove the file
-        os.remove(save_path)
-        url = f"{repo}/resolve/main/tokenizer.json"
-        save_path = f"models/tokenizers/{name}/tokenizer.json"
-        download_file_with_auth(url, token, save_path)
+    os.makedirs(f"models/tokenizers/{name}", exist_ok=True)

+    files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
     if tokt == TOKENIZER_TYPE.SPM:
-        url = f"{repo}/resolve/main/tokenizer.model"
-        save_path = f"models/tokenizers/{name}/tokenizer.model"
-        download_file_with_auth(url, token, save_path)
+        files.append("tokenizer.model")

+    for file in files:
+        save_path = f"models/tokenizers/{name}/{file}"
+        if os.path.isfile(save_path):
+            logger.info(f"{name}: File {save_path} already exists - skipping")
+            continue
+        download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
+
+
+for model in models:
+    try:
+        download_model(model)
+    except Exception as e:
+        logger.error(f"Failed to download model {model['name']}. Error: {e}")

-    url = f"{repo}/raw/main/tokenizer_config.json"
-    save_path = f"models/tokenizers/{name}/tokenizer_config.json"
-    download_file_with_auth(url, token, save_path)

 # generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function:
-# TODO: auto-update convert-hf-to-gguf.py with the generated function

 src_ifs = ""
 for model in models:

@@ -224,11 +208,18 @@ src_func = f"""
     return res
 """

-print(src_func) # noqa: NP100
+convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
+convert_py = convert_py_pth.read_text()
+convert_py = re.sub(
+    r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
+    lambda m: m.group(1) + src_func + m.group(3),
+    convert_py,
+    flags=re.DOTALL | re.MULTILINE,
+)

-logger.info("\n")
-logger.info("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
-logger.info("\n")
+convert_py_pth.write_text(convert_py)
+logger.info("+++ convert-hf-to-gguf.py was updated")

 # generate tests for each tokenizer model
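
The practical effect of the change above is that the update script now patches `convert-hf-to-gguf.py` in place, replacing everything between the `# Marker: Start get_vocab_base_pre` and `# Marker: End get_vocab_base_pre` comments, instead of printing a function to copy-paste. A rough sketch of the resulting workflow, assuming the script is run from the repository root and is passed a Hugging Face token on the command line (the exact argument handling is outside this hunk, so treat it as an assumption):

```bash
# Re-download tokenizers and regenerate get_vocab_base_pre(), rewriting convert-hf-to-gguf.py in place
# (the <hf_token> argument is an assumption; check the script's own argument parsing)
python3 convert-hf-to-gguf-update.py <hf_token>

# Review the in-place edit before committing it
git diff convert-hf-to-gguf.py
```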
@@ -402,6 +402,7 @@ class Model:
     # NOTE: this function is generated by convert-hf-to-gguf-update.py
     # do not modify it manually!
     # ref: https://github.com/ggerganov/llama.cpp/pull/6920
+    # Marker: Start get_vocab_base_pre
     def get_vocab_base_pre(self, tokenizer) -> str:
         # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
         # is specific for the BPE pre-tokenizer used by the model

@@ -489,6 +490,7 @@ class Model:
         logger.debug(f"chkhsh: {chkhsh}")

         return res
+        # Marker: End get_vocab_base_pre

     def _set_vocab_gpt2(self) -> None:
         tokens, toktypes, tokpre = self.get_vocab_base()

@@ -526,7 +528,7 @@ class Model:

         # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
         added_vocab = tokenizer.special_tokens
-        reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items()}
+        reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}

         for i in range(vocab_size):
             if i not in reverse_vocab:
@@ -1109,7 +1109,7 @@ class OutputFile:
         if metadata is not None and metadata.name is not None:
             name = metadata.name
         elif params.path_model is not None:
-            name = str(params.path_model.parent).split("/")[-1]
+            name = params.path_model.name
         elif params.n_ctx == 4096:
             # Heuristic detection of LLaMA v2 model
             name = "LLaMA v2"
@@ -1,6 +1,6 @@
 # Debugging Tests Tips

-## How to run & debug a specific test without anything else to keep the feedback loop short?
+## How to run & execute or debug a specific test without anything else to keep the feedback loop short?

 There is a script called debug-test.sh in the scripts folder whose parameter takes a REGEX and an optional test number.

@@ -10,13 +10,27 @@ For example, running the following command will output an interactive list from

 It will then build & run in the debugger for you.

+To just execute a test and get back a PASS or FAIL message run:
+
 ```bash
 ./scripts/debug-test.sh test-tokenizer
+```
+
+To test in GDB use the `-g` flag to enable gdb test mode.
+
+```bash
+./scripts/debug-test.sh -g test-tokenizer

 # Once in the debugger, i.e. at the chevrons prompt, setting a breakpoint could be as follows:
 >>> b main
 ```

+To speed up the testing loop, if you know your test number you can just run it similar to below:
+
+```bash
+./scripts/debug-test.sh test 23
+```
+
 For further reference use `debug-test.sh -h` to print help.


@@ -41,7 +55,7 @@ cmake -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_FATAL_WARNINGS=ON ..
 make -j
 ```

-#### Step 3.1: Identify Test Command for Debugging
+#### Step 3: Find all tests available that matches REGEX

 The output of this command will give you the command & arguments needed to run GDB.

@@ -69,11 +83,13 @@ Labels: main
 ...
 ```

-So for test #1 we can tell these two pieces of relevant information:
+#### Step 4: Identify Test Command for Debugging
+
+So for test #1 above we can tell these two pieces of relevant information:
 * Test Binary: `~/llama.cpp/build-ci-debug/bin/test-tokenizer-0`
 * Test GGUF Model: `~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf`

-#### Step 3.2: Run GDB on test command
+#### Step 5: Run GDB on test command

 Based on the ctest 'test command' report above we can then run a gdb session via this command below:
@@ -1,6 +1,8 @@
 # quantize

-TODO
+You can also use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to build your own quants without any setup.
+
+Note: It is synced from llama.cpp `main` every 6 hours.

 ## Llama 2 7B
@@ -42,7 +42,7 @@ cmake --build . --config Release
 Then, start the `rpc-server` with the backend:

 ```bash
-$ bin/rpc-server 0.0.0.0 50052
+$ bin/rpc-server -p 50052
 create_backend: using CUDA backend
 ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
 ggml_cuda_init: CUDA_USE_TENSOR_CORES: yes

@@ -53,7 +53,7 @@ Starting RPC server on 0.0.0.0:50052

 When using the CUDA backend, you can specify the device with the `CUDA_VISIBLE_DEVICES` environment variable, e.g.:
 ```bash
-$ CUDA_VISIBLE_DEVICES=0 bin/rpc-server 0.0.0.0 50052
+$ CUDA_VISIBLE_DEVICES=0 bin/rpc-server -p 50052
 ```
 This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device.
@ -7,9 +7,64 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include "ggml-rpc.h"
|
#include "ggml-rpc.h"
|
||||||
|
#ifdef _WIN32
|
||||||
|
# include <windows.h>
|
||||||
|
#else
|
||||||
|
# include <unistd.h>
|
||||||
|
#endif
 #include <string>
 #include <stdio.h>

+struct rpc_server_params {
+    std::string host        = "0.0.0.0";
+    int         port        = 50052;
+    size_t      backend_mem = 0;
+};
+
+static void print_usage(int /*argc*/, char ** argv, rpc_server_params params) {
+    fprintf(stderr, "Usage: %s [options]\n\n", argv[0]);
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, "  -h, --help                show this help message and exit\n");
+    fprintf(stderr, "  -H HOST, --host HOST      host to bind to (default: %s)\n", params.host.c_str());
+    fprintf(stderr, "  -p PORT, --port PORT      port to bind to (default: %d)\n", params.port);
+    fprintf(stderr, "  -m MEM,  --mem MEM        backend memory size (in MB)\n");
+    fprintf(stderr, "\n");
+}
+
+static bool rpc_server_params_parse(int argc, char ** argv, rpc_server_params & params) {
+    std::string arg;
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+        if (arg == "-H" || arg == "--host") {
+            if (++i >= argc) {
+                return false;
+            }
+            params.host = argv[i];
+        } else if (arg == "-p" || arg == "--port") {
+            if (++i >= argc) {
+                return false;
+            }
+            params.port = std::stoi(argv[i]);
+            if (params.port <= 0 || params.port > 65535) {
+                return false;
+            }
+        } else if (arg == "-m" || arg == "--mem") {
+            if (++i >= argc) {
+                return false;
+            }
+            params.backend_mem = std::stoul(argv[i]) * 1024 * 1024;
+        } else if (arg == "-h" || arg == "--help") {
+            print_usage(argc, argv, params);
+            exit(0);
+        } else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            print_usage(argc, argv, params);
+            exit(0);
+        }
+    }
+    return true;
+}

 static ggml_backend_t create_backend() {
     ggml_backend_t backend = NULL;
 #ifdef GGML_USE_CUDA

@@ -38,21 +93,25 @@ static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
 #ifdef GGML_USE_CUDA
     ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
 #else
-    // TODO: implement for other backends
-    *free_mem = 1;
-    *total_mem = 1;
+#ifdef _WIN32
+    MEMORYSTATUSEX status;
+    status.dwLength = sizeof(status);
+    GlobalMemoryStatusEx(&status);
+    *total_mem = status.ullTotalPhys;
+    *free_mem = status.ullAvailPhys;
+#else
+    long pages = sysconf(_SC_PHYS_PAGES);
+    long page_size = sysconf(_SC_PAGE_SIZE);
+    *total_mem = pages * page_size;
+    *free_mem = *total_mem;
+#endif
 #endif
 }

 int main(int argc, char * argv[]) {
-    if (argc < 3) {
-        fprintf(stderr, "Usage: %s <host> <port>\n", argv[0]);
-        return 1;
-    }
-    const char * host = argv[1];
-    int port = std::stoi(argv[2]);
-    if (port <= 0 || port > 65535) {
-        fprintf(stderr, "Invalid port number: %d\n", port);
-        return 1;
-    }
+    rpc_server_params params;
+    if (!rpc_server_params_parse(argc, argv, params)) {
+        fprintf(stderr, "Invalid parameters\n");
+        return 1;
+    }
     ggml_backend_t backend = create_backend();

@@ -60,10 +119,15 @@ int main(int argc, char * argv[]) {
         fprintf(stderr, "Failed to create backend\n");
         return 1;
     }
-    printf("Starting RPC server on %s:%d\n", host, port);
+    std::string endpoint = params.host + ":" + std::to_string(params.port);
     size_t free_mem, total_mem;
-    get_backend_memory(&free_mem, &total_mem);
-    std::string endpoint = std::string(host) + ":" + std::to_string(port);
+    if (params.backend_mem > 0) {
+        free_mem  = params.backend_mem;
+        total_mem = params.backend_mem;
+    } else {
+        get_backend_memory(&free_mem, &total_mem);
+    }
+    printf("Starting RPC server on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024));
     start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem);
     ggml_backend_free(backend);
     return 0;
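For context, a minimal standalone sketch (not part of the patch; values hypothetical) of what the new flags resolve to: the `-m/--mem` value is given in MB and converted to bytes, and the host/port pair becomes the endpoint string printed at startup.

// Sketch only: mirrors the flag handling above without the RPC machinery.
#include <cstdio>
#include <string>

int main() {
    std::string host        = "0.0.0.0";                        // -H / --host (default)
    int         port        = 50052;                            // -p / --port (default)
    size_t      backend_mem = std::stoul("2048") * 1024 * 1024; // -m 2048  (MB -> bytes)

    std::string endpoint = host + ":" + std::to_string(port);
    printf("Starting RPC server on %s, backend memory: %zu MB\n",
           endpoint.c_str(), backend_mem / (1024 * 1024));
    return 0;
}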
@@ -17,7 +17,8 @@ The project is under active development, and we are [looking for feedback and co

 **Command line options:**

-- `--threads N`, `-t N`: Set the number of threads to use during generation. Not used if model layers are offloaded to GPU. The server is using batching. This parameter is used only if one token is to be processed on CPU backend.
+- `-v`, `--verbose`: Enable verbose server output. When using the `/completion` endpoint, this includes the tokenized prompt, the full request and the full response.
+- `-t N`, `--threads N`: Set the number of threads to use during generation. Not used if model layers are offloaded to GPU. The server is using batching. This parameter is used only if one token is to be processed on CPU backend.
 - `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation. Not used if model layers are offloaded to GPU.
 - `--threads-http N`: Number of threads in the http server pool to process requests. Default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`
 - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).

@@ -36,9 +37,7 @@ The project is under active development, and we are [looking for feedback and co
 - `--numa STRATEGY`: Attempt one of the below optimization strategies that may help on some NUMA systems
   - `--numa distribute`: Spread execution evenly over all nodes
   - `--numa isolate`: Only spawn threads on CPUs on the node that execution started on
-  - `--numa numactl`: Use the CPU map provided by numactl. If run without this previously, it is recommended to drop the system
-page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/1437
-
-- `--numa`: Attempt optimizations that may help on some NUMA systems.
+  - `--numa numactl`: Use the CPU map provided by numactl. If run without this previously, it is recommended to drop the system page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/1437
 - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
 - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
@@ -293,14 +293,13 @@ def start_server_background(args):


 def is_server_listening(server_fqdn, server_port):
-    try:
-        url = f"{server_fqdn}:{server_port}/health"
-        if not url.startswith("http://"):
-            url = f"http://{url}"
-        result = requests.get(url)
-        return result.status_code == 200
-    except Exception:
-        return False
+    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
+        result = sock.connect_ex((server_fqdn, server_port))
+        _is_server_listening = result == 0
+        if _is_server_listening:
+            print(f"server is listening on {server_fqdn}:{server_port}...")
+        return _is_server_listening


 def escape_metric_name(metric_name):
     return re.sub('[^A-Z0-9]', '_', metric_name.upper())
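The test helper now probes the TCP socket directly instead of issuing an HTTP GET. The same style of check in C++ might look like the following sketch (POSIX sockets; illustrative only, not part of the patch):

// Sketch: returns true if something accepts TCP connections on host:port.
#include <arpa/inet.h>
#include <sys/socket.h>
#include <unistd.h>

static bool is_server_listening(const char * host, int port) {
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    if (fd < 0) return false;
    sockaddr_in addr{};
    addr.sin_family = AF_INET;
    addr.sin_port   = htons((uint16_t) port);
    inet_pton(AF_INET, host, &addr.sin_addr);
    bool ok = connect(fd, (sockaddr *) &addr, sizeof(addr)) == 0;
    close(fd);
    return ok;
}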
@@ -2387,6 +2387,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
     printf("  --lora-base FNAME         optional model to use as a base for the layers modified by the LoRA adapter\n");
     printf("  --host                    ip address to listen (default  (default: %s)\n", sparams.hostname.c_str());
     printf("  --port PORT               port to listen (default  (default: %d)\n", sparams.port);
+    printf("  --rpc SERVERS             comma separated list of RPC servers\n");
     printf("  --path PUBLIC_PATH        path from which to serve static files (default: disabled)\n");
     printf("  --api-key API_KEY         optional api key to enhance server security. If set, requests must include this key for access.\n");
     printf("  --api-key-file FNAME      path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access.\n");

@@ -2439,6 +2440,12 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
             break;
         }
         sparams.port = std::stoi(argv[i]);
+    } else if (arg == "--rpc") {
+        if (++i >= argc) {
+            invalid_param = true;
+            break;
+        }
+        params.rpc_servers = argv[i];
     } else if (arg == "--host") {
         if (++i >= argc) {
             invalid_param = true;
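The `--rpc` value is stored as a single comma separated string. A small sketch (assumed helper name, not from the patch) of how such a string can be split into individual endpoints before each one is handed to the RPC backend:

// Sketch: split "host1:50052,host2:50052" into individual endpoint strings.
#include <sstream>
#include <string>
#include <vector>

static std::vector<std::string> split_servers(const std::string & rpc_servers) {
    std::vector<std::string> endpoints;
    std::stringstream ss(rpc_servers);
    std::string item;
    while (std::getline(ss, item, ',')) {
        if (!item.empty()) {
            endpoints.push_back(item);
        }
    }
    return endpoints;
}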
@@ -1986,7 +1986,7 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri

     for (int j = 0; j < QK_K/16; ++j) {
         if (quant_weights) {
-            const float * qw = quant_weights ? quant_weights + QK_K * i + 16*j : NULL;
+            const float * qw = quant_weights + QK_K * i + 16*j;
             for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j+l]*x[16*j+l]);
         } else {
             for (int l = 0; l < 16; ++l) weight[l] = x[16*j+l]*x[16*j+l];
@@ -3487,10 +3487,9 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
 #if defined(__ARM_FEATURE_MATMUL_INT8)
     if (nrc == 2) {
         const block_q4_0 * restrict vx0 = vx;
-        const block_q4_0 * restrict vx1 = vx + bx;
+        const block_q4_0 * restrict vx1 = (const block_q4_0 *) ((const uint8_t*)vx + bx);
         const block_q8_0 * restrict vy0 = vy;
-        const block_q8_0 * restrict vy1 = vy + by;
+        const block_q8_0 * restrict vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);

         float32x4_t sumv0 = vdupq_n_f32(0.0f);

@@ -3524,11 +3523,13 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
             const int8x16_t y1_l = vld1q_s8(b_y1->qs);
             const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);

-            float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
+            float32_t _scale[4] = { GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
                                     GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
                                     GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
                                     GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
+            float32x4_t scale = vld1q_f32(_scale);

             int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
             int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));

@@ -3894,9 +3895,9 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
 #if defined(__ARM_FEATURE_MATMUL_INT8)
     if (nrc == 2) {
         const block_q4_1 * restrict vx0 = vx;
-        const block_q4_1 * restrict vx1 = vx + bx;
+        const block_q4_1 * restrict vx1 = (const block_q4_1 *) ((const uint8_t*)vx + bx);
         const block_q8_1 * restrict vy0 = vy;
-        const block_q8_1 * restrict vy1 = vy + by;
+        const block_q8_1 * restrict vy1 = (const block_q8_1 *) ((const uint8_t*)vy + by);

         float32x4_t sumv0 = vdupq_n_f32(0.0f);
         float32x4_t summs0 = vdupq_n_f32(0.0f);

@@ -3907,11 +3908,11 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
             const block_q8_1 * restrict b_y0 = &vy0[i];
             const block_q8_1 * restrict b_y1 = &vy1[i];

-            float32x4_t summs_t = {GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s),
+            float32_t summs_t[4] = {GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s),
                                     GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->s),
                                     GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->s),
                                     GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s)};
-            summs0 += summs_t;
+            summs0 = vaddq_f32(summs0, vld1q_f32(summs_t));

             const uint8x16_t m4b = vdupq_n_u8(0x0F);

@@ -3931,10 +3932,11 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
             const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);

             // mmla into int32x4_t
-            float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*b_y0->d,
+            float32_t _scale[4] = {GGML_FP16_TO_FP32(b_x0->d)*b_y0->d,
                                   GGML_FP16_TO_FP32(b_x0->d)*b_y1->d,
                                   GGML_FP16_TO_FP32(b_x1->d)*b_y0->d,
                                   GGML_FP16_TO_FP32(b_x1->d)*b_y1->d};
+            float32x4_t scale = vld1q_f32(_scale);

             int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
             int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));

@@ -3953,7 +3955,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r

         float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
         float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
-        sumv2 = sumv2 + summs0;
+        sumv2 = vaddq_f32(sumv2, summs0);

         vst1_f32(s,      vget_low_f32 (sumv2));
         vst1_f32(s + bs, vget_high_f32(sumv2));

@@ -4837,9 +4839,9 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
 #if defined(__ARM_FEATURE_MATMUL_INT8)
     if (nrc == 2) {
         const block_q8_0 * restrict vx0 = vx;
-        const block_q8_0 * restrict vx1 = vx + bx;
+        const block_q8_0 * restrict vx1 = (const block_q8_0 *) ((const uint8_t*)vx + bx);
         const block_q8_0 * restrict vy0 = vy;
-        const block_q8_0 * restrict vy1 = vy + by;
+        const block_q8_0 * restrict vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);

         float32x4_t sumv0 = vdupq_n_f32(0.0f);

@@ -4861,10 +4863,11 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
             const int8x16_t y1_l = vld1q_s8(b_y1->qs);
             const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);

-            float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
+            float32_t _scale[4] = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
                                    GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
                                    GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
                                    GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
+            float32x4_t scale = vld1q_f32(_scale);

             int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
             int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
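These edits address portability rather than behavior: arithmetic on a `void *` and brace-initialising a `float32x4_t` are compiler extensions, so offsets are now taken through `uint8_t` and the vector is loaded from a named array. A compiler-neutral sketch of the same two idioms (illustrative only, NEON target assumed):

// Sketch: byte-offset an opaque pointer, and build a NEON vector via vld1q_f32
// instead of relying on brace-initialisation of the vector type.
#include <arm_neon.h>
#include <stdint.h>

static inline const void * byte_offset(const void * base, size_t nbytes) {
    return (const void *)((const uint8_t *) base + nbytes); // well-defined in C and C++
}

static inline float32x4_t load4(float a, float b, float c, float d) {
    float tmp[4] = { a, b, c, d };
    return vld1q_f32(tmp); // portable across GCC, Clang and MSVC for ARM
}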
15 ggml-rpc.cpp

@@ -28,7 +28,7 @@

 #define UNUSED GGML_UNUSED

-#define GGML_DEBUG 1
+#define GGML_DEBUG 0
 #if (GGML_DEBUG >= 1)
 #define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
 #else

@@ -134,7 +134,13 @@ static bool set_no_delay(sockfd_t sockfd) {
     int flag = 1;
     // set TCP_NODELAY to disable Nagle's algorithm
     int ret = setsockopt(sockfd, IPPROTO_TCP, TCP_NODELAY, (char *)&flag, sizeof(int));
-    return ret >= 0;
+    return ret == 0;
+}
+
+static bool set_reuse_addr(sockfd_t sockfd) {
+    int flag = 1;
+    int ret = setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, (char *)&flag, sizeof(int));
+    return ret == 0;
 }

 static std::shared_ptr<socket_t> socket_connect(const char * host, int port) {

@@ -181,7 +187,10 @@ static std::shared_ptr<socket_t> create_server_socket(const char * host, int por
     if (sock == nullptr) {
         return nullptr;
     }
+    if (!set_reuse_addr(sockfd)) {
+        fprintf(stderr, "Failed to set SO_REUSEADDR\n");
+        return nullptr;
+    }
     struct sockaddr_in serv_addr;
     serv_addr.sin_family = AF_INET;
     serv_addr.sin_addr.s_addr = inet_addr(host);
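SO_REUSEADDR matters here because a restarted RPC server would otherwise fail to bind while the previous socket lingers in TIME_WAIT. A sketch of the option being applied before bind (POSIX, illustrative only; names are hypothetical):

// Sketch: typical order of operations for a listening socket with SO_REUSEADDR.
#include <cstdint>
#include <netinet/in.h>
#include <sys/socket.h>

static bool make_listener(int sockfd, uint16_t port) {
    int flag = 1;
    if (setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, (char *)&flag, sizeof(int)) != 0) {
        return false; // mirror the ret == 0 convention used above
    }
    sockaddr_in addr{};
    addr.sin_family      = AF_INET;
    addr.sin_addr.s_addr = INADDR_ANY;
    addr.sin_port        = htons(port);
    return bind(sockfd, (sockaddr *)&addr, sizeof(addr)) == 0 && listen(sockfd, 1) == 0;
}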
460 ggml.c

@@ -165,9 +165,6 @@ void ggml_print_backtrace(void) {
 #define GGML_DEBUG 0
 #define GGML_GELU_FP16
 #define GGML_GELU_QUICK_FP16
-#define GGML_SILU_FP16
-// #define GGML_CROSS_ENTROPY_EXP_FP16
-// #define GGML_FLASH_ATTN_EXP_FP16

 #define GGML_SOFT_MAX_UNROLL 4
 #define GGML_VEC_DOT_UNROLL  2
@@ -318,12 +315,6 @@ static ggml_fp16_t ggml_table_gelu_f16[1 << 16];
 // precomputed quick gelu table for f16 (128 KB)
 static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];

-// precomputed silu table for f16 (128 KB)
-static ggml_fp16_t ggml_table_silu_f16[1 << 16];
-
-// precomputed exp table for f16 (128 KB)
-static ggml_fp16_t ggml_table_exp_f16[1 << 16];
-
 // precomputed f32 table for f16 (256 KB) (ggml-impl.h)
 float ggml_table_f32_f16[1 << 16];
@@ -2085,52 +2076,291 @@ inline static float ggml_silu_f32(float x) {
     return x/(1.0f + expf(-x));
 }

-//inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
-//    const uint16_t * i16 = (const uint16_t *) x;
-//    for (int i = 0; i < n; ++i) {
-//        y[i] = ggml_table_silu_f16[i16[i]];
-//    }
-//}
-
-#ifdef GGML_SILU_FP16
-inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) {
-    uint16_t t;
-    for (int i = 0; i < n; ++i) {
-        ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
-        memcpy(&t, &fp16, sizeof(uint16_t));
-        y[i] = GGML_FP16_TO_FP32(ggml_table_silu_f16[t]);
-    }
-}
-#else
-inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) {
-    for (int i = 0; i < n; ++i) {
-        y[i] = ggml_silu_f32(x[i]);
-    }
-}
-#endif
+#if defined(__ARM_NEON)
+
+// adapted from arm limited optimized routine
+// the maximum error is 1.45358 plus 0.5 ulps
+// numbers above 88.38 will flush to infinity
+// numbers beneath -103.97 will flush to zero
+inline static float32x4_t ggml_v_expf(float32x4_t x) {
+    const float32x4_t r = vdupq_n_f32(0x1.8p23f);
+    const float32x4_t z = vfmaq_f32(r, x, vdupq_n_f32(0x1.715476p+0f));
+    const float32x4_t n = vsubq_f32(z, r);
+    const float32x4_t b = vfmsq_f32(vfmsq_f32(x, n, vdupq_n_f32(0x1.62e4p-1f)), n,
+                                    vdupq_n_f32(0x1.7f7d1cp-20f));
+    const uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_f32(z), 23);
+    const float32x4_t k = vreinterpretq_f32_u32(vaddq_u32(e, vreinterpretq_u32_f32(vdupq_n_f32(1))));
+    const uint32x4_t c = vcagtq_f32(n, vdupq_n_f32(126));
+    const float32x4_t u = vmulq_f32(b, b);
+    const float32x4_t j = vfmaq_f32(
+        vmulq_f32(vdupq_n_f32(0x1.ffffecp-1f), b),
+        vfmaq_f32(vfmaq_f32(vdupq_n_f32(0x1.fffdb6p-2f), vdupq_n_f32(0x1.555e66p-3f), b),
+                  vfmaq_f32(vdupq_n_f32(0x1.573e2ep-5f), vdupq_n_f32(0x1.0e4020p-7f), b), u), u);
+    if (!vpaddd_u64(vreinterpretq_u64_u32(c)))
+        return vfmaq_f32(k, j, k);
+    const uint32x4_t d = vandq_u32(vclezq_f32(n), vdupq_n_u32(0x82000000));
+    const float32x4_t s1 = vreinterpretq_f32_u32(vaddq_u32(d, vdupq_n_u32(0x7f000000)));
+    const float32x4_t s2 = vreinterpretq_f32_u32(vsubq_u32(e, d));
+    return vbslq_f32(vcagtq_f32(n, vdupq_n_f32(192)), vmulq_f32(s1, s1),
+                     vbslq_f32(c, vmulq_f32(vfmaq_f32(s2, s2, j), s1), vfmaq_f32(k, k, j)));
+}
+
+// computes silu x/(1+exp(-x)) in single precision vector
+inline static float32x4_t ggml_v_silu(float32x4_t x) {
+    const float32x4_t one = vdupq_n_f32(1.0f);
+    const float32x4_t zero = vdupq_n_f32(0.0f);
+    const float32x4_t neg_x = vsubq_f32(zero, x);
+    const float32x4_t exp_neg_x = ggml_v_expf(neg_x);
+    const float32x4_t one_plus_exp_neg_x = vaddq_f32(one, exp_neg_x);
+    return vdivq_f32(x, one_plus_exp_neg_x);
+}
+
+#elif defined(__AVX512F__) && defined(__AVX512DQ__)
+
+// adapted from arm limited optimized routine
+// the maximum error is 1.45358 plus 0.5 ulps
+// numbers above 88.38 will flush to infinity
+// numbers beneath -103.97 will flush to zero
+inline static __m512 ggml_v_expf(__m512 x) {
+    const __m512 r = _mm512_set1_ps(0x1.8p23f);
+    const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r);
+    const __m512 n = _mm512_sub_ps(z, r);
+    const __m512 b = _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
+                                      _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
+    const __m512i e = _mm512_slli_epi32(_mm512_castps_si512(z), 23);
+    const __m512 k = _mm512_castsi512_ps(_mm512_add_epi32(e, _mm512_castps_si512(_mm512_set1_ps(1))));
+    const __mmask16 c = _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(126), _CMP_GT_OQ);
+    const __m512 u = _mm512_mul_ps(b, b);
+    const __m512 j = _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
+                                                                     _mm512_set1_ps(0x1.573e2ep-5f)), u,
+                                                     _mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
+                                                                     _mm512_set1_ps(0x1.fffdb6p-2f))),
+                                     u, _mm512_mul_ps(_mm512_set1_ps(0x1.ffffecp-1f), b));
+    if (_mm512_kortestz(c, c))
+        return _mm512_fmadd_ps(j, k, k);
+    const __m512i g = _mm512_and_si512(
+        _mm512_movm_epi32(_mm512_cmp_ps_mask(n, _mm512_setzero_ps(), _CMP_LE_OQ)),
+        _mm512_set1_epi32(0x82000000u));
+    const __m512 s1 =
+        _mm512_castsi512_ps(_mm512_add_epi32(g, _mm512_set1_epi32(0x7f000000u)));
+    const __m512 s2 = _mm512_castsi512_ps(_mm512_sub_epi32(e, g));
+    const __mmask16 d =
+        _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ);
+    return _mm512_mask_blend_ps(
+        d, _mm512_mask_blend_ps(
+               c, _mm512_fmadd_ps(k, j, k),
+               _mm512_mul_ps(_mm512_fmadd_ps(s2, j, s2), s1)),
+        _mm512_mul_ps(s1, s1));
+}
+
+// computes silu x/(1+exp(-x)) in single precision vector
+inline static __m512 ggml_v_silu(__m512 x) {
+    const __m512 one = _mm512_set1_ps(1);
+    const __m512 zero = _mm512_setzero_ps();
+    const __m512 neg_x = _mm512_sub_ps(zero, x);
+    const __m512 exp_neg_x = ggml_v_expf(neg_x);
+    const __m512 one_plus_exp_neg_x = _mm512_add_ps(one, exp_neg_x);
+    return _mm512_div_ps(x, one_plus_exp_neg_x);
+}
+
+#elif defined(__AVX2__) && defined(__FMA__)
+
+// adapted from arm limited optimized routine
+// the maximum error is 1.45358 plus 0.5 ulps
+// numbers above 88.38 will flush to infinity
+// numbers beneath -103.97 will flush to zero
+inline static __m256 ggml_v_expf(__m256 x) {
+    const __m256 r = _mm256_set1_ps(0x1.8p23f);
+    const __m256 z = _mm256_fmadd_ps(x, _mm256_set1_ps(0x1.715476p+0f), r);
+    const __m256 n = _mm256_sub_ps(z, r);
+    const __m256 b = _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.7f7d1cp-20f),
+                                      _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.62e4p-1f), x));
+    const __m256i e = _mm256_slli_epi32(_mm256_castps_si256(z), 23);
+    const __m256 k = _mm256_castsi256_ps(
+        _mm256_add_epi32(e, _mm256_castps_si256(_mm256_set1_ps(1))));
+    const __m256i c = _mm256_castps_si256(
+        _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
+                      _mm256_set1_ps(126), _CMP_GT_OQ));
+    const __m256 u = _mm256_mul_ps(b, b);
+    const __m256 j = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_set1_ps(0x1.0e4020p-7f), b,
+                                                                     _mm256_set1_ps(0x1.573e2ep-5f)), u,
+                                                     _mm256_fmadd_ps(_mm256_set1_ps(0x1.555e66p-3f), b,
+                                                                     _mm256_set1_ps(0x1.fffdb6p-2f))),
+                                     u, _mm256_mul_ps(_mm256_set1_ps(0x1.ffffecp-1f), b));
+    if (!_mm256_movemask_ps(_mm256_castsi256_ps(c)))
+        return _mm256_fmadd_ps(j, k, k);
+    const __m256i g = _mm256_and_si256(
+        _mm256_castps_si256(_mm256_cmp_ps(n, _mm256_setzero_ps(), _CMP_LE_OQ)),
+        _mm256_set1_epi32(0x82000000u));
+    const __m256 s1 =
+        _mm256_castsi256_ps(_mm256_add_epi32(g, _mm256_set1_epi32(0x7f000000u)));
+    const __m256 s2 = _mm256_castsi256_ps(_mm256_sub_epi32(e, g));
+    const __m256i d = _mm256_castps_si256(
+        _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
+                      _mm256_set1_ps(192), _CMP_GT_OQ));
+    return _mm256_or_ps(
+        _mm256_and_ps(_mm256_castsi256_ps(d), _mm256_mul_ps(s1, s1)),
+        _mm256_andnot_ps(
+            _mm256_castsi256_ps(d),
+            _mm256_or_ps(
+                _mm256_and_ps(_mm256_castsi256_ps(c),
+                              _mm256_mul_ps(_mm256_fmadd_ps(s2, j, s2), s1)),
+                _mm256_andnot_ps(_mm256_castsi256_ps(c), _mm256_fmadd_ps(k, j, k)))));
+}
+
+// computes silu x/(1+exp(-x)) in single precision vector
+inline static __m256 ggml_v_silu(__m256 x) {
+    const __m256 one = _mm256_set1_ps(1);
+    const __m256 zero = _mm256_setzero_ps();
+    const __m256 neg_x = _mm256_sub_ps(zero, x);
+    const __m256 exp_neg_x = ggml_v_expf(neg_x);
+    const __m256 one_plus_exp_neg_x = _mm256_add_ps(one, exp_neg_x);
+    return _mm256_div_ps(x, one_plus_exp_neg_x);
+}
+
+#elif defined(__SSE2__) // __AVX2__ / __ARM_NEON
+
+#if defined(__FMA__)
+#define MADD128(x, y, z) _mm_fmadd_ps(x, y, z)
+#define NMADD128(x, y, z) _mm_fnmadd_ps(x, y, z)
+#else
+#define MADD128(x, y, z) _mm_add_ps(_mm_mul_ps(x, y), z)
+#define NMADD128(x, y, z) _mm_sub_ps(z, _mm_mul_ps(x, y))
+#endif
+
+// adapted from arm limited optimized routine
+// the maximum error is 1.45358 plus 0.5 ulps
+// numbers above 88.38 will flush to infinity
+// numbers beneath -103.97 will flush to zero
+inline static __m128 ggml_v_expf(__m128 x) {
+    const __m128 r = _mm_set1_ps(0x1.8p23f);
+    const __m128 z = MADD128(x, _mm_set1_ps(0x1.715476p+0f), r);
+    const __m128 n = _mm_sub_ps(z, r);
+    const __m128 b =
+        NMADD128(n, _mm_set1_ps(0x1.7f7d1cp-20f), NMADD128(n, _mm_set1_ps(0x1.62e4p-1f), x));
+    const __m128i e = _mm_slli_epi32(_mm_castps_si128(z), 23);
+    const __m128 k = _mm_castsi128_ps(_mm_add_epi32(e, _mm_castps_si128(_mm_set1_ps(1))));
+    const __m128i c =
+        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(126)));
+    const __m128 u = _mm_mul_ps(b, b);
+    const __m128 j =
+        MADD128(MADD128(MADD128(_mm_set1_ps(0x1.0e4020p-7f), b, _mm_set1_ps(0x1.573e2ep-5f)), u,
+                        MADD128(_mm_set1_ps(0x1.555e66p-3f), b, _mm_set1_ps(0x1.fffdb6p-2f))),
+                u, _mm_mul_ps(_mm_set1_ps(0x1.ffffecp-1f), b));
+    if (!_mm_movemask_epi8(c))
+        return MADD128(j, k, k);
+    const __m128i g = _mm_and_si128(_mm_castps_si128(_mm_cmple_ps(n, _mm_setzero_ps())),
+                                    _mm_set1_epi32(0x82000000u));
+    const __m128 s1 = _mm_castsi128_ps(_mm_add_epi32(g, _mm_set1_epi32(0x7f000000u)));
+    const __m128 s2 = _mm_castsi128_ps(_mm_sub_epi32(e, g));
+    const __m128i d =
+        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(192)));
+    return _mm_or_ps(
+        _mm_and_ps(_mm_castsi128_ps(d), _mm_mul_ps(s1, s1)),
+        _mm_andnot_ps(_mm_castsi128_ps(d),
+                      _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(c), _mm_mul_ps(MADD128(s2, j, s2), s1)),
+                                _mm_andnot_ps(_mm_castsi128_ps(c), MADD128(k, j, k)))));
+}
+
+// computes silu x/(1+exp(-x)) in single precision vector
+inline static __m128 ggml_v_silu(__m128 x) {
+    const __m128 one = _mm_set1_ps(1);
+    const __m128 zero = _mm_setzero_ps();
+    const __m128 neg_x = _mm_sub_ps(zero, x);
+    const __m128 exp_neg_x = ggml_v_expf(neg_x);
+    const __m128 one_plus_exp_neg_x = _mm_add_ps(one, exp_neg_x);
+    return _mm_div_ps(x, one_plus_exp_neg_x);
+}
+
+#endif // __ARM_NEON / __AVX2__ / __SSE2__
+
+static void ggml_vec_silu_f32(const int n, float * y, const float * x) {
+    int i = 0;
+#if defined(__AVX512F__) && defined(__AVX512DQ__)
+    for (; i + 15 < n; i += 16) {
+        _mm512_storeu_ps(y + i, ggml_v_silu(_mm512_loadu_ps(x + i)));
+    }
+#elif defined(__AVX2__) && defined(__FMA__)
+    for (; i + 7 < n; i += 8) {
+        _mm256_storeu_ps(y + i, ggml_v_silu(_mm256_loadu_ps(x + i)));
+    }
+#elif defined(__SSE2__)
+    for (; i + 3 < n; i += 4) {
+        _mm_storeu_ps(y + i, ggml_v_silu(_mm_loadu_ps(x + i)));
+    }
+#elif defined(__ARM_NEON)
+    for (; i + 3 < n; i += 4) {
+        vst1q_f32(y + i, ggml_v_silu(vld1q_f32(x + i)));
+    }
+#endif
+    for (; i < n; ++i) {
+        y[i] = ggml_silu_f32(x[i]);
+    }
+}
+
+static ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) {
+    int i = 0;
+    ggml_float sum = 0;
+#if defined(__AVX512F__) && defined(__AVX512DQ__)
+    for (; i + 15 < n; i += 16) {
+        __m512 val = ggml_v_expf(_mm512_sub_ps(_mm512_loadu_ps(x + i),
+                                               _mm512_set1_ps(max)));
+        _mm512_storeu_ps(y + i, val);
+        sum += (ggml_float)_mm512_reduce_add_ps(val);
+    }
+#elif defined(__AVX2__) && defined(__FMA__)
+    for (; i + 7 < n; i += 8) {
+        __m256 val = ggml_v_expf(_mm256_sub_ps(_mm256_loadu_ps(x + i),
+                                               _mm256_set1_ps(max)));
+        _mm256_storeu_ps(y + i, val);
+        __m128 val2 = _mm_add_ps(_mm256_extractf128_ps(val, 1),
+                                 _mm256_castps256_ps128(val));
+        val2 = _mm_add_ps(val2, _mm_movehl_ps(val2, val2));
+        val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2));
+        sum += (ggml_float)_mm_cvtss_f32(val2);
+    }
+#elif defined(__SSE2__)
+    for (; i + 3 < n; i += 4) {
+        __m128 val = ggml_v_expf(_mm_sub_ps(_mm_loadu_ps(x + i),
+                                            _mm_set1_ps(max)));
+        _mm_storeu_ps(y + i, val);
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
+        val = _mm_add_ps(val, _mm_movehl_ps(val, val));
+        val = _mm_add_ss(val, _mm_movehdup_ps(val));
+#else
+        __m128 tmp = _mm_shuffle_ps(val, val, _MM_SHUFFLE(2, 3, 0, 1));
+        val = _mm_add_ps(val, tmp);
+        tmp = _mm_movehl_ps(tmp, val);
+        val = _mm_add_ss(val, tmp);
+#endif
+        sum += (ggml_float)_mm_cvtss_f32(val);
+    }
+#elif defined(__ARM_NEON)
+    for (; i + 3 < n; i += 4) {
+        float32x4_t val = ggml_v_expf(vsubq_f32(vld1q_f32(x + i),
+                                                vdupq_n_f32(max)));
+        vst1q_f32(y + i, val);
+        sum += (ggml_float)vaddvq_f32(val);
+    }
+#endif
+    for (; i < n; ++i) {
+        float val = expf(x[i] - max);
+        sum += (ggml_float)val;
+        y[i] = val;
+    }
+    return sum;
+}
+
 inline static float ggml_silu_backward_f32(float x, float dy) {
     const float s = 1.0f/(1.0f + expf(-x));
     return dy*s*(1.0f + x*(1.0f - s));
 }

-#ifdef GGML_SILU_FP16
-inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
-    for (int i = 0; i < n; ++i) {
-        // we did not use x[i] to compute forward silu but its f16 equivalent
-        // take derivative at f16 of x[i]:
-        ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
-        float usedx = GGML_FP16_TO_FP32(fp16);
-        dx[i] = ggml_silu_backward_f32(usedx, dy[i]);
-    }
-}
-#else
 inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
     for (int i = 0; i < n; ++i) {
         dx[i] = ggml_silu_backward_f32(x[i], dy[i]);
     }
 }
-#endif

 inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
 #ifndef GGML_USE_ACCELERATE
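All of the vector paths above share one shape: compute exp(-x) with the polynomial ggml_v_expf, form x / (1 + exp(-x)), and let a scalar tail handle the last n mod width elements. A width-agnostic sketch of that structure (plain C++, no intrinsics, for illustration only):

// Sketch: the block-plus-tail structure used by ggml_vec_silu_f32, with the
// SIMD body replaced by a scalar "lane" loop so it runs anywhere.
#include <cmath>
#include <cstdio>

static void vec_silu_blocked(int n, float * y, const float * x) {
    const int W = 8;                  // stands in for the SIMD width (4/8/16)
    int i = 0;
    for (; i + W - 1 < n; i += W) {   // full blocks: one vector op each in the real code
        for (int l = 0; l < W; ++l) {
            y[i + l] = x[i + l] / (1.0f + std::exp(-x[i + l]));
        }
    }
    for (; i < n; ++i) {              // scalar tail, identical to the fallback path
        y[i] = x[i] / (1.0f + std::exp(-x[i]));
    }
}

int main() {
    float x[11], y[11];
    for (int i = 0; i < 11; ++i) x[i] = 0.5f * (i - 5);
    vec_silu_blocked(11, y, x);
    printf("silu(0) = %f\n", y[5]);   // expect 0
    return 0;
}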
@@ -2922,8 +3152,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
             float f = ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16);
             ggml_table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
             ggml_table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f));
-            ggml_table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f));
-            ggml_table_exp_f16[i] = GGML_FP32_TO_FP16(expf(f));
         }

         const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
@@ -13600,22 +13828,7 @@ static void ggml_compute_forward_soft_max_f32(
         float max = -INFINITY;
         ggml_vec_max_f32(nc, &max, wp);

-        ggml_float sum = 0.0;
-
-        uint16_t scvt;
-        for (int i = 0; i < nc; i++) {
-            if (wp[i] == -INFINITY) {
-                dp[i] = 0.0f;
-            } else {
-                // const float val = (wp[i] == -INFINITY) ? 0.0 : exp(wp[i] - max);
-                ggml_fp16_t s = GGML_FP32_TO_FP16(wp[i] - max);
-                memcpy(&scvt, &s, sizeof(scvt));
-                const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
-                sum += (ggml_float)val;
-                dp[i] = val;
-            }
-        }
+        ggml_float sum = ggml_vec_soft_max_f32(nc, dp, wp, max);

         assert(sum > 0.0);

         sum = 1.0/sum;
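ggml_vec_soft_max_f32 folds the exponentiation and the running sum into one pass once the row maximum is known; numerically this is the usual max-shifted softmax. A scalar reference of what the call computes (sketch only, not the ggml implementation):

// Sketch: y[i] = exp(x[i] - max); returns the sum, which the caller inverts
// (sum = 1.0/sum) and applies with ggml_vec_scale_f32, as in the hunk above.
#include <cmath>

static double soft_max_ref(int n, float * y, const float * x, float max) {
    double sum = 0.0;
    for (int i = 0; i < n; ++i) {
        float val = std::exp(x[i] - max);
        sum += (double) val;
        y[i] = val;
    }
    return sum; // strictly positive for n > 0, so the assert(sum > 0.0) holds
}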
@@ -15374,37 +15587,7 @@ static void ggml_compute_forward_flash_attn_f32(
                     vvexpf(S, S, &Mup);
                     ggml_vec_sum_f32(Mup, &sum, S);
 #else
-                    uint16_t   scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
-                    ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
-
-                    for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
-                        if (i >= masked_begin) {
-                            break;
-                        }
-                        float * SS = S + i;
-
-                        for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
-                            if (i + j >= masked_begin) {
-                                break;
-                            } else if (SS[j] == -INFINITY) {
-                                SS[j] = 0.0f;
-                            } else {
-#ifndef GGML_FLASH_ATTN_EXP_FP16
-                                const float val = expf(SS[j] - max);
-#else
-                                ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
-                                memcpy(&scvt[j], &s, sizeof(uint16_t));
-                                const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]);
-#endif
-                                sump[j] += (ggml_float)val;
-                                SS[j] = val;
-                            }
-                        }
-                    }
-
-                    for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
-                        sum += sump[i];
-                    }
+                    sum = ggml_vec_soft_max_f32(Mup, S, S, max);
 #endif
                 }
@@ -15586,28 +15769,7 @@ static void ggml_compute_forward_flash_attn_f16(
                     vvexpf(S, S, &Mup);
                     ggml_vec_sum_f32(Mup, &sum, S);
 #else
-                    uint16_t scvt[GGML_SOFT_MAX_UNROLL];
-                    ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
-
-                    for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
-                        float * SS = S + i;
-
-                        for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
-                            if (SS[j] == -INFINITY) {
-                                SS[j] = 0.0f;
-                            } else {
-                                ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
-                                memcpy(&scvt[j], &s, sizeof(uint16_t));
-                                const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]);
-                                sump[j] += (ggml_float)val;
-                                SS[j] = val;
-                            }
-                        }
-                    }
-
-                    for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
-                        sum += sump[i];
-                    }
+                    sum = ggml_vec_soft_max_f32(Mup, S, S, max);
 #endif
                 }
@@ -16234,38 +16396,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
                 vvexpf(SM, SM, &Mup);
                 ggml_vec_sum_f32(Mup, &sum, SM);
 #else
-                uint16_t   scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
-                ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
-
-                for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
-                    if (i >= masked_begin) {
-                        break;
-                    }
-                    float * SR =  S + i;
-                    float * SW = SM + i;
-
-                    for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
-                        if (i + j >= masked_begin) {
-                            break;
-                        } else if (SR[j] == -INFINITY) {
-                            SW[j] = 0.0f;
-                        } else {
-#ifndef GGML_FLASH_ATTN_EXP_FP16
-                            const float val = expf(SR[j] - max);
-#else
-                            ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max);
-                            memcpy(&scvt[j], &s, sizeof(uint16_t));
-                            const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]);
-#endif
-                            sump[j] += (ggml_float)val;
-                            SW[j] = val;
-                        }
-                    }
-                }
-
-                for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
-                    sum += sump[i];
-                }
+                sum = ggml_vec_soft_max_f32(Mup, SM, S, max);
 #endif
             }
@@ -17291,35 +17422,15 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
             assert(!isnan(s1[i]));
         }
 #endif

         // soft_max
-        ggml_float sum = 0.0;
-        {
             float max = -INFINITY;
             ggml_vec_max_f32(nc, &max, s0);
+            ggml_float sum = ggml_vec_soft_max_f32(nc, st, s0, max);
-
-            uint16_t scvt; UNUSED(scvt);
-            for (int i = 0; i < nc; i++) {
-                if (s0[i] == -INFINITY) {
-                    st[i] = 0.0f;
-                } else {
-#ifndef GGML_CROSS_ENTROPY_EXP_FP16
-                    const float s = s0[i] - max;
-                    const float val = expf(s);
-#else
-                    ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
-                    memcpy(&scvt, &s, sizeof(scvt));
-                    const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
-#endif
-                    sum += (ggml_float)val;
-                    st[i] = val;
-                }
-            }
-
             assert(sum > 0.0);
-            // sum = 1.0/sum;
-        }
-        // avoid log(0) by rescaling from [0..1] to [eps..1]
             sum = (1.0 - eps) / sum;
+
+            // avoid log(0) by rescaling from [0..1] to [eps..1]
             ggml_vec_scale_f32(nc, st, sum);
             ggml_vec_add1_f32(nc, st, st, eps);
             ggml_vec_log_f32(nc, st, st);
@@ -17409,32 +17520,11 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
 #endif

         // soft_max
-        ggml_float sum = 0.0;
-        {
             float max = -INFINITY;
             ggml_vec_max_f32(nc, &max, s0);
+            ggml_float sum = ggml_vec_soft_max_f32(nc, ds0, s0, max);
-
-            uint16_t scvt; UNUSED(scvt);
-            for (int i = 0; i < nc; i++) {
-                if (s0[i] == -INFINITY) {
-                    ds0[i] = 0.0f;
-                } else {
-#ifndef GGML_CROSS_ENTROPY_EXP_FP16
-                    const float s = s0[i] - max;
-                    const float val = expf(s);
-#else
-                    ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
-                    memcpy(&scvt, &s, sizeof(scvt));
-                    const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
-#endif
-                    sum += (ggml_float)val;
-                    ds0[i] = val;
-                }
-            }
-
             assert(sum > 0.0);
             sum = (1.0 - eps) / sum;
-        }

         // grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr
         ggml_vec_scale_f32(nc, ds0, sum);
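For reference, the quantity assembled in this backward pass: after the in-place softmax of s0, the gradient is (softmax(s0) - s1) scaled by the incoming loss gradient over the number of rows, matching the comment in the hunk above. A scalar sketch of the same computation (names and the `d`/`nr` factors are stand-ins):

// Sketch: ds0 = (softmax(s0) - s1) * d / nr, computed row-wise.
#include <cmath>

static void cross_entropy_grad_ref(int n, float * ds0, const float * s0, const float * s1,
                                   float d, float nr) {
    float max = s0[0];
    for (int i = 1; i < n; ++i) max = s0[i] > max ? s0[i] : max;
    double sum = 0.0;
    for (int i = 0; i < n; ++i) { ds0[i] = std::exp(s0[i] - max); sum += ds0[i]; }
    for (int i = 0; i < n; ++i) {
        ds0[i] = (float)((ds0[i] / sum - s1[i]) * d / nr);
    }
}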
29 llama.cpp

@@ -6728,6 +6728,7 @@ static struct ggml_tensor * llm_build_kqv(
     const int64_t n_embd_head_k = hparams.n_embd_head_k;
     const int64_t n_embd_k_gqa  = hparams.n_embd_k_gqa();
     const int64_t n_embd_head_v = hparams.n_embd_head_v;
+    const int64_t n_embd_v_gqa  = hparams.n_embd_v_gqa();

     struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
     cb(q, "q", il);

@@ -6750,8 +6751,8 @@ static struct ggml_tensor * llm_build_kqv(
         struct ggml_tensor * v =
             ggml_view_3d(ctx, kv.v_l[il],
                     n_embd_head_v, n_kv, n_head_kv,
-                    ggml_row_size(kv.v_l[il]->type, n_embd_k_gqa),
-                    ggml_row_size(kv.v_l[il]->type, n_embd_head_k),
+                    ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa),
+                    ggml_row_size(kv.v_l[il]->type, n_embd_head_v),
                     0);
         cb(v, "v", il);

@@ -6761,7 +6762,7 @@ static struct ggml_tensor * llm_build_kqv(
             ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
         }

-        cur = ggml_reshape_2d(ctx, cur, n_embd_head_k*n_head, n_tokens);
+        cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens);
     } else {
         struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
         cb(kq, "kq", il);

@@ -13149,6 +13150,13 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                     }
                 }

+                if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
+                    LLAMA_LOG_WARN(
+                        "%s: Added a BOS token to the prompt as specified by the model but the prompt "
+                        "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
+                        "Are you sure this is what you want?\n", __FUNCTION__);
+                }
+
                 if (add_special && vocab.special_add_eos == 1) {
                     GGML_ASSERT(vocab.special_eos_id != -1);
                     output.push_back(vocab.special_eos_id);

@@ -13175,6 +13183,13 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                     }
                 }

+                if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
+                    LLAMA_LOG_WARN(
+                        "%s: Added a BOS token to the prompt as specified by the model but the prompt "
+                        "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
+                        "Are you sure this is what you want?\n", __FUNCTION__);
+                }
+
                 if (add_special && vocab.special_add_eos == 1) {
                     GGML_ASSERT(vocab.special_add_eos != -1);
                     output.push_back(vocab.special_eos_id);

@@ -14235,9 +14250,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_

     // Sample the next word X using top-k sampling
     llama_sample_top_k(nullptr, candidates, int(k), 1);
-    if (ctx) {
         ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
     llama_token X = llama_sample_token(ctx, candidates);
     t_start_sample_us = ggml_time_us();

@@ -14251,9 +14264,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
     // Update mu using the learning rate and error
     *mu = *mu - eta * e;

-    if (ctx) {
         ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
     return X;
 }

@@ -17347,13 +17358,13 @@ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llam
         }
         else {
             if (cell_range_begin != kv_self.size) {
-                cell_ranges.push_back({ cell_range_begin, i });
+                cell_ranges.emplace_back(cell_range_begin, i);
                 cell_range_begin = kv_self.size;
             }
         }
     }
     if (cell_range_begin != kv_self.size) {
-        cell_ranges.push_back({ cell_range_begin, kv_self.size });
+        cell_ranges.emplace_back(cell_range_begin, kv_self.size);
     }

     // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
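The new tokenizer warning fires when the model's add-BOS setting and a BOS already present in the prompt text combine to produce two leading BOS tokens. A small sketch of the guard on a plain token vector (the token type and id values here are illustrative stand-ins):

// Sketch: detect a doubled BOS after tokenization.
#include <cstdio>
#include <vector>

typedef int llama_token;

static bool has_double_bos(const std::vector<llama_token> & output, llama_token bos_id, bool add_bos) {
    return add_bos && output.size() >= 2 && output[0] == bos_id && output[1] == bos_id;
}

int main() {
    std::vector<llama_token> toks = {1, 1, 42, 7}; // BOS inserted by the model + BOS already in the prompt
    printf("double BOS: %d\n", has_double_bos(toks, /*bos_id=*/1, /*add_bos=*/true));
    return 0;
}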
@@ -1,45 +1,142 @@
 #!/bin/bash
-test_suite=${1:-}
-test_number=${2:-}
-
 PROG=${0##*/}
 build_dir="build-ci-debug"
+
+# Print Color Commands
+red=$(tput setaf 1)
+green=$(tput setaf 2)
+yellow=$(tput setaf 3)
+blue=$(tput setaf 4)
+magenta=$(tput setaf 5)
+cyan=$(tput setaf 6)
+normal=$(tput sgr0)
+
+
+# Print Help Message
+####################
+
+print_full_help() {
+  cat << EOF
+Usage: $PROG [OPTION]... <test_regex> (test_number)
+Debug specific ctest program.
+
+Options:
+  -h, --help            display this help and exit
+  -g                    run in gdb mode
+
+Arguments:
+  <test_regex>     (Mandatory) Supply one regex to the script to filter tests
+  (test_number)    (Optional) Test number to run a specific test
+
+Example:
+  $PROG test-tokenizer
+  $PROG test-tokenizer 3
+EOF
+}
+
+abort() {
+  echo "Error: $1" >&2
+  cat << EOF >&2
+Usage: $PROG [OPTION]... <test_regex> (test_number)
+Debug specific ctest program.
+Refer to --help for full instructions.
+EOF
+  exit 1
+}
+
+
+# Dependency Sanity Check
+#########################
+
+check_dependency() {
+  command -v "$1" >/dev/null 2>&1 || {
+    abort "$1 is required but not found. Please install it and try again."
+  }
+}
+
+check_dependency ctest
+check_dependency cmake
+
+
+# Step 0: Check the args
+########################

 if [ x"$1" = x"-h" ] || [ x"$1" = x"--help" ]; then
-    echo "Usage: $PROG [OPTION]... <test_regex> (test_number)"
-    echo "Debug specific ctest program."
-    echo
-    echo "Options:"
-    echo "  -h, --help       Display this help and exit"
-    echo
-    echo "Arguments:"
-    echo "  <test_regex>     (Mandatory) Supply one regex to the script to filter tests"
-    echo "  (test_number)    (Optional) Test number to run a specific test"
-    echo
-    echo "Example:"
-    echo "  $PROG test-tokenizer"
-    echo "  $PROG test-tokenizer 3"
-    echo
+    print_full_help >&2
     exit 0
 fi

-# Function to select and debug a test
-function select_test() {
-    test_suite=${1:-test}
-    test_number=${2:-}
-
-    # Sanity Check If Tests Is Detected
-    printf "\n\nGathering tests that fit REGEX: ${test_suite} ...\n"
-    tests=($(ctest -R ${test_suite} -V -N | grep -E " +Test +#[0-9]+*" | cut -d':' -f2 | awk '{$1=$1};1'))
-    if [ ${#tests[@]} -eq 0 ]
-    then
-        echo "No tests avaliable... check your compliation process..."
-        echo "Exiting."
-        exit 1
-    fi
-
-    if [ -z $test_number ]
-    then
+# Parse command-line options
+gdb_mode=false
+while getopts "g" opt; do
+    case $opt in
+        g)
+            gdb_mode=true
+            echo "gdb_mode Mode Enabled"
+            ;;
+    esac
+done
+
+# Shift the option parameters
+shift $((OPTIND - 1))
+
+# Positionial Argument Processing : <test_regex>
+if [ -z "${1}" ]; then
+    abort "Test regex is required"
+else
+    test_suite=${1:-}
+fi
+
+# Positionial Argument Processing : (test_number)
+test_number=${2:-}
+
+
+# Step 1: Reset and Setup folder context
+########################################
+
+## Sanity check that we are actually in a git repo
+repo_root=$(git rev-parse --show-toplevel)
+if [ ! -d "$repo_root" ]; then
+    abort "Not in a Git repository."
+fi
+
+## Reset folder to root context of git repo and Create and enter build directory
+pushd "$repo_root"
+rm -rf "$build_dir" && mkdir "$build_dir" || abort "Failed to make $build_dir"
+
+
+# Step 2: Setup Build Environment and Compile Test Binaries
+###########################################################
+
+# Note: test-eval-callback requires -DLLAMA_CURL
+cmake -B "./$build_dir" -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_CURL=1 || abort "Failed to build enviroment"
+pushd "$build_dir"
+make -j || abort "Failed to compile"
+popd > /dev/null || exit 1
+
+
+# Step 3: Find all tests available that matches REGEX
+####################################################
+
+# Ctest Gather Tests
+# `-R test-tokenizer` : looks for all the test files named `test-tokenizer*` (R=Regex)
+# `-N` : "show-only" disables test execution & shows test commands that you can feed to GDB.
+# `-V` : Verbose Mode
+printf "\n\nGathering tests that fit REGEX: ${test_suite} ...\n"
+pushd "$build_dir"
+tests=($(ctest -R ${test_suite} -V -N | grep -E " +Test +#[0-9]+*" | cut -d':' -f2 | awk '{$1=$1};1'))
+if [ ${#tests[@]} -eq 0 ]; then
+    abort "No tests avaliable... check your compliation process..."
+fi
+popd > /dev/null || exit 1
+
+
+# Step 4: Identify Test Command for Debugging
+#############################################
+
+# Select test number
+if [ -z $test_number ]; then
     # List out avaliable tests
     printf "Which test would you like to debug?\n"
     id=0

@@ -53,65 +150,54 @@ function select_test() {
     # Prompt user which test they wanted to run
     printf "\nRun test#? "
     read test_number

 else
-    printf "\nUser Already Requested #${test_number}"
+    printf "\nUser Already Requested #${test_number}\n"
 fi

-# Start GDB with the requested test binary and arguments
-printf "Debugging(GDB) test: ${tests[test_number]}\n"
-# Change IFS (Internal Field Separator)
-sIFS=$IFS
-IFS=$'\n'
+# Grab all tests commands
+pushd "$build_dir"
+sIFS=$IFS # Save Initial IFS (Internal Field Separator)
+IFS=$'\n' # Change IFS (Internal Field Separator) (So we split ctest output by newline rather than by spaces)
+test_args=($(ctest -R ${test_suite} -V -N | grep "Test command" | cut -d':' -f3 | awk '{$1=$1};1' )) # Get test args
+IFS=$sIFS # Reset IFS (Internal Field Separator)
+popd > /dev/null || exit 1

-# Get test args
-gdb_args=($(ctest -R ${test_suite} -V -N | grep "Test command" | cut -d':' -f3 | awk '{$1=$1};1' ))
-IFS=$sIFS
-printf "Debug arguments: ${gdb_args[test_number]}\n\n"
+# Grab specific test command
+single_test_name="${tests[test_number]}"
+single_test_command="${test_args[test_number]}"

-# Expand paths if needed
-args=()
-for x in $(echo ${gdb_args[test_number]} | sed -e 's/"\/\<//' -e 's/\>"//')
-do
-    args+=($(echo $x | sed -e 's/.*\/..\//..\//'))
-done

+# Step 5: Execute or GDB Debug
+##############################
+
+printf "${magenta}Running Test #${test_number}: ${single_test_name}${normal}\n"
+printf "${cyan}single_test_command: ${single_test_command}${normal}\n"
+
+if [ "$gdb_mode" = "true" ]; then
     # Execute debugger
-    echo "gdb args: ${args[@]}"
-    gdb --args ${args[@]}
-}
-
-# Step 0: Check the args
-if [ -z "$test_suite" ]
-then
-    echo "Usage: $PROG [OPTION]... <test_regex> (test_number)"
-    echo "Supply one regex to the script to filter tests,"
-    echo "and optionally a test number to run a specific test."
-    echo "Use --help flag for full instructions"
-    exit 1
-fi
-
-# Step 1: Reset and Setup folder context
-## Sanity check that we are actually in a git repo
-repo_root=$(git rev-parse --show-toplevel)
-if [ ! -d "$repo_root" ]; then
-    echo "Error: Not in a Git repository."
-    exit 1
-fi
-
-## Reset folder to root context of git repo
     pushd "$repo_root" || exit 1
+    eval "gdb --args ${single_test_command}"
+    popd > /dev/null || exit 1

-## Create and enter build directory
-rm -rf "$build_dir" && mkdir "$build_dir" || exit 1
+else
+    # Execute Test
|
pushd "$repo_root" || exit 1
|
||||||
|
eval "${single_test_command}"
|
||||||
|
exit_code=$?
|
||||||
|
popd > /dev/null || exit 1
|
||||||
|
|
||||||
# Step 2: Setup Build Environment and Compile Test Binaries
|
# Print Result
|
||||||
cmake -B "./$build_dir" -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_FATAL_WARNINGS=ON || exit 1
|
printf "${blue}Ran Test #${test_number}: ${single_test_name}${normal}\n"
|
||||||
pushd "$build_dir" && make -j || exit 1
|
printf "${yellow}Command: ${single_test_command}${normal}\n"
|
||||||
|
if [ $exit_code -eq 0 ]; then
|
||||||
|
printf "${green}TEST PASS${normal}\n"
|
||||||
|
else
|
||||||
|
printf "${red}TEST FAIL${normal}\n"
|
||||||
|
fi
|
||||||
|
|
||||||
# Step 3: Debug the Test
|
fi
|
||||||
select_test "$test_suite" "$test_number"
|
|
||||||
|
|
||||||
# Step 4: Return to the directory from which the user ran the command.
|
# Return to the directory from which the user ran the command.
|
||||||
popd || exit 1
|
popd > /dev/null || exit 1
|
||||||
popd || exit 1
|
|
||||||
popd || exit 1
|
|
||||||
|
|
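
For orientation, a short usage sketch of the reworked script after this change. The scripts/debug-test.sh path is assumed from the repository layout and the test-tokenizer regex is only illustrative; the -g flag and the <test_regex> (test_number) positional arguments are the ones introduced in the diff above.

    # List the tests matching the regex, then prompt for which one to run
    ./scripts/debug-test.sh test-tokenizer

    # Run test #3 from the matching set directly, without a debugger
    ./scripts/debug-test.sh test-tokenizer 3

    # Same selection, but launched under gdb via the new -g flag
    ./scripts/debug-test.sh -g test-tokenizer 3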